diff --git a/Dockerfile b/Dockerfile index 46c09fcb2d..ed17dd92d8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,12 +3,13 @@ FROM ubuntu:18.04 ARG PREFIX=/usr/local ARG GPU_ARCH=";" ARG MIOTENSILE_VER="default" +ARG USE_TARGETID="OFF" # Support multiarch RUN dpkg --add-architecture i386 # Add rocm repository -RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/.apt_3.7/ xenial main > /etc/apt/sources.list.d/rocm.list' +RUN if [ "$USE_TARGETID" = "ON" ] ; then sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang 6416 > /etc/apt/sources.list.d/rocm.list'; else sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/.apt_3.7/ xenial main > /etc/apt/sources.list.d/rocm.list'; fi RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu xenial main universe | tee -a /etc/apt/sources.list" # Install dependencies @@ -84,15 +85,16 @@ ADD min-requirements.txt /min-requirements.txt RUN CXXFLAGS='-isystem $PREFIX/include' cget -p $PREFIX install -f /min-requirements.txt RUN cget -p $PREFIX install danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f -RUN export HIPCC_LINK_FLAGS_APPEND='-O3 -parallel-jobs=4' -RUN export HIPCC_COMPILE_FLAGS_APPEND='-O3 -Wno-format-nonliteral -parallel-jobs=4' - # Install doc requirements ADD doc/requirements.txt /doc-requirements.txt RUN pip install -r /doc-requirements.txt -# install last released miopentensile in default, install latest commits when MIOTENSILE_VER="latest" -RUN if [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@7568654c938d42e9a91c6b18fb382f5b978d12fd; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@5fe0bf4a8dc59f3ab62df929297280915372ce16; fi +# Use parallel job to accelerate tensile build +# Workaround for Tensile with TargetID feature +RUN if [ "$USE_TARGETID" = "ON" ] ; then export HIPCC_LINK_FLAGS_APPEND='-O3 -parallel-jobs=4' && export HIPCC_COMPILE_FLAGS_APPEND='-O3 -Wno-format-nonliteral -parallel-jobs=4' && rm /usr/bin/hipcc; fi + +# install last released miopentensile in default (master), install latest commits when MIOTENSILE_VER="latest" (develop) +RUN if [ "$USE_TARGETID" = "OFF" ] ; then echo "MIOpenTensile is not installed."; elif [ "$MIOTENSILE_VER" = "latest" ] ; then cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@be26d30d3d7509a414134a45f4a6d49e5da250b8; else cget -p $PREFIX install ROCmSoftwarePlatform/MIOpenTensile@4bfe00a8de61d12862d9fa803b8ea9a981a50f97; fi RUN cd ~ && \ export MLIR_COMMIT=31d92f4c64ae6fa6b7c5d543f68b69300b4513ce && \ diff --git a/Jenkinsfile b/Jenkinsfile index 8defe21def..79bdda87fa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -63,10 +63,11 @@ def buildHipClangJob(Map conf, compiler){ def image = "miopen" def cmd = conf.get("cmd", "") def gpu_arch = conf.get("gpu_arch", "gfx900;gfx906") + def target_id = conf.get("target_id", "OFF") def codecov = conf.get("codecov", false) def miotensile_version = conf.get("miotensile_version", "default") def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" - def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg MIOTENSILE_VER='${miotensile_version}' " + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg MIOTENSILE_VER='${miotensile_version}' --build-arg USE_TARGETID='${target_id}' " def extradebugflags = "" def variant = env.STAGE_NAME if (codecov) { @@ -131,6 +132,20 @@ def reboot(){ build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] } +def tensileStage(cmd, gpu_arch, miotensile_version, target_id){ + try{ + buildHipClangJob('/opt/rocm/llvm/bin/clang++', cmd: cmd, gpu_arch: gpu_arch, miotensile_version: miotensile_version, target_id: target_id) + } + catch(e){ + echo "throwing error exception for the stage" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + reboot() + } +} + /// Stage name format: /// [DataType] Backend[/Compiler] BuildType [TestSet] [Target] /// @@ -173,6 +188,10 @@ pipeline { description: "") booleanParam( name: "MIOPENTENSILE", + defaultValue: false, + description: "") + booleanParam( + name: "MIOPENTENSILE_LATEST", defaultValue: true, description: "") booleanParam( @@ -881,20 +900,68 @@ pipeline { } steps{ script{ - try{ - buildHipClangJob('/opt/rocm/llvm/bin/clang++', cmd: cmd) - } - catch(e){ - echo "throwing error exception for the stage" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - reboot() - } + tensileStage(cmd, "gfx906:xnack-", "default", "ON") + } + } + } + + stage('Fp16 Hip Tensile All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "default", "ON") + } + } + } + + stage('Bf16 Hip Tensile All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "default", "ON") + } + } + } + + stage('Int8 Hip Tensile All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "default", "ON") } } } + stage('Fp32 Hip Tensile All gfx908') { agent{ label rocmnode("gfx908") } environment{ @@ -907,20 +974,73 @@ pipeline { } steps{ script{ - try{ - buildHipClangJob('/opt/rocm/llvm/bin/clang++', cmd: cmd, gpu_arch: "gfx908") - } - catch(e){ - echo "throwing error exception for the stage" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - reboot() - } + tensileStage(cmd, "gfx908:xnack-", "default", "ON") + } + } + } + + stage('Fp16 Hip Tensile All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "default", "ON") + } + } + } + + stage('Bf16 Hip Tensile All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "default", "ON") } } } + + stage('Int8 Hip Tensile All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "default", "ON") + } + } + } + } + } + + stage("MIOpenTensile Latest"){ + when { expression { params.MIOPENTENSILE_LATEST } } + parallel{ stage('Fp32 Hip Tensile-Latest All Vega20') { agent{ label rocmnode("vega20") } environment{ @@ -933,20 +1053,68 @@ pipeline { } steps{ script{ - try{ - buildHipClangJob('/opt/rocm/llvm/bin/clang++', cmd: cmd, gpu_arch: "gfx906", miotensile_version: "latest") - } - catch(e){ - echo "throwing error exception for the stage" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - reboot() - } + tensileStage(cmd, "gfx906:xnack-", "latest", "ON") + } + } + } + + stage('Fp16 Hip Tensile-Latest All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_HALF=On -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_ALL=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF -DMIOPEN_TEST_FLAGS=--disable-verification-cache .. + MIOPEN_DEBUG_HIP_KERNELS=0 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "latest", "ON") + } + } + } + + stage('Bf16 Hip Tensile-Latest All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "latest", "ON") } } } + + stage('Int8 Hip Tensile-Latest All Vega20') { + agent{ label rocmnode("vega20") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx906:xnack-", "latest", "ON") + } + } + } + stage('Fp32 Hip Tensile-Latest All gfx908') { agent{ label rocmnode("gfx908") } environment{ @@ -959,17 +1127,64 @@ pipeline { } steps{ script{ - try{ - buildHipClangJob('/opt/rocm/llvm/bin/clang++', cmd: cmd, gpu_arch: "gfx908", miotensile_version: "latest") - } - catch(e){ - echo "throwing error exception for the stage" - echo 'Exception occurred: ' + e.toString() - throw e - } - finally{ - reboot() - } + tensileStage(cmd, "gfx908:xnack-", "latest", "ON") + } + } + } + + stage('Fp16 Hip Tensile-Latest All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_HALF=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "latest", "ON") + } + } + } + + stage('Bf16 Hip Tensile-Latest All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "latest", "ON") + } + } + } + + stage('Int8 Hip Tensile-Latest All gfx908') { + agent{ label rocmnode("gfx908") } + environment{ + cmd = """ + ulimit -c unlimited + rm -rf build + mkdir build + cd build + CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_INT8=On -DMIOPEN_TEST_GFX908=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_MIOTENSILE=ON -DMIOPEN_USE_MIOPENTENSILE=ON -DMIOPEN_USE_ROCBLAS=OFF .. + MIOPEN_DEBUG_HIP_KERNELS=0 MIOPEN_LOG_LEVEL=5 CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check + """ + } + steps{ + script{ + tensileStage(cmd, "gfx908:xnack-", "latest", "ON") } } } diff --git a/requirements.txt b/requirements.txt index 9276b61405..db0eeb508f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ RadeonOpenCompute/rocm-cmake@cdd0f632b3a65bd4411593bb827eb664e25c80bc --build RadeonOpenCompute/clang-ocl@930015924b012c332d373535ff31a663b6ad2c64 ROCmSoftwarePlatform/MIOpenGEMM@0eb1257cfaef83ea155aabd67af4437c0028db48 ROCmSoftwarePlatform/rocBLAS@9790a8658341bc665c11c311129ad0dfc533d5c4 -# ROCmSoftwarePlatform/MIOpenTensile@5fe0bf4a8dc59f3ab62df929297280915372ce16 +# ROCmSoftwarePlatform/MIOpenTensile@master diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index e13d7630ab..31f69e10e2 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -149,20 +149,13 @@ static GemmBackend_t enforce_gemm_backend(miopenDataType_t data_type, // make sure backend chosen based on env variable is suppported #if MIOPEN_USE_MIOPENTENSILE + (void)data_type; switch(gemm_backend_env) { case GemmBackend_t::nogemmbackend: gemm_backend_enforced = GemmBackend_t::nogemmbackend; break; case GemmBackend_t::rocblas: case GemmBackend_t::miopengemm: - case GemmBackend_t::miopentensile: - gemm_backend_enforced = (data_type == miopenFloat) ? GemmBackend_t::miopentensile : -#if MIOPEN_USE_ROCBLAS - GemmBackend_t::rocblas -#else - GemmBackend_t::nogemmbackend -#endif - ; - break; + case GemmBackend_t::miopentensile: gemm_backend_enforced = GemmBackend_t::miopentensile; break; } #elif MIOPEN_USE_ROCBLAS and MIOPEN_USE_MIOPENGEMM switch(gemm_backend_env) @@ -270,8 +263,49 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle, { MIOPEN_LOG_FUNCTION("MIOpenTensile"); - if(gemm_desc.dataType != miopenFloat) - return miopenStatusNotImplemented; + miopen_tensile_type miotsl_in_dtype, miotsl_out_dtype; + Data_t ptrA, ptrB, ptrC; + switch(gemm_desc.dataType) + { + case miopenFloat: + miotsl_in_dtype = miopen_tensile_type_float; + ptrA = Data_t(reinterpret_cast(A) + a_offset); + ptrB = Data_t(reinterpret_cast(B) + b_offset); + ptrC = Data_t(reinterpret_cast(C) + c_offset); + break; + case miopenHalf: + miotsl_in_dtype = miopen_tensile_type_half; + ptrA = Data_t(reinterpret_cast(A) + a_offset); + ptrB = Data_t(reinterpret_cast(B) + b_offset); + ptrC = Data_t(reinterpret_cast(C) + c_offset); + break; + case miopenBFloat16: + miotsl_in_dtype = miopen_tensile_type_bfloat16; + ptrA = Data_t(reinterpret_cast(A) + a_offset); + ptrB = Data_t(reinterpret_cast(B) + b_offset); + ptrC = Data_t(reinterpret_cast(C) + c_offset); + break; + case miopenInt32: + miotsl_in_dtype = miopen_tensile_type_int32; + ptrA = Data_t(reinterpret_cast(A) + a_offset); + ptrB = Data_t(reinterpret_cast(B) + b_offset); + ptrC = Data_t(reinterpret_cast(C) + c_offset); + break; + case miopenInt8: + case miopenInt8x4: + miotsl_in_dtype = miopen_tensile_type_int8x4; + ptrA = Data_t(reinterpret_cast(A) + a_offset); + ptrB = Data_t(reinterpret_cast(B) + b_offset); + ptrC = Data_t(reinterpret_cast(C) + c_offset); + } + if(gemm_desc.dataType == miopenInt8 || gemm_desc.dataType == miopenInt8x4) + { + miotsl_out_dtype = miopen_tensile_type_int32; + } + else + { + miotsl_out_dtype = miotsl_in_dtype; + } #if MIOPEN_BACKEND_HIP HipEventPtr start = nullptr; @@ -302,21 +336,21 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle, miopen_tensile_matrix mtA{{mtA_len0, mtA_len1}, {mtA_str0, mtA_str1}, {mtA_b_n, mtA_b_str}, - miopen_tensile_type_float, + miotsl_in_dtype, gemm_desc.transA, - Data_t(reinterpret_cast(A) + a_offset)}; + ptrA}; miopen_tensile_matrix mtB{{mtB_len0, mtB_len1}, {mtB_str0, mtB_str1}, {mtB_b_n, mtB_b_str}, - miopen_tensile_type_float, + miotsl_in_dtype, gemm_desc.transB, - Data_t(reinterpret_cast(B) + b_offset)}; + ptrB}; miopen_tensile_matrix mtC{{mtC_len0, mtC_len1}, {mtC_str0, mtC_str1}, {mtC_b_n, mtC_b_str}, - miopen_tensile_type_float, + miotsl_out_dtype, false, - Data_t(reinterpret_cast(C) + c_offset)}; + ptrC}; miopen_tensile_status mt_status = miopen_tensile_status_no_solution; #if MIOPEN_BACKEND_HIP @@ -358,13 +392,9 @@ miopenStatus_t CallGemm(const Handle& handle, gemm_backend = enforce_gemm_backend(gemm_desc.dataType, gemm_backend); // do row-to-column major conversion here +// add macro to distinguish MIOpenTensile and rocBlas logic #if MIOPEN_USE_MIOPENTENSILE - if((gemm_desc.isColMajor && gemm_desc.dataType == miopenFloat) -#if MIOPEN_USE_ROCBLAS - || - (!gemm_desc.isColMajor && gemm_desc.dataType != miopenFloat) -#endif - ) + if(gemm_desc.isColMajor) #else if(!gemm_desc.isColMajor) #endif @@ -658,13 +688,9 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, gemm_backend = enforce_gemm_backend(gemm_desc.dataType, gemm_backend); // do row-to-column major conversion here +// add macro to distinguish MIOpenTensile and rocBlas logic #if MIOPEN_USE_MIOPENTENSILE - if((gemm_desc.isColMajor && gemm_desc.dataType == miopenFloat) -#if MIOPEN_USE_ROCBLAS - || - (!gemm_desc.isColMajor && gemm_desc.dataType != miopenFloat) -#endif - ) + if(gemm_desc.isColMajor) #else if(!gemm_desc.isColMajor) #endif @@ -907,13 +933,9 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, gemm_backend = enforce_gemm_backend(gemm_desc.dataType, gemm_backend); // do row-to-column major conversion here +// add macro to distinguish MIOpenTensile and rocBlas logic #if MIOPEN_USE_MIOPENTENSILE - if((gemm_desc.isColMajor && gemm_desc.dataType == miopenFloat) -#if MIOPEN_USE_ROCBLAS - || - (!gemm_desc.isColMajor && gemm_desc.dataType != miopenFloat) -#endif - ) + if(gemm_desc.isColMajor) #else if(!gemm_desc.isColMajor) #endif diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 7d30b2a2d1..a8a82f3fb7 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -74,9 +74,9 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_COMPILE_ONLY) #if MIOPEN_USE_GEMM #ifdef CPPCHECK // Keep the value unknown in cppcheck since this can differ between opencl and hip -static bool IsUseRocBlas; +static bool IsBF16PathValid; #else -static const bool IsUseRocBlas = (MIOPEN_USE_ROCBLAS == 1); +static const bool IsBF16PathValid = (MIOPEN_USE_ROCBLAS == 1 || MIOPEN_USE_MIOPENTENSILE == 1); #endif static inline bool IsAnyBufferBF16(const TensorDescriptor& xDesc, @@ -669,7 +669,7 @@ bool ConvolutionDescriptor::IsGemmApplicableWrw(const TensorDescriptor& dyDesc, { #if MIOPEN_USE_GEMM if(!miopen::IsDisabled(MIOPEN_DEBUG_CONV_GEMM{}) && - !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) && !IsUseRocBlas)) + !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) && !IsBF16PathValid)) { const std::size_t spatial_dim = GetSpatialDimension(); const auto wei_spatial = boost::adaptors::slice(dwDesc.GetLengths(), 2, 2 + spatial_dim); @@ -1644,7 +1644,7 @@ void ConvolutionDescriptor::FindConvBwdWeightsAlgorithm(Handle& handle, perf_db = UserFindDbRecord::TryLoad(handle, problem, [&](DbRecord& record) { #if MIOPEN_USE_GEMM if(!miopen::IsDisabled(MIOPEN_DEBUG_CONV_GEMM{}) && - !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) && !IsUseRocBlas)) + !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) && !IsBF16PathValid)) { const bool time_precision = (!IsDisabled(MIOPEN_CONV_PRECISE_ROCBLAS_TIMING{})); @@ -1934,7 +1934,7 @@ void ConvolutionDescriptor::BackwardWeightsGemm(Handle& handle, { MIOPEN_THROW("GEMM convolution is disabled"); } - if(IsAnyBufferBF16(tensors.xDesc, tensors.dyDesc, tensors.dwDesc) && !IsUseRocBlas) + if(IsAnyBufferBF16(tensors.xDesc, tensors.dyDesc, tensors.dwDesc) && !IsBF16PathValid) { MIOPEN_THROW("GEMM convolution is unsupported"); } diff --git a/src/solver/conv_MP_bidirectional_winograd.cpp b/src/solver/conv_MP_bidirectional_winograd.cpp index d5a1802b2f..052d200452 100644 --- a/src/solver/conv_MP_bidirectional_winograd.cpp +++ b/src/solver/conv_MP_bidirectional_winograd.cpp @@ -151,7 +151,7 @@ GetWinoBuffer(const ConvolutionContext& params, template inline bool IsApplicableGEMM(const ConvolutionContext& params) { -#if(MIOPEN_BACKEND_HIP && MIOPEN_USE_ROCBLAS) +#if(MIOPEN_BACKEND_HIP && (MIOPEN_USE_ROCBLAS || MIOPEN_USE_MIOPENTENSILE)) const miopenDataType_t transform_data_type = miopen::IsEnabled(MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM{}) @@ -429,7 +429,7 @@ InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& params, } else { -#if MIOPEN_USE_ROCBLAS +#if MIOPEN_USE_ROCBLAS || MIOPEN_USE_MIOPENTENSILE // GEMM gemm_conv_kernel_name = "WRW_WINO_GEMM: "; @@ -453,7 +453,7 @@ InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& params, gemm_conv_factory = [=](const std::vector&) { return [=](const Handle& handle, const AnyInvokeParams& ctx) { -#if MIOPEN_USE_ROCBLAS +#if MIOPEN_USE_ROCBLAS || MIOPEN_USE_MIOPENTENSILE const auto& data_ctx = ctx.CastTo(); Data_t workSpace = data_ctx.workSpace; CallGemmStridedBatched( diff --git a/src/solver/conv_multipass_wino3x3WrW.cpp b/src/solver/conv_multipass_wino3x3WrW.cpp index 89e827151f..9be6ffe0a5 100644 --- a/src/solver/conv_multipass_wino3x3WrW.cpp +++ b/src/solver/conv_multipass_wino3x3WrW.cpp @@ -364,10 +364,6 @@ bool ConvWinograd3x3MultipassWrW // ROCBLAS for GEMM step #if(MIOPEN_BACKEND_HIP && (MIOPEN_USE_ROCBLAS || MIOPEN_USE_MIOPENTENSILE)) -#if(!MIOPEN_USE_ROCBLAS) - if(!params.IsFp32()) - return false; -#endif static const int wino_data_tile = std::max(WinoDataH, WinoDataW); static const int wino_filter_tile = std::max(WinoFilterH, WinoFilterW); diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 519021dece..6d48530008 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -53,9 +53,10 @@ namespace solver { #if MIOPEN_USE_GEMM #ifdef CPPCHECK // Keep the value unknown in cppcheck since this can differ between opencl and hip -static bool IsUseRocBlas; +static bool IsBF16PathValid; #else -static constexpr const bool IsUseRocBlas = (MIOPEN_USE_ROCBLAS == 1); +static constexpr const bool IsBF16PathValid = + (MIOPEN_USE_ROCBLAS == 1 || MIOPEN_USE_MIOPENTENSILE == 1); #endif static inline bool IsAnyBufferBF16(const TensorDescriptor& xDesc, @@ -75,9 +76,9 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext&, const auto& wDesc = problem.GetWeights(); const auto& yDesc = problem.GetOut(); return problem.GetDirection() == conv::Direction::Forward && problem.IsLayoutDefault() && - !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsUseRocBlas); + !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsBF16PathValid); #else - std::ignore = problem; + std::ignore = problem; return false; #endif }; diff --git a/src/solver/gemm_bwd.cpp b/src/solver/gemm_bwd.cpp index dab2f78b1f..316a0d5b66 100644 --- a/src/solver/gemm_bwd.cpp +++ b/src/solver/gemm_bwd.cpp @@ -50,9 +50,10 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_CONV_PRECISE_ROCBLAS_TIMING) #if MIOPEN_USE_GEMM #ifdef CPPCHECK // Keep the value unknown in cppcheck since this can differ between opencl and hip -static bool IsUseRocBlas; +static bool IsBF16PathValid; #else -static constexpr const bool IsUseRocBlas = (MIOPEN_USE_ROCBLAS == 1); +static constexpr const bool IsBF16PathValid = + (MIOPEN_USE_ROCBLAS == 1 || MIOPEN_USE_MIOPENTENSILE == 1); #endif namespace miopen { @@ -89,9 +90,9 @@ bool GemmBwdBase::IsApplicable(const ExecutionContext&, const auto& wDesc = problem.GetWeights(); const auto& dxDesc = problem.GetOut(); return problem.GetDirection() == conv::Direction::BackwardData && problem.IsLayoutDefault() && - !(IsAnyBufferBF16(dxDesc, dyDesc, wDesc) && !IsUseRocBlas); + !(IsAnyBufferBF16(dxDesc, dyDesc, wDesc) && !IsBF16PathValid); #else - std::ignore = problem; + std::ignore = problem; return false; #endif } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 92e48b8112..285e49a40b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -64,29 +64,45 @@ set(SKIP_ALL_EXCEPT_TESTS dummy) set(SKIP_TESTS dummy) # dummy is for REMOVE_DUPLICATES set(MIOPEN_TEST_FLOAT_ARG) + +set(MIOPEN_TEST_FLOAT Off) if(MIOPEN_TEST_HALF) - if(MIOPEN_BACKEND_OPENCL) - set(SKIP_TESTS test_gru test_rnn_vanilla test_lstm test_conv_igemm_dynamic) - endif() set(MIOPEN_TEST_FLOAT_ARG --half) elseif(MIOPEN_TEST_INT8) - set(SKIP_ALL_EXCEPT_TESTS test_tensor_vec test_tensor_cast test_tensor_trans test_tensor_copy test_tensor_set test_tensor_transform test_conv2d) set(MIOPEN_TEST_FLOAT_ARG --int8) elseif(MIOPEN_TEST_BFLOAT16) - set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_tensor_copy test_tensor_set test_tensor_vec test_immed_conv2d test_check_numerics_test) - if(MIOPEN_TEST_GFX908) - list(APPEND SKIP_ALL_EXCEPT_TESTS test_conv_extra test_conv_for_implicit_gemm test_miopen_conv test_deepbench_conv) - endif() set(MIOPEN_TEST_FLOAT_ARG --bfloat16) +else() + set(MIOPEN_TEST_FLOAT On) endif() -if(MIOPEN_TEST_GFX908) - list(APPEND SKIP_TESTS test_main test_tensor_scale test_tensor_set test_tensor_transform test_tensor_vec test_w_supertensor test_dropout test_immed_conv3d test_conv3d test_soft_max test_fusion_aux test_activation test_lrn_test test_ctc test_conv2d_bias test_conv3d_bias test_cba_inference test_cbna_inference test_pooling2d test_na_train test_na_inference test_bn_aux test_conv_igemm_dynamic) +if(NOT MIOPEN_TEST_MIOTENSILE) + if(MIOPEN_TEST_HALF) + if(MIOPEN_BACKEND_OPENCL) + set(SKIP_TESTS test_gru test_rnn_vanilla test_lstm test_conv_igemm_dynamic) + endif() + elseif(MIOPEN_TEST_INT8) + set(SKIP_ALL_EXCEPT_TESTS test_tensor_vec test_tensor_cast test_tensor_trans test_tensor_copy test_tensor_set test_tensor_transform test_conv2d) + elseif(MIOPEN_TEST_BFLOAT16) + set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_tensor_copy test_tensor_set test_tensor_vec test_immed_conv2d test_check_numerics_test) + if(MIOPEN_TEST_GFX908) + list(APPEND SKIP_ALL_EXCEPT_TESTS test_conv_extra test_conv_for_implicit_gemm test_miopen_conv test_deepbench_conv) + endif() + endif() +else() + if(MIOPEN_TEST_HALF) + set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_conv3d test_immed_conv2d test_immed_conv3d test_gru test_rnn_vanilla test_lstm test_gru_extra test_rnn_extra test_lstm_extra ) + elseif(MIOPEN_TEST_INT8) + set(SKIP_ALL_EXCEPT_TESTS test_conv2d) + elseif(MIOPEN_TEST_BFLOAT16) + set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_immed_conv2d) + else() + set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_conv3d test_immed_conv2d test_immed_conv3d test_gru test_rnn_vanilla test_lstm test_gru_extra test_rnn_extra test_lstm_extra ) + endif() endif() -if(MIOPEN_TEST_MIOTENSILE) - set(SKIP_ALL_EXCEPT_TESTS test_conv2d test_conv3d test_immed_conv2d test_immed_conv3d test_gru test_rnn_vanilla test_lstm test_gru_extra test_rnn_extra test_lstm_extra ) - list(APPEND SKIP_TESTS test_conv_igemm_dynamic test_conv_igemm_dynamic_small test_conv_for_implicit_gemm) +if(MIOPEN_TEST_GFX908) + list(APPEND SKIP_TESTS test_main test_tensor_scale test_tensor_set test_tensor_transform test_tensor_vec test_w_supertensor test_dropout test_immed_conv3d test_conv3d test_soft_max test_fusion_aux test_activation test_lrn_test test_ctc test_conv2d_bias test_conv3d_bias test_cba_inference test_cbna_inference test_pooling2d test_na_train test_na_inference test_bn_aux test_conv_igemm_dynamic) endif() list(REMOVE_DUPLICATES SKIP_TESTS) @@ -94,10 +110,7 @@ list(REMOVE_DUPLICATES SKIP_ALL_EXCEPT_TESTS) function(add_test_command NAME EXE) # Restrict the use of SKIP_ALL_EXCEPT_TESTS list in the Int8, BF16 and MIOpenTensile tests - if((NOT (NAME IN_LIST SKIP_ALL_EXCEPT_TESTS)) AND (MIOPEN_TEST_INT8 OR MIOPEN_TEST_BFLOAT16 OR MIOPEN_TEST_MIOTENSILE)) - add_test(NAME ${NAME} COMMAND echo skipped) - set_tests_properties(${NAME} PROPERTIES DISABLED On) - elseif(NAME IN_LIST SKIP_TESTS) + if(((NOT (NAME IN_LIST SKIP_ALL_EXCEPT_TESTS)) AND (MIOPEN_TEST_INT8 OR MIOPEN_TEST_BFLOAT16 OR MIOPEN_TEST_MIOTENSILE)) OR (NAME IN_LIST SKIP_TESTS)) add_test(NAME ${NAME} COMMAND echo skipped) set_tests_properties(${NAME} PROPERTIES DISABLED On) elseif(WIN32) @@ -216,7 +229,7 @@ set_tests_properties(test_sqlite_perfdb test_perfdb # add_sanitize_test(type_name.cpp) function(add_custom_test NAME) - set(options SKIP_UNLESS_ALL ALLOW_BFLOAT16 ALLOW_HALF ALLOW_INT8 ALLOW_NONXDLOPS) + set(options SKIP_UNLESS_ALL ALLOW_BFLOAT16 ALLOW_HALF ALLOW_INT8 ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE) set(oneValueArgs) set(multiValueArgs) cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -226,11 +239,11 @@ function(add_custom_test NAME) # are set, except the test is allowed explicitly. # Some tests are xDLOPs specific and should not run on gfx900/906 targets. if((NOT (MIOPEN_TEST_INT8 OR MIOPEN_TEST_BFLOAT16 OR MIOPEN_TEST_HALF OR MIOPEN_TEST_MIOTENSILE OR MIOPEN_TEST_NONXDLOPS)) - OR (MIOPEN_TEST_BFLOAT16 AND ${PARSE_ALLOW_BFLOAT16}) - OR (MIOPEN_TEST_HALF AND ${PARSE_ALLOW_HALF}) - OR (MIOPEN_TEST_INT8 AND ${PARSE_ALLOW_INT8}) - OR (MIOPEN_TEST_NONXDLOPS AND ${PARSE_ALLOW_NONXDLOPS}) - OR (MIOPEN_TEST_MIOTENSILE AND NOT(NAME IN_LIST SKIP_TESTS))) + OR (MIOPEN_TEST_BFLOAT16 AND ${PARSE_ALLOW_BFLOAT16} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_HALF AND ${PARSE_ALLOW_HALF} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_INT8 AND ${PARSE_ALLOW_INT8} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_NONXDLOPS AND ${PARSE_ALLOW_NONXDLOPS} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_FLOAT AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE}))) add_custom_target(${NAME} ${PARSE_UNPARSED_ARGUMENTS}) if(NOT PARSE_SKIP_UNLESS_ALL OR MIOPEN_TEST_ALL) add_test(NAME ${NAME} COMMAND ${CMAKE_COMMAND} --build ${CMAKE_CURRENT_BINARY_DIR} --target ${NAME}) @@ -240,7 +253,7 @@ function(add_custom_test NAME) endfunction() function(add_perf_test NAME) - set(options SKIP_UNLESS_ALL ALLOW_BFLOAT16 ALLOW_HALF ALLOW_INT8 ALLOW_NONXDLOPS) + set(options SKIP_UNLESS_ALL ALLOW_BFLOAT16 ALLOW_HALF ALLOW_INT8 ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE) set(oneValueArgs) set(multiValueArgs) cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -250,11 +263,11 @@ function(add_perf_test NAME) # are set, except the test is allowed explicitly. # Some tests are xDLOPs specific and should not run on gfx900/906 targets. if((NOT (MIOPEN_TEST_INT8 OR MIOPEN_TEST_BFLOAT16 OR MIOPEN_TEST_HALF OR MIOPEN_TEST_MIOTENSILE OR MIOPEN_TEST_NONXDLOPS)) - OR (MIOPEN_TEST_BFLOAT16 AND ${PARSE_ALLOW_BFLOAT16}) - OR (MIOPEN_TEST_HALF AND ${PARSE_ALLOW_HALF}) - OR (MIOPEN_TEST_INT8 AND ${PARSE_ALLOW_INT8}) - OR (MIOPEN_TEST_NONXDLOPS AND ${PARSE_ALLOW_NONXDLOPS}) - OR (MIOPEN_TEST_MIOTENSILE AND NOT(NAME IN_LIST SKIP_TESTS))) + OR (MIOPEN_TEST_BFLOAT16 AND ${PARSE_ALLOW_BFLOAT16} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_HALF AND ${PARSE_ALLOW_HALF} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_INT8 AND ${PARSE_ALLOW_INT8} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_NONXDLOPS AND ${PARSE_ALLOW_NONXDLOPS} AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE})) + OR (MIOPEN_TEST_FLOAT AND (NOT MIOPEN_TEST_MIOTENSILE OR ${PARSE_ALLOW_GEMM_FEATURE}))) add_custom_target(${NAME} ${PARSE_UNPARSED_ARGUMENTS}) if(NOT PARSE_SKIP_UNLESS_ALL OR MIOPEN_TEST_ALL) add_test(NAME ${NAME} COMMAND ${CMAKE_COMMAND} --build ${CMAKE_CURRENT_BINARY_DIR} --target ${NAME}) @@ -403,7 +416,7 @@ COMMAND $ ${IMPLICITGEMM_ARGS} --verbose --input 64 COMMAND $ ${IMPLICITGEMM_ARGS} --verbose --input 64 32 7 7 --weights 192 32 3 3 --pads_strides_dilations 2 2 2 2 1 1 ) -add_custom_test(test_conv_group SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_conv_group SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --input 16 128 56 56 --weights 256 4 3 3 --pads_strides_dilations 1 1 1 1 1 1 --group-count 32 COMMAND $ --verbose --input 16 256 56 56 --weights 512 8 3 3 --pads_strides_dilations 1 1 2 2 1 1 --group-count 32 COMMAND $ --verbose --input 16 256 28 28 --weights 512 8 3 3 --pads_strides_dilations 1 1 1 1 1 1 --group-count 32 @@ -460,7 +473,7 @@ COMMAND $ --verbose --input 8 3 108 108 --weights 63 1 ) if(MIOPEN_TEST_DEEPBENCH) - add_custom_test(test_deepbench_rnn ALLOW_NONXDLOPS + add_custom_test(test_deepbench_rnn ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --batch-size 16 --seq-len 50 --vector-len 1760 --hidden-size 1760 --num-layers 1 --in-mode 1 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --flat-batch-fill COMMAND $ --verbose --batch-size 32 --seq-len 50 --vector-len 1760 --hidden-size 1760 --num-layers 1 --in-mode 1 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --flat-batch-fill COMMAND $ --verbose --batch-size 64 --seq-len 50 --vector-len 1760 --hidden-size 1760 --num-layers 1 --in-mode 1 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --flat-batch-fill @@ -518,7 +531,7 @@ if(MIOPEN_TEST_DEEPBENCH) endif() -add_custom_test(test_rnn_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_rnn_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --no-hx COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --no-dhy COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --rnn-mode 0 --no-hx --no-dhy @@ -549,7 +562,7 @@ COMMAND $ --verbose --batch-size 32 --seq-len 3 -- COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 1 --rnn-mode 1 --no-hx --no-dhy --no-hy --no-dhx ) -add_custom_test(test_gru_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_gru_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-hx COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-dhy COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-hx --no-dhy @@ -566,7 +579,7 @@ COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-se COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 1 --no-hx --no-dhy --no-hy --no-dhx ) -add_custom_test(test_lstm_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_lstm_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-hx COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-dhy COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-seq 32 32 32 --vector-len 128 --hidden-size 128 --num-layers 1 --in-mode 0 --bias-mode 0 -dir-mode 0 --no-hx --no-dhy @@ -600,7 +613,7 @@ COMMAND $ --verbose --batch-size 32 --seq-len 3 --batch-s ) -add_custom_test(test_conv_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_conv_extra SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE # COMMAND $ --verbose --input 1 1 1 1 --weights 1 1 2 2 --pads_strides_dilations 0 0 3 3 1 1 COMMAND $ --verbose --input 4 1 161 700 --weights 4 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 COMMAND $ --verbose --input 4 1 161 700 --weights 4 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 @@ -617,7 +630,7 @@ COMMAND $ --verbose --input 4 32 14 14 --weights 4 32 5 ) -add_custom_test(test_conv_trans SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_conv_trans SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --input 8 128 28 28 --weights 128 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --cmode trans --pmode default COMMAND $ --verbose --input 8 256 28 28 --weights 256 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --cmode trans --pmode same COMMAND $ --verbose --input 8 32 28 28 --weights 32 32 5 5 --pads_strides_dilations 0 0 2 2 1 1 --cmode trans --pmode default @@ -637,7 +650,7 @@ COMMAND $ --verbose --input 100 6 4 4 --weights 6 4 1 1 ) -add_custom_test(test_conv_3d SKIP_UNLESS_ALL ALLOW_NONXDLOPS +add_custom_test(test_conv_3d SKIP_UNLESS_ALL ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --conv_dim_type conv3d --input 16 32 4 9 9 --weights 64 32 3 3 3 --pads_strides_dilations 0 0 0 2 2 2 1 1 1 --group-count 1 --cmode conv --pmode default COMMAND $ --verbose --conv_dim_type conv3d --input 4 3 4 227 227 --weights 4 3 3 11 11 --pads_strides_dilations 0 0 0 1 1 1 1 1 1 --group-count 1 --cmode conv --pmode default COMMAND $ --verbose --conv_dim_type conv3d --input 16 128 4 56 56 --weights 256 4 3 3 3 --pads_strides_dilations 1 1 1 1 1 1 1 1 1 --group-count 32 --cmode conv --pmode default @@ -761,7 +774,7 @@ COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS_XDLOPS} $ --ver ) if(MIOPEN_TEST_DEEPBENCH) - add_custom_test(test_deepbench_conv ALLOW_NONXDLOPS + add_custom_test(test_deepbench_conv ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --input 4 1 161 700 --weights 32 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 COMMAND $ --verbose --input 8 1 161 700 --weights 32 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 COMMAND $ --verbose --input 16 1 161 700 --weights 32 1 5 20 --pads_strides_dilations 0 0 2 2 1 1 @@ -802,7 +815,7 @@ if(MIOPEN_TEST_DEEPBENCH) endif() if(MIOPEN_TEST_CONV) - add_custom_test(test_miopen_conv ALLOW_NONXDLOPS + add_custom_test(test_miopen_conv ALLOW_NONXDLOPS ALLOW_GEMM_FEATURE COMMAND $ --verbose --input 1 3 32 32 --weights 1 3 7 7 --pads_strides_dilations 1 1 1 1 1 1 COMMAND $ --verbose --input 1 3 227 227 --weights 1 3 7 7 --pads_strides_dilations 1 1 1 1 1 1 COMMAND $ --verbose --input 1 64 56 56 --weights 1 64 1 1 --pads_strides_dilations 0 0 2 2 1 1