From e39aa0444843f1d63b31e4c93cd8079455072e29 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 6 Jul 2023 06:40:57 +0000 Subject: [PATCH 01/28] bg/LWPMIOPEN-194 : ck batch norm test pass --- src/CMakeLists.txt | 1 + .../miopen/batchnorm/problem_description.hpp | 2 + src/include/miopen/batchnorm/solvers.hpp | 13 + src/include/miopen/fusion/solvers.hpp | 54 +++ src/solver/batchnorm/forward_inference_ck.cpp | 420 ++++++++++++++++++ test/gtest/bn.hpp | 67 +++ test/gtest/bn_test_base.hpp | 219 +++++++++ test/gtest/log_test_helper.cpp | 4 +- test/gtest/na.hpp | 185 ++------ test/gtest/na_infer.cpp | 77 +++- test/gtest/test_fusion_plan_base.hpp | 123 +++++ 11 files changed, 997 insertions(+), 168 deletions(-) create mode 100644 src/solver/batchnorm/forward_inference_ck.cpp create mode 100644 test/gtest/bn.hpp create mode 100644 test/gtest/bn_test_base.hpp create mode 100644 test/gtest/test_fusion_plan_base.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ea9a82d120..08290b928d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -149,6 +149,7 @@ set( MIOpen_Source solver/batchnorm/backward_spatial_multiple.cpp solver/batchnorm/backward_spatial_single.cpp solver/batchnorm/forward_inference.cpp + solver/batchnorm/forward_inference_ck.cpp solver/batchnorm/forward_inference_fused.cpp solver/batchnorm/forward_per_activation.cpp solver/batchnorm/forward_per_activation_fused.cpp diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b3af4e1938..224ec0c92d 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -84,6 +84,8 @@ struct ProblemDescription : ProblemDescriptionBase scaleBiasDesc(bnScaleBiasMeanVarDesc_), epsilon(epsilon_) { + in_layout = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); } // Backward diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 64f1e237bf..c7b91fb697 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -132,6 +132,19 @@ struct BnFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; +struct CKBnFwdInference final : BatchnormSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + } // namespace batchnorm } // namespace solver diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp index f5621e72dc..c0b4ca3659 100644 --- a/src/include/miopen/fusion/solvers.hpp +++ b/src/include/miopen/fusion/solvers.hpp @@ -317,6 +317,60 @@ struct BnBwdTrgActivationFused final : FusionSolverBase const FusionDescription& problem) const override; }; +struct PerformanceConfigCKBnFwdInference : PerfConfigBase +{ + int index; + std::string kernel_id; + std::vector valid_kernels; + PerformanceConfigCKBnFwdInference(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id) + { + } + PerformanceConfigCKBnFwdInference() : PerformanceConfigCKBnFwdInference(0, "") {} + PerformanceConfigCKBnFwdInference(bool) : PerformanceConfigCKBnFwdInference(0, "") {} + void HeuristicInit(const FusionDescription& fdesc_problem); + bool SetNextValue(const FusionDescription& fdesc_problem); + bool IsValidValue() const; + bool IsValid(const FusionContext&, const FusionDescription& fdesc_problem) const; + + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + bool operator==(const PerformanceConfigCKBnFwdInference& other) const; + +private: + template + void Init(const miopen::batchnorm::ProblemDescription&); + template + bool CheckIsSupportCKArgs(const miopen::batchnorm::ProblemDescription&) const; +}; + +struct CKBnFwdInference final : FusionTunableSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + PerformanceConfigCKBnFwdInference + GetDefaultPerformanceConfig(const FusionContext& ctx, + const FusionDescription& fdesc_problem) const override; + bool IsValidPerformanceConfig(const FusionContext& ctx, + const FusionDescription& fdesc_problem, + const PerformanceConfigCKBnFwdInference& config) const override; + PerformanceConfigCKBnFwdInference Search(const FusionContext& ctx, + const FusionDescription& fdesc_problem, + const AnyInvokeParams& invoke_ctx) const override; + bool IsApplicable(const FusionContext& ctx, + const FusionDescription& fdesc_problem) const override; + ConvSolution GetSolution(const FusionContext& ctx, + const FusionDescription& fdesc_problem, + const PerformanceConfigCKBnFwdInference& config) const override; + +private: + template + bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription&) const; +}; + } // namespace fusion } // namespace solver } // namespace miopen diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp new file mode 100644 index 0000000000..53765a3e85 --- /dev/null +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -0,0 +1,420 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_INFER) + +namespace miopen { +namespace solver { +namespace fusion { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; +using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +template +using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; + +struct CKArgsBNormFwd +{ + CKArgsBNormFwd(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + xyLengths.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + xyStrides.begin()); + + aligned_scaleBiasMeanVarStrides[0] = 0; + aligned_scaleBiasMeanVarStrides[1] = 1; + aligned_scaleBiasMeanVarStrides[2] = 0; + aligned_scaleBiasMeanVarStrides[3] = 0; + } + + std::array xyLengths; // inOutLengths + std::array xyStrides; // inOutStrides + std::vector invariantDims; + + std::array aligned_scaleBiasMeanVarStrides{3}; + + double epsilon = 0.0001; + std::array reduceDims{0, 1, 2}; +}; + +template +void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDescription& problem) +{ + const auto& args = CKArgsBNormFwd{problem}; + const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + assert(!bn_fwd_ptrs.empty()); + for(const auto& it : bn_fwd_ptrs) + { + auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {nullptr, nullptr, nullptr, nullptr, nullptr}, + {nullptr}, + Normalize{0.0}); + if(it->IsSupportedArgument(argument_ptr.get())) + { + valid_kernels.push_back(it->GetTypeString()); + } + } + + assert(!valid_kernels.empty()); + this->index = 0; + this->kernel_id = valid_kernels[0]; +} + +template +bool PerformanceConfigCKBnFwdInference::CheckIsSupportCKArgs( + const miopen::batchnorm::ProblemDescription& problem) const +{ + const auto& args = CKArgsBNormFwd{problem}; + const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + int i = 0; + for(; i < bn_fwd_ptrs.size(); i++) + { + if(bn_fwd_ptrs[i]->GetTypeString() == this->kernel_id) + { + break; + } + } + if(i == valid_kernels.size()) + { + return false; + } + auto argument_ptr = + bn_fwd_ptrs[i]->MakeArgumentPointer(args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {nullptr, nullptr, nullptr, nullptr, nullptr}, + {nullptr}, + Normalize{0.0}); + return bn_fwd_ptrs[i]->IsSupportedArgument(argument_ptr.get()); +} + +template +bool CKBnFwdInference::CheckCKApplicability( + const miopen::batchnorm::ProblemDescription& problem) const +{ + const auto& args = CKArgsBNormFwd{problem}; + const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + assert(!bn_fwd_ptrs.empty()); + + for(const auto& it : bn_fwd_ptrs) + { + auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {nullptr, nullptr, nullptr, nullptr, nullptr}, + {nullptr}, + Normalize{0.0}); + if(it->IsSupportedArgument(argument_ptr.get())) + return true; + } + return false; +} + +template +void RunCKSolution(const Handle& handle, + const AnyInvokeParams& primitive_parameters, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigCKBnFwdInference& config) +{ + const auto& args = CKArgsBNormFwd{problem}; + + const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + int index = 0; + for(; index < bn_fwd_ptrs.size(); index++) + { + if(bn_fwd_ptrs[index]->GetTypeString() == config.kernel_id) + { + break; + } + } + assert(index < bn_fwd_ptrs.size()); + auto& bn_ptr = bn_fwd_ptrs.at(index); + const auto& invoke_ctx = primitive_parameters.CastTo(); + assert(invoke_ctx.op_args.params[0] != nullptr); + const auto& params = dynamic_cast( + *invoke_ctx.op_args.params[0]); + + auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {invoke_ctx.in, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias}, + {invoke_ctx.out}, + Normalize{params.epsilon}); + + auto invoker_ptr = bn_ptr->MakeInvokerPointer(); + const auto enable_profiling = handle.IsProfilingEnabled(); + + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } +} +#endif + +void PerformanceConfigCKBnFwdInference::HeuristicInit(const FusionDescription& fdesc_problem) +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = fdesc_problem; +#else + const auto& bn_problem = + fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); + switch(bn_problem.GetXDesc().GetType()) + { + case miopenHalf: Init(bn_problem); break; + case miopenInt8: + case miopenFloat: Init(bn_problem); break; + case miopenInt32: + case miopenInt8x4: + case miopenBFloat16: + case miopenDouble: + default: MIOPEN_THROW("Unsupported datatype"); + } + +#endif +} + +bool PerformanceConfigCKBnFwdInference::SetNextValue(const FusionDescription& fdesc_problem) +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = fdesc_problem; + return false; +#else + if(this->valid_kernels.empty()) + { + this->HeuristicInit(fdesc_problem); + assert(!valid_kernels.empty()); + return true; + } + if((this->index + 1) < valid_kernels.size()) + { + ++this->index; + this->kernel_id = this->valid_kernels[index]; + return true; + } + else + return false; +#endif +} + +bool PerformanceConfigCKBnFwdInference::IsValidValue() const +{ + return this->index >= 0 && this->index < valid_kernels.size(); +} + +bool PerformanceConfigCKBnFwdInference::IsValid(const FusionContext&, + const FusionDescription& fdesc_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = fdesc_problem; + return false; +#else + // Extract convolution problem from the fusion context. + const auto& bn_problem = + fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); + switch(bn_problem.GetDXDesc().GetType()) + { + case miopenHalf: return CheckIsSupportCKArgs(bn_problem); + case miopenInt8: + case miopenFloat: return CheckIsSupportCKArgs(bn_problem); + case miopenInt32: + case miopenInt8x4: + case miopenBFloat16: + case miopenDouble: + default: MIOPEN_THROW("Unsupported datatype"); + } + return false; +#endif +} + +bool PerformanceConfigCKBnFwdInference::operator==( + const PerformanceConfigCKBnFwdInference& other) const +{ + return this->kernel_id == other.kernel_id; +} +PerformanceConfigCKBnFwdInference +CKBnFwdInference::GetDefaultPerformanceConfig(const FusionContext&, + const FusionDescription& fdesc_problem) const +{ + PerformanceConfigCKBnFwdInference pp; + pp.HeuristicInit(fdesc_problem); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool CKBnFwdInference::IsValidPerformanceConfig( + const FusionContext& ctx, + const FusionDescription& fdesc_problem, + const PerformanceConfigCKBnFwdInference& config) const +{ + return config.IsValid(ctx, fdesc_problem); +} + +PerformanceConfigCKBnFwdInference CKBnFwdInference::Search(const FusionContext& ctx, + const FusionDescription& fdesc_problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, fdesc_problem, invoke_ctx); +} + +bool CKBnFwdInference::IsApplicable(const FusionContext& ctx, + const FusionDescription& fdesc_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = fdesc_problem; + return false; +#else + const auto& desc = *fdesc_problem.fusion_plan_desc; + if(desc.op_map.empty()) + MIOPEN_THROW(miopenStatusInternalError, "desc.op_map.empty()"); + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{})) + return false; + if(desc.op_map.size() != 1) + return false; + const auto& bn_op = dynamic_cast(*desc.op_map[0]); + if(bn_op.kind() != miopenFusionOpBatchNormInference) + return false; + const auto& bn_problem = + fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); + if(!bn_problem.IsLayoutNHWC()) + return false; + + const std::string arch = ctx.GetStream().GetDeviceName(); + if(arch != "gfx908" && arch != "gfx90a") + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenInt8: + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8x4: + case miopenBFloat16: + case miopenDouble: + default: MIOPEN_THROW("Unsupported datatype"); + } + return false; +#endif +} + +ConvSolution CKBnFwdInference::GetSolution(const FusionContext&, + const FusionDescription& fdesc_problem, + const PerformanceConfigCKBnFwdInference& config) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = fdesc_problem; + std::ignore = config; + return {}; +#else + const auto& bn_problem = + fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); + + ConvSolution result; + result.invoker_factory = [=](const std::vector& kernels) { + std::ignore = kernels; + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem + { + case miopenHalf: + RunCKSolution(handle, primitive_parameters, bn_problem, config); + break; + case miopenInt8: + case miopenFloat: + RunCKSolution(handle, primitive_parameters, bn_problem, config); + break; + case miopenInt32: + case miopenInt8x4: + case miopenBFloat16: + case miopenDouble: + default: MIOPEN_THROW("Unsupported datatype"); + } + }; + }; + return result; +#endif +} + +} // namespace fusion +} // namespace solver +} // namespace miopen diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp new file mode 100644 index 0000000000..eaa374db48 --- /dev/null +++ b/test/gtest/bn.hpp @@ -0,0 +1,67 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include "bn_test_base.hpp" +#include "test_fusion_plan_base.hpp" + +template +struct BNInferTest : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_infer_data.SetUpImpl(bn_config, tensor_layout); + + test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data); + test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data); + } + + void TearDown() override + { + if(test_skipped) + return; + test::FusionPlan::ComputeRefBN(bn_infer_data); + auto&& handle = get_handle(); + bn_infer_data.output.data = + handle.Read(bn_infer_data.out_dev, bn_infer_data.output.data.size()); + test::FusionPlan::ComputeRefBN(bn_infer_data); + test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out); + } + BNTestCase bn_config; + + bool test_skipped = false; + miopen::FusionPlanDescriptor fusePlanDesc; + miopen::OperatorArgs params; + + BNInferSolverTest bn_infer_data; + + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/bn_test_base.hpp b/test/gtest/bn_test_base.hpp new file mode 100644 index 0000000000..2ae1e0aa16 --- /dev/null +++ b/test/gtest/bn_test_base.hpp @@ -0,0 +1,219 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +#include "tensor_util.hpp" +#include "get_handle.hpp" + +struct BNTestCase +{ + size_t N; + size_t C; + size_t H; + size_t W; + miopenBatchNormMode_t mode; + miopen::batchnorm::Direction Direction; + bool save; + bool keepRunning; + + friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc) + { + return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W + << " mode: " << tc.mode << " Direction: " << static_cast(tc.Direction) + << " save: " << tc.save << " keepRunning: " << tc.keepRunning; + } + std::vector GetInput() const { return {N, C, H, W}; } +}; + +std::vector Network1() +{ + // pyt_mlperf_resnet50v1.5 + return { + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; +} + +template +struct BNSolverTestBase +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + bn_config = config; + tensor_layout = t_layout; + CreateTensors(); + InitTensorsWithRandValue(); + SetDirection(); + SetBNMode(); + WriteToGPU(); + } + const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; } + + tensor input; + tensor output; + tensor ref_out; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr out_dev; + + miopen::FusionPlanDescriptor fusePlanDesc; + miopen::OperatorArgs params; + miopen::TensorDescriptor bn_desc; + miopenBatchNormMode_t bn_mode; + miopen::batchnorm::Direction direction; + miopenTensorLayout_t tensor_layout; + TConfig bn_config; + +private: + void CreateTensors() + { + input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; + output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; + ref_out = output; + } + + void InitTensorsWithRandValue() + { + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_int_distribution<> d{0, 100}; + auto gen_value = [&](auto...) { + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + }; + input.generate(gen_value); + } + + void SetDirection() { direction = bn_config.Direction; } + void SetBNMode() { bn_mode = bn_config.mode; } + void WriteToGPU() + { + auto&& handle = get_handle(); + in_dev = handle.Write(input.data); + out_dev = handle.Write(output.data); + } +}; + +template +struct BNInferSolverTest : public BNSolverTestBase +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNSolverTestBase::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor scale; + tensor shift; + tensor estMean; + tensor estVariance; + miopen::Allocator::ManageDataPtr scale_dev; + miopen::Allocator::ManageDataPtr shift_dev; // bias + miopen::Allocator::ManageDataPtr estMean_dev; + miopen::Allocator::ManageDataPtr estVariance_dev; + double epsilon = 1.0e-5; + const float alpha = static_cast(1.0f); + const float beta = static_cast(0); + const float activ_alpha = static_cast(0.5f); + const float activ_beta = static_cast(0.5f); + const float activ_gamma = static_cast(0.5f); + +private: + void CreateTensors() + { + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNSolverTestBase::input.desc, + BNSolverTestBase::bn_mode); + scale = tensor{miopen_type{}, + BNSolverTestBase::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNSolverTestBase::tensor_layout, + derivedBnDesc.GetLengths()}; + estMean = tensor{miopen_type{}, + BNSolverTestBase::tensor_layout, + derivedBnDesc.GetLengths()}; + estVariance = tensor{miopen_type{}, + BNSolverTestBase::tensor_layout, + derivedBnDesc.GetLengths()}; + } + + void InitTensorsWithRandValue() + { + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_int_distribution<> d{0, 100}; + auto gen_value = [&](auto...) { + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + }; + scale.generate(gen_value); + shift.generate(gen_value); + estMean.generate(gen_value); + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + estVariance.generate(gen_var); + } + void WriteToGPU() + { + auto&& handle = get_handle(); + scale_dev = handle.Write(scale.data); + shift_dev = handle.Write(shift.data); + estMean_dev = handle.Write(estMean.data); + estVariance_dev = handle.Write(estVariance.data); + } +}; diff --git a/test/gtest/log_test_helper.cpp b/test/gtest/log_test_helper.cpp index 1c3f521dc4..bcc5e07919 100644 --- a/test/gtest/log_test_helper.cpp +++ b/test/gtest/log_test_helper.cpp @@ -194,7 +194,7 @@ void TestLogFun(std::function - #include -#include -#include -#include -#include - -#include "tensor_util.hpp" -#include "get_handle.hpp" - -struct BNTestCase -{ - size_t N; - size_t C; - size_t H; - size_t W; - miopenBatchNormMode_t mode; - miopen::batchnorm::Direction Direction; - bool save; - bool keepRunning; - friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc) - { - return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W - << " mode: " << tc.mode << " Direction: " << static_cast(tc.Direction) - << " save: " << tc.save << " keepRunning: " << tc.keepRunning; - } - std::vector GetInput() { return {N, C, H, W}; } -}; - -std::vector Network1() -{ - // pyt_mlperf_resnet50v1.5 - return { - {64, 128, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; -} +#include "bn_test_base.hpp" +#include "test_fusion_plan_base.hpp" template -struct BNActivInferTest - : public ::testing::TestWithParam> +struct BNActivInferTest : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - test_skipped = false; - std::tie(activ_mode, bn_config) = GetParam(); - bn_mode = bn_config.mode; - input = tensor{bn_config.GetInput()}; - auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, bn_mode); - scale = tensor{derivedBnDesc.GetLengths()}; - shift = tensor{derivedBnDesc.GetLengths()}; - estMean = tensor{derivedBnDesc.GetLengths()}; - estVariance = tensor{derivedBnDesc.GetLengths()}; - std::random_device rd{}; - std::mt19937 gen{rd()}; - std::uniform_int_distribution<> d{0, 100}; - auto gen_value = [&](auto...) { - return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); - }; - input.generate(gen_value); - scale.generate(gen_value); - shift.generate(gen_value); - estMean.generate(gen_value); - auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; - estVariance.generate(gen_var); - activ_desc = {activ_mode, activ_alpha, activ_beta, activ_gamma}; - output = tensor{bn_config.GetInput()}; - auto&& handle = get_handle(); - std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); - in_dev = handle.Write(input.data); - scale_dev = handle.Write(scale.data); - shift_dev = handle.Write(shift.data); - estMean_dev = handle.Write(estMean.data); - estVariance_dev = handle.Write(estVariance.data); - out_dev = handle.Write(output.data); + test_skipped = false; + std::tie(activ_mode, bn_config, tensor_layout) = GetParam(); + bn_infer_data.SetUpImpl(bn_config, tensor_layout); - // Setup the Fusionplan - fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, input.desc); - auto bnOp = - std::make_shared(bn_mode, bn_desc); - EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess); - bnOp->SetArgs(params, - &alpha, - &beta, - scale_dev.get(), - shift_dev.get(), - estMean_dev.get(), - estVariance_dev.get(), - epsilon); - auto activOp = std::make_shared(activ_desc.GetMode()); - EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess); - activOp->SetArgs(params, &alpha, &beta, activ_alpha, activ_beta, activ_gamma); + test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data); + test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data); + test::FusionPlan::AddActiv(fusePlanDesc, params, bn_infer_data, activ_mode); } void TearDown() override { if(test_skipped) return; - ref_out = tensor{bn_config.GetInput()}; - if(bn_mode == miopenBNPerActivation) - { - batchNormPerActivHostInference( - input, ref_out, scale, shift, epsilon, estMean, estVariance); - } - else - { - batchNormSpatialHostInference( - input, ref_out, scale, shift, epsilon, estMean, estVariance); - } - activationHostInfer( - activ_mode, activ_gamma, activ_beta, activ_alpha, ref_out.data, ref_out.data); + test::FusionPlan::ComputeRefBN(bn_infer_data); + activationHostInfer(activ_mode, + bn_infer_data.activ_gamma, + bn_infer_data.activ_beta, + bn_infer_data.activ_alpha, + bn_infer_data.ref_out.data, + bn_infer_data.ref_out.data); auto&& handle = get_handle(); - output.data = handle.Read(out_dev, output.data.size()); - EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; - EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; - EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) - << "Non finite number found in the GPU data"; - EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - const double tolerance = 80; - double threshold = std::numeric_limits::epsilon() * tolerance; - auto error = miopen::rms_range(ref_out, output); - EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) - << "Non finite number found in the CPU data"; - EXPECT_TRUE(error < threshold) - << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + bn_infer_data.output.data = + handle.Read(bn_infer_data.out_dev, bn_infer_data.output.data.size()); + test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out); } BNTestCase bn_config; - miopen::TensorDescriptor bn_desc; - miopen::ActivationDescriptor activ_desc; - miopenBatchNormMode_t bn_mode; - tensor input; - tensor output; - tensor ref_out; - tensor scale; - tensor shift; - tensor estMean; - tensor estVariance; - miopen::Allocator::ManageDataPtr in_dev; - miopen::Allocator::ManageDataPtr out_dev; - miopen::Allocator::ManageDataPtr scale_dev; - miopen::Allocator::ManageDataPtr shift_dev; - miopen::Allocator::ManageDataPtr estMean_dev; - miopen::Allocator::ManageDataPtr estVariance_dev; + bool test_skipped = false; miopenActivationMode_t activ_mode; miopen::FusionPlanDescriptor fusePlanDesc; miopen::OperatorArgs params; - const float alpha = static_cast(1.0f); - const float beta = static_cast(0); - const float activ_alpha = static_cast(0.5f); - const float activ_beta = static_cast(0.5f); - const float activ_gamma = static_cast(0.5f); - double epsilon = 1.0e-5; + + BNInferSolverTest bn_infer_data; + + miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp index 8716a03348..b1a536ded0 100644 --- a/test/gtest/na_infer.cpp +++ b/test/gtest/na_infer.cpp @@ -28,6 +28,7 @@ #include #include "na.hpp" +#include "bn.hpp" struct BNActivInferFloat : BNActivInferTest { @@ -37,6 +38,14 @@ struct BNActivInferHalf : BNActivInferTest { }; +struct BNInferFloat : BNInferTest +{ +}; + +struct BNInferHalf : BNInferTest +{ +}; + template void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc, const miopen::fusion::FusionInvokeParams& plan_params, @@ -61,22 +70,77 @@ void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc, (invoker)(handle, plan_params); handle.Finish(); } + +template +void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc, + const std::unique_ptr& plan_params, + const TestCase& config, + bool& test_skipped) +{ + auto& handle = get_handle(); + Solver solv{}; + const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc}; + auto fusion_ctx = miopen::FusionContext{handle}; + fusion_ctx.DetectRocm(); + if(!solv.IsApplicable(fusion_ctx, fusion_problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() << " Not Applicable" << config; + } + ASSERT_TRUE(solv.IsApplicable(fusion_ctx, fusion_problem)); + auto sol = solv.GetSolution( + fusion_ctx, fusion_problem, solv.GetDefaultPerformanceConfig(fusion_ctx, fusion_problem)); + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, *(plan_params.get())); + handle.Finish(); +} + TEST_P(BNActivInferFloat, BnFwdInferActivationFused) { - const auto plan_params = miopen::fusion::FusionInvokeParams( - params, input.desc, in_dev.get(), output.desc, out_dev.get(), false); + const auto plan_params = miopen::fusion::FusionInvokeParams(params, + bn_infer_data.input.desc, + bn_infer_data.in_dev.get(), + bn_infer_data.output.desc, + bn_infer_data.out_dev.get(), + false); RunSolver( fusePlanDesc, plan_params, bn_config, test_skipped); } +TEST_P(BNInferFloat, CKBnFwdInference) +{ + const auto plan_params = + std::make_unique(params, + bn_infer_data.input.desc, + bn_infer_data.in_dev.get(), + bn_infer_data.output.desc, + bn_infer_data.out_dev.get(), + false); + RunTunableSolver( + fusePlanDesc, plan_params, bn_config, test_skipped); +} + +INSTANTIATE_TEST_SUITE_P(BNInferFloatSuite, + BNInferFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + INSTANTIATE_TEST_SUITE_P(BNActivInferFloatSuite, BNActivInferFloat, testing::Combine(testing::Values(miopenActivationRELU), - testing::ValuesIn(Network1()))); + testing::ValuesIn(Network1()), + testing::Values(miopenTensorNCHW))); + TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused) { - const auto plan_params = miopen::fusion::FusionInvokeParams( - params, input.desc, in_dev.get(), output.desc, out_dev.get(), false); + const auto plan_params = miopen::fusion::FusionInvokeParams(params, + bn_infer_data.input.desc, + bn_infer_data.in_dev.get(), + bn_infer_data.output.desc, + bn_infer_data.out_dev.get(), + false); RunSolver( fusePlanDesc, plan_params, bn_config, test_skipped); } @@ -84,4 +148,5 @@ TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused) INSTANTIATE_TEST_SUITE_P(BNActivInferHalfSuite, BNActivInferHalf, testing::Combine(testing::Values(miopenActivationRELU), - testing::ValuesIn(Network1()))); + testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_fusion_plan_base.hpp b/test/gtest/test_fusion_plan_base.hpp new file mode 100644 index 0000000000..e4b46ca1f6 --- /dev/null +++ b/test/gtest/test_fusion_plan_base.hpp @@ -0,0 +1,123 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "bn_test_base.hpp" +namespace test { +namespace FusionPlan { +template +void InitFusionPlan(miopen::FusionPlanDescriptor& fusePlanDesc, DLModule& dl_module) +{ + fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, dl_module.GetInputDesc()); +} + +template +void AddBnInfer(miopen::FusionPlanDescriptor& fusePlanDesc, + miopen::OperatorArgs& params, + DLModule& dl_module) +{ + auto bnOp = std::make_shared(dl_module.bn_mode, + dl_module.bn_desc); + EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess); + bnOp->SetArgs(params, + &dl_module.alpha, + &dl_module.beta, + dl_module.scale_dev.get(), + dl_module.shift_dev.get(), + dl_module.estMean_dev.get(), + dl_module.estVariance_dev.get(), + dl_module.epsilon); +} + +template +void AddActiv(miopen::FusionPlanDescriptor& fusePlanDesc, + miopen::OperatorArgs& params, + DLModule& dl_module, + miopenActivationMode_t activ_mode) +{ + auto activOp = std::make_shared(activ_mode); + EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess); + EXPECT_EQ(activOp->SetArgs(params, + &dl_module.alpha, + &dl_module.beta, + dl_module.activ_alpha, + dl_module.activ_beta, + dl_module.activ_gamma), + miopenStatusSuccess); +} + +bool Skip(miopen::Handle& handle) +{ + const std::string arch = handle.GetDeviceName(); + bool skip_test = (arch != "gfx908" && arch != "gfx90a"); + + return skip_test; +} + +template +void ComputeRefBN(DLModule& dl_module) +{ + if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActivHostInference(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } + else + { + batchNormSpatialHostInference(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } +} + +template +void BnCmpare(const tensor& output, const tensor& ref_out) +{ + EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; + EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; + EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) + << "Non finite number found in the GPU data"; + EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); + const double tolerance = 80; + double threshold = std::numeric_limits::epsilon() * tolerance; + auto error = miopen::rms_range(ref_out, output); + EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; +} + +} // namespace FusionPlan +} // namespace test From 766d06014dfd0eabe6a3e9732e2c03a3f4352703 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 24 Jul 2023 14:45:12 +0000 Subject: [PATCH 02/28] bg/LWPMIOPEN-194 : removed stale code --- src/include/miopen/batchnorm/solvers.hpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index c7b91fb697..64f1e237bf 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -132,19 +132,6 @@ struct BnFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; -struct CKBnFwdInference final : BatchnormSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; -}; - } // namespace batchnorm } // namespace solver From 75d8d95a185cef1f15e8a32fdd0892ecad307c24 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 24 Jul 2023 14:53:24 +0000 Subject: [PATCH 03/28] bg/LWPMIOPEN-194 : minor mixes --- test/gtest/bn.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index eaa374db48..87f504bef2 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2022 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,7 +48,6 @@ struct BNInferTest : public ::testing::TestWithParam(bn_infer_data.out_dev, bn_infer_data.output.data.size()); From 171fa3a7f7b963f352981dfd2dc8d4f7f4734570 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 29 Aug 2023 15:24:46 +0000 Subject: [PATCH 04/28] bg/LWPMIOPEN-194 : fix rotate lens to pass NHWC layout is CK --- src/solver/batchnorm/forward_inference_ck.cpp | 16 +++++++++------- test/gtest/na_infer.cpp | 1 - 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 53765a3e85..c0f7909018 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL #include @@ -69,15 +70,18 @@ struct CKArgsBNormFwd std::copy(problem.GetXDesc().GetStrides().begin(), problem.GetXDesc().GetStrides().end(), xyStrides.begin()); + // prep for CK + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); aligned_scaleBiasMeanVarStrides[0] = 0; - aligned_scaleBiasMeanVarStrides[1] = 1; + aligned_scaleBiasMeanVarStrides[1] = 0; aligned_scaleBiasMeanVarStrides[2] = 0; - aligned_scaleBiasMeanVarStrides[3] = 0; + aligned_scaleBiasMeanVarStrides[3] = 1; } - std::array xyLengths; // inOutLengths - std::array xyStrides; // inOutStrides + std::array xyLengths; + std::array xyStrides; std::vector invariantDims; std::array aligned_scaleBiasMeanVarStrides{3}; @@ -358,9 +362,7 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx, fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); if(!bn_problem.IsLayoutNHWC()) return false; - - const std::string arch = ctx.GetStream().GetDeviceName(); - if(arch != "gfx908" && arch != "gfx90a") + if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) return false; switch(bn_problem.GetXDesc().GetType()) diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp index 61c2c51021..2cc0f5f118 100644 --- a/test/gtest/na_infer.cpp +++ b/test/gtest/na_infer.cpp @@ -80,7 +80,6 @@ void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc, Solver solv{}; const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc}; auto fusion_ctx = miopen::FusionContext{handle}; - fusion_ctx.DetectRocm(); if(!solv.IsApplicable(fusion_ctx, fusion_problem)) { test_skipped = true; From 619fe9ee445b0bf75010432ab725ce8451d37b64 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 29 Aug 2023 19:12:37 -0700 Subject: [PATCH 05/28] fix clang format issue --- src/solver/batchnorm/forward_inference_ck.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index c0f7909018..78f2a5cf7b 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -71,7 +71,7 @@ struct CKArgsBNormFwd problem.GetXDesc().GetStrides().end(), xyStrides.begin()); // prep for CK - std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); aligned_scaleBiasMeanVarStrides[0] = 0; @@ -80,8 +80,8 @@ struct CKArgsBNormFwd aligned_scaleBiasMeanVarStrides[3] = 1; } - std::array xyLengths; - std::array xyStrides; + std::array xyLengths; + std::array xyStrides; std::vector invariantDims; std::array aligned_scaleBiasMeanVarStrides{3}; From 279664b20383cd785dbdd845fecfc7645b560cb4 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sun, 3 Sep 2023 00:06:18 +0000 Subject: [PATCH 06/28] bg/LWPMIOPEN-194: inhert from non-tunable solver --- src/include/miopen/batchnorm/solvers.hpp | 14 + src/include/miopen/fusion/solvers.hpp | 54 ---- src/ocl/batchnormocl.cpp | 3 +- src/solver.cpp | 1 + src/solver/batchnorm/forward_inference.cpp | 6 +- src/solver/batchnorm/forward_inference_ck.cpp | 258 +++--------------- test/gtest/bn.hpp | 40 ++- test/gtest/bn_infer.cpp | 42 +++ .../{bn_test_base.hpp => bn_test_data.hpp} | 23 +- test/gtest/na.hpp | 185 +++++++++++-- test/gtest/na_infer.cpp | 76 +----- ...sion_plan_base.hpp => test_operations.hpp} | 75 +++-- 12 files changed, 330 insertions(+), 447 deletions(-) create mode 100644 test/gtest/bn_infer.cpp rename test/gtest/{bn_test_base.hpp => bn_test_data.hpp} (92%) rename test/gtest/{test_fusion_plan_base.hpp => test_operations.hpp} (88%) diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 64f1e237bf..6cf494ce37 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -132,6 +132,20 @@ struct BnFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; +// ----------- start BnCKFwdInference --------------- +struct BnCKFwdInference final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + +// ----------- end BnCKFwdInference --------------- + + } // namespace batchnorm } // namespace solver diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp index c0b4ca3659..f5621e72dc 100644 --- a/src/include/miopen/fusion/solvers.hpp +++ b/src/include/miopen/fusion/solvers.hpp @@ -317,60 +317,6 @@ struct BnBwdTrgActivationFused final : FusionSolverBase const FusionDescription& problem) const override; }; -struct PerformanceConfigCKBnFwdInference : PerfConfigBase -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigCKBnFwdInference(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigCKBnFwdInference() : PerformanceConfigCKBnFwdInference(0, "") {} - PerformanceConfigCKBnFwdInference(bool) : PerformanceConfigCKBnFwdInference(0, "") {} - void HeuristicInit(const FusionDescription& fdesc_problem); - bool SetNextValue(const FusionDescription& fdesc_problem); - bool IsValidValue() const; - bool IsValid(const FusionContext&, const FusionDescription& fdesc_problem) const; - - template - static void Visit(Self&& s, F f) - { - f(s.kernel_id, "kernel_id"); - } - bool operator==(const PerformanceConfigCKBnFwdInference& other) const; - -private: - template - void Init(const miopen::batchnorm::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::batchnorm::ProblemDescription&) const; -}; - -struct CKBnFwdInference final : FusionTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - PerformanceConfigCKBnFwdInference - GetDefaultPerformanceConfig(const FusionContext& ctx, - const FusionDescription& fdesc_problem) const override; - bool IsValidPerformanceConfig(const FusionContext& ctx, - const FusionDescription& fdesc_problem, - const PerformanceConfigCKBnFwdInference& config) const override; - PerformanceConfigCKBnFwdInference Search(const FusionContext& ctx, - const FusionDescription& fdesc_problem, - const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const FusionContext& ctx, - const FusionDescription& fdesc_problem) const override; - ConvSolution GetSolution(const FusionContext& ctx, - const FusionDescription& fdesc_problem, - const PerformanceConfigCKBnFwdInference& config) const override; - -private: - template - bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription&) const; -}; - } // namespace fusion } // namespace solver } // namespace miopen diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 5c8aac5386..1025509da4 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -222,7 +222,8 @@ void BatchNormForwardInference(Handle& handle, }(); const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; - const auto solvers = solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/src/solver.cpp b/src/solver.cpp index 89423b495d..48020d11b7 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -561,6 +561,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); RegisterWithSolver( registry, ++id, ConvHipImplicitGemmGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index 505349bedd..3a26673301 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -41,7 +41,11 @@ namespace batchnorm { bool BnFwdInference::IsApplicable(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::batchnorm::Direction::ForwardInference; + if(problem.IsLayoutNHWC()) + return false; + if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) + return false; + return true; } ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index c0f7909018..1493f46d51 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -25,24 +25,19 @@ * *******************************************************************************/ -#include -#include -#include -#include -#include -#include +#include #include -#include -#include +#include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_INFER) namespace miopen { namespace solver { -namespace fusion { +namespace batchnorm { #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL using PassThrough = ck::tensor_operation::element_wise::PassThrough; @@ -91,12 +86,14 @@ struct CKArgsBNormFwd }; template -void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDescription& problem) +int CheckCKApplicability( + const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); assert(!bn_fwd_ptrs.empty()); + int count = 0; for(const auto& it : bn_fwd_ptrs) { auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, @@ -109,103 +106,28 @@ void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDes {nullptr, nullptr, nullptr, nullptr, nullptr}, {nullptr}, Normalize{0.0}); - if(it->IsSupportedArgument(argument_ptr.get())) - { - valid_kernels.push_back(it->GetTypeString()); - } - } - - assert(!valid_kernels.empty()); - this->index = 0; - this->kernel_id = valid_kernels[0]; -} - -template -bool PerformanceConfigCKBnFwdInference::CheckIsSupportCKArgs( - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& args = CKArgsBNormFwd{problem}; - const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - int i = 0; - for(; i < bn_fwd_ptrs.size(); i++) - { - if(bn_fwd_ptrs[i]->GetTypeString() == this->kernel_id) - { - break; + if(it->IsSupportedArgument(argument_ptr.get())){ + return count; } + count++; } - if(i == valid_kernels.size()) - { - return false; - } - auto argument_ptr = - bn_fwd_ptrs[i]->MakeArgumentPointer(args.xyLengths, - {args.xyStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides}, - {args.xyStrides}, - {nullptr, nullptr, nullptr, nullptr, nullptr}, - {nullptr}, - Normalize{0.0}); - return bn_fwd_ptrs[i]->IsSupportedArgument(argument_ptr.get()); -} - -template -bool CKBnFwdInference::CheckCKApplicability( - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& args = CKArgsBNormFwd{problem}; - const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - assert(!bn_fwd_ptrs.empty()); - - for(const auto& it : bn_fwd_ptrs) - { - auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, - {args.xyStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides}, - {args.xyStrides}, - {nullptr, nullptr, nullptr, nullptr, nullptr}, - {nullptr}, - Normalize{0.0}); - if(it->IsSupportedArgument(argument_ptr.get())) - return true; - } - return false; + return -1; } template -void RunCKSolution(const Handle& handle, +static void RunCKSolution(const Handle& handle, const AnyInvokeParams& primitive_parameters, - const miopen::batchnorm::ProblemDescription& problem, - const PerformanceConfigCKBnFwdInference& config) + const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); - int index = 0; - for(; index < bn_fwd_ptrs.size(); index++) - { - if(bn_fwd_ptrs[index]->GetTypeString() == config.kernel_id) - { - break; - } - } - assert(index < bn_fwd_ptrs.size()); - auto& bn_ptr = bn_fwd_ptrs.at(index); - const auto& invoke_ctx = primitive_parameters.CastTo(); - assert(invoke_ctx.op_args.params[0] != nullptr); - const auto& params = dynamic_cast( - *invoke_ctx.op_args.params[0]); + int kernel_index = CheckCKApplicability(problem); + assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size()); + auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); + const auto& params = primitive_parameters.CastTo(); auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, {args.xyStrides, @@ -214,12 +136,12 @@ void RunCKSolution(const Handle& handle, args.aligned_scaleBiasMeanVarStrides, args.aligned_scaleBiasMeanVarStrides}, {args.xyStrides}, - {invoke_ctx.in, + {params.x, params.estimatedMean, params.estimatedVariance, params.bnScale, params.bnBias}, - {invoke_ctx.out}, + {params.y}, Normalize{params.epsilon}); auto invoker_ptr = bn_ptr->MakeInvokerPointer(); @@ -235,131 +157,16 @@ void RunCKSolution(const Handle& handle, } #endif -void PerformanceConfigCKBnFwdInference::HeuristicInit(const FusionDescription& fdesc_problem) -{ -#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = fdesc_problem; -#else - const auto& bn_problem = - fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); - switch(bn_problem.GetXDesc().GetType()) - { - case miopenHalf: Init(bn_problem); break; - case miopenInt8: - case miopenFloat: Init(bn_problem); break; - case miopenInt32: - case miopenInt8x4: - case miopenBFloat16: - case miopenDouble: - default: MIOPEN_THROW("Unsupported datatype"); - } - -#endif -} - -bool PerformanceConfigCKBnFwdInference::SetNextValue(const FusionDescription& fdesc_problem) -{ -#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = fdesc_problem; - return false; -#else - if(this->valid_kernels.empty()) - { - this->HeuristicInit(fdesc_problem); - assert(!valid_kernels.empty()); - return true; - } - if((this->index + 1) < valid_kernels.size()) - { - ++this->index; - this->kernel_id = this->valid_kernels[index]; - return true; - } - else - return false; -#endif -} - -bool PerformanceConfigCKBnFwdInference::IsValidValue() const -{ - return this->index >= 0 && this->index < valid_kernels.size(); -} - -bool PerformanceConfigCKBnFwdInference::IsValid(const FusionContext&, - const FusionDescription& fdesc_problem) const -{ -#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = fdesc_problem; - return false; -#else - // Extract convolution problem from the fusion context. - const auto& bn_problem = - fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); - switch(bn_problem.GetDXDesc().GetType()) - { - case miopenHalf: return CheckIsSupportCKArgs(bn_problem); - case miopenInt8: - case miopenFloat: return CheckIsSupportCKArgs(bn_problem); - case miopenInt32: - case miopenInt8x4: - case miopenBFloat16: - case miopenDouble: - default: MIOPEN_THROW("Unsupported datatype"); - } - return false; -#endif -} - -bool PerformanceConfigCKBnFwdInference::operator==( - const PerformanceConfigCKBnFwdInference& other) const -{ - return this->kernel_id == other.kernel_id; -} -PerformanceConfigCKBnFwdInference -CKBnFwdInference::GetDefaultPerformanceConfig(const FusionContext&, - const FusionDescription& fdesc_problem) const -{ - PerformanceConfigCKBnFwdInference pp; - pp.HeuristicInit(fdesc_problem); - MIOPEN_LOG_I(pp.ToString()); - return pp; -} - -bool CKBnFwdInference::IsValidPerformanceConfig( - const FusionContext& ctx, - const FusionDescription& fdesc_problem, - const PerformanceConfigCKBnFwdInference& config) const -{ - return config.IsValid(ctx, fdesc_problem); -} - -PerformanceConfigCKBnFwdInference CKBnFwdInference::Search(const FusionContext& ctx, - const FusionDescription& fdesc_problem, - const AnyInvokeParams& invoke_ctx) const -{ - return GenericSearch(*this, ctx, fdesc_problem, invoke_ctx); -} - -bool CKBnFwdInference::IsApplicable(const FusionContext& ctx, - const FusionDescription& fdesc_problem) const +bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = ctx; std::ignore = fdesc_problem; return false; #else - const auto& desc = *fdesc_problem.fusion_plan_desc; - if(desc.op_map.empty()) - MIOPEN_THROW(miopenStatusInternalError, "desc.op_map.empty()"); if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{})) return false; - if(desc.op_map.size() != 1) - return false; - const auto& bn_op = dynamic_cast(*desc.op_map[0]); - if(bn_op.kind() != miopenFusionOpBatchNormInference) - return false; - const auto& bn_problem = - fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); if(!bn_problem.IsLayoutNHWC()) return false; if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) @@ -367,9 +174,9 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx, switch(bn_problem.GetXDesc().GetType()) { - case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); case miopenInt8: - case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -380,30 +187,29 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx, #endif } -ConvSolution CKBnFwdInference::GetSolution(const FusionContext&, - const FusionDescription& fdesc_problem, - const PerformanceConfigCKBnFwdInference& config) const +ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = fdesc_problem; - std::ignore = config; + std::ignore = context; + std::ignore = bn_problem; return {}; #else - const auto& bn_problem = - fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference); + std::ignore = context; ConvSolution result; result.invoker_factory = [=](const std::vector& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem { case miopenHalf: - RunCKSolution(handle, primitive_parameters, bn_problem, config); + RunCKSolution(handle, primitive_parameters, bn_problem); break; case miopenInt8: case miopenFloat: - RunCKSolution(handle, primitive_parameters, bn_problem, config); + RunCKSolution(handle, primitive_parameters, bn_problem); break; case miopenInt32: case miopenInt8x4: @@ -417,6 +223,6 @@ ConvSolution CKBnFwdInference::GetSolution(const FusionContext&, #endif } -} // namespace fusion +} // namespace batchnorm } // namespace solver } // namespace miopen diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 87f504bef2..fd9fcc8f85 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -25,10 +25,11 @@ *******************************************************************************/ #pragma once +#include #include -#include "bn_test_base.hpp" -#include "test_fusion_plan_base.hpp" +#include "bn_test_data.hpp" +#include "test_operations.hpp" template struct BNInferTest : public ::testing::TestWithParam> @@ -38,10 +39,24 @@ struct BNInferTest : public ::testing::TestWithParam(bn_infer_data.out_dev, bn_infer_data.output.data.size()); - test::FusionPlan::ComputeRefBN(bn_infer_data); - test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out); + bn_infer_test_data.output.data = + handle.Read(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); + test::ComputeCPUBNInference(bn_infer_test_data); + test::BnCmpare(bn_infer_test_data.output, bn_infer_test_data.ref_out); } BNTestCase bn_config; - bool test_skipped = false; - miopen::FusionPlanDescriptor fusePlanDesc; - miopen::OperatorArgs params; - - BNInferSolverTest bn_infer_data; - + BNInferTestData bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp new file mode 100644 index 0000000000..6df3d547a5 --- /dev/null +++ b/test/gtest/bn_infer.cpp @@ -0,0 +1,42 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNInferTestFloat : BNInferTest +{ +}; + +TEST_P(BNInferTestFloat, BnFwdInferCK) +{ + +} + + +INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, + BNInferTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_test_base.hpp b/test/gtest/bn_test_data.hpp similarity index 92% rename from test/gtest/bn_test_base.hpp rename to test/gtest/bn_test_data.hpp index 2ae1e0aa16..689cb1d058 100644 --- a/test/gtest/bn_test_base.hpp +++ b/test/gtest/bn_test_data.hpp @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -90,7 +91,7 @@ std::vector Network1() } template -struct BNSolverTestBase +struct BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) { @@ -148,11 +149,11 @@ struct BNSolverTestBase }; template -struct BNInferSolverTest : public BNSolverTestBase +struct BNInferTestData : public BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) { - BNSolverTestBase::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl(config, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -167,8 +168,8 @@ struct BNInferSolverTest : public BNSolverTestBase miopen::Allocator::ManageDataPtr estMean_dev; miopen::Allocator::ManageDataPtr estVariance_dev; double epsilon = 1.0e-5; - const float alpha = static_cast(1.0f); - const float beta = static_cast(0); + float alpha = static_cast(1.0f); + float beta = static_cast(0); const float activ_alpha = static_cast(0.5f); const float activ_beta = static_cast(0.5f); const float activ_gamma = static_cast(0.5f); @@ -178,19 +179,19 @@ struct BNInferSolverTest : public BNSolverTestBase { auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNSolverTestBase::input.desc, - BNSolverTestBase::bn_mode); + BNTestData::input.desc, + BNTestData::bn_mode); scale = tensor{miopen_type{}, - BNSolverTestBase::tensor_layout, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; shift = tensor{miopen_type{}, - BNSolverTestBase::tensor_layout, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; estMean = tensor{miopen_type{}, - BNSolverTestBase::tensor_layout, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; estVariance = tensor{miopen_type{}, - BNSolverTestBase::tensor_layout, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; } diff --git a/test/gtest/na.hpp b/test/gtest/na.hpp index 1377a93038..02bc98d0bc 100644 --- a/test/gtest/na.hpp +++ b/test/gtest/na.hpp @@ -25,51 +25,186 @@ *******************************************************************************/ #pragma once +#include + #include +#include +#include +#include +#include + +#include "tensor_util.hpp" +#include "get_handle.hpp" + +struct BNTestCase +{ + size_t N; + size_t C; + size_t H; + size_t W; + miopenBatchNormMode_t mode; + miopen::batchnorm::Direction Direction; + bool save; + bool keepRunning; -#include "bn_test_base.hpp" -#include "test_fusion_plan_base.hpp" + friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc) + { + return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W + << " mode: " << tc.mode << " Direction: " << static_cast(tc.Direction) + << " save: " << tc.save << " keepRunning: " << tc.keepRunning; + } + std::vector GetInput() { return {N, C, H, W}; } +}; + +std::vector Network1() +{ + // pyt_mlperf_resnet50v1.5 + return { + {64, 128, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; +} template -struct BNActivInferTest : public ::testing::TestWithParam< - std::tuple> +struct BNActivInferTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - test_skipped = false; - std::tie(activ_mode, bn_config, tensor_layout) = GetParam(); - bn_infer_data.SetUpImpl(bn_config, tensor_layout); + test_skipped = false; + std::tie(activ_mode, bn_config) = GetParam(); + bn_mode = bn_config.mode; + input = tensor{bn_config.GetInput()}; + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, bn_mode); + scale = tensor{derivedBnDesc.GetLengths()}; + shift = tensor{derivedBnDesc.GetLengths()}; + estMean = tensor{derivedBnDesc.GetLengths()}; + estVariance = tensor{derivedBnDesc.GetLengths()}; + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_int_distribution<> d{0, 100}; + auto gen_value = [&](auto...) { + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + }; + input.generate(gen_value); + scale.generate(gen_value); + shift.generate(gen_value); + estMean.generate(gen_value); + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + estVariance.generate(gen_var); + activ_desc = {activ_mode, activ_alpha, activ_beta, activ_gamma}; + output = tensor{bn_config.GetInput()}; + auto&& handle = get_handle(); + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + in_dev = handle.Write(input.data); + scale_dev = handle.Write(scale.data); + shift_dev = handle.Write(shift.data); + estMean_dev = handle.Write(estMean.data); + estVariance_dev = handle.Write(estVariance.data); + out_dev = handle.Write(output.data); - test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data); - test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data); - test::FusionPlan::AddActiv(fusePlanDesc, params, bn_infer_data, activ_mode); + // Setup the Fusionplan + fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, input.desc); + auto bnOp = + std::make_shared(bn_mode, bn_desc); + EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess); + bnOp->SetArgs(params, + &alpha, + &beta, + scale_dev.get(), + shift_dev.get(), + estMean_dev.get(), + estVariance_dev.get(), + epsilon); + auto activOp = std::make_shared(activ_desc.GetMode()); + EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess); + activOp->SetArgs(params, &alpha, &beta, activ_alpha, activ_beta, activ_gamma); } void TearDown() override { if(test_skipped) return; - test::FusionPlan::ComputeRefBN(bn_infer_data); - activationHostInfer(activ_mode, - bn_infer_data.activ_gamma, - bn_infer_data.activ_beta, - bn_infer_data.activ_alpha, - bn_infer_data.ref_out.data, - bn_infer_data.ref_out.data); + ref_out = tensor{bn_config.GetInput()}; + if(bn_mode == miopenBNPerActivation) + { + batchNormPerActivHostInference( + input, ref_out, scale, shift, epsilon, estMean, estVariance); + } + else + { + batchNormSpatialHostInference( + input, ref_out, scale, shift, epsilon, estMean, estVariance); + } + activationHostInfer( + activ_mode, activ_gamma, activ_beta, activ_alpha, ref_out.data, ref_out.data); auto&& handle = get_handle(); - bn_infer_data.output.data = - handle.Read(bn_infer_data.out_dev, bn_infer_data.output.data.size()); - test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out); + output.data = handle.Read(out_dev, output.data.size()); + EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; + EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; + EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) + << "Non finite number found in the GPU data"; + EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); + const double tolerance = 80; + double threshold = std::numeric_limits::epsilon() * tolerance; + auto error = miopen::rms_range(ref_out, output); + EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } BNTestCase bn_config; - + miopen::TensorDescriptor bn_desc; + miopen::ActivationDescriptor activ_desc; + miopenBatchNormMode_t bn_mode; + tensor input; + tensor output; + tensor ref_out; + tensor scale; + tensor shift; + tensor estMean; + tensor estVariance; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopen::Allocator::ManageDataPtr scale_dev; + miopen::Allocator::ManageDataPtr shift_dev; + miopen::Allocator::ManageDataPtr estMean_dev; + miopen::Allocator::ManageDataPtr estVariance_dev; bool test_skipped = false; miopenActivationMode_t activ_mode; miopen::FusionPlanDescriptor fusePlanDesc; miopen::OperatorArgs params; - - BNInferSolverTest bn_infer_data; - - miopenTensorLayout_t tensor_layout; + const float alpha = static_cast(1.0f); + const float beta = static_cast(0); + const float activ_alpha = static_cast(0.5f); + const float activ_beta = static_cast(0.5f); + const float activ_gamma = static_cast(0.5f); + double epsilon = 1.0e-5; }; diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp index 2cc0f5f118..f0d3407b2b 100644 --- a/test/gtest/na_infer.cpp +++ b/test/gtest/na_infer.cpp @@ -28,7 +28,6 @@ #include #include "na.hpp" -#include "bn.hpp" struct BNActivInferFloat : BNActivInferTest { @@ -38,14 +37,6 @@ struct BNActivInferHalf : BNActivInferTest { }; -struct BNInferFloat : BNInferTest -{ -}; - -struct BNInferHalf : BNInferTest -{ -}; - template void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc, const miopen::fusion::FusionInvokeParams& plan_params, @@ -69,76 +60,22 @@ void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc, (invoker)(handle, plan_params); handle.Finish(); } - -template -void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc, - const std::unique_ptr& plan_params, - const TestCase& config, - bool& test_skipped) -{ - auto& handle = get_handle(); - Solver solv{}; - const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc}; - auto fusion_ctx = miopen::FusionContext{handle}; - if(!solv.IsApplicable(fusion_ctx, fusion_problem)) - { - test_skipped = true; - GTEST_SKIP() << solv.SolverDbId() << " Not Applicable" << config; - } - ASSERT_TRUE(solv.IsApplicable(fusion_ctx, fusion_problem)); - auto sol = solv.GetSolution( - fusion_ctx, fusion_problem, solv.GetDefaultPerformanceConfig(fusion_ctx, fusion_problem)); - ASSERT_TRUE(sol.Succeeded()); - ASSERT_TRUE(sol.invoker_factory); - const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); - (invoker)(handle, *(plan_params.get())); - handle.Finish(); -} - TEST_P(BNActivInferFloat, BnFwdInferActivationFused) { - const auto plan_params = miopen::fusion::FusionInvokeParams(params, - bn_infer_data.input.desc, - bn_infer_data.in_dev.get(), - bn_infer_data.output.desc, - bn_infer_data.out_dev.get(), - false); + const auto plan_params = miopen::fusion::FusionInvokeParams( + params, input.desc, in_dev.get(), output.desc, out_dev.get(), false); RunSolver( fusePlanDesc, plan_params, bn_config, test_skipped); } -TEST_P(BNInferFloat, CKBnFwdInference) -{ - const auto plan_params = - std::make_unique(params, - bn_infer_data.input.desc, - bn_infer_data.in_dev.get(), - bn_infer_data.output.desc, - bn_infer_data.out_dev.get(), - false); - RunTunableSolver( - fusePlanDesc, plan_params, bn_config, test_skipped); -} - -INSTANTIATE_TEST_SUITE_P(BNInferFloatSuite, - BNInferFloat, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); - INSTANTIATE_TEST_SUITE_P(BNActivInferFloatSuite, BNActivInferFloat, testing::Combine(testing::Values(miopenActivationRELU), - testing::ValuesIn(Network1()), - testing::Values(miopenTensorNCHW))); - + testing::ValuesIn(Network1()))); TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused) { - const auto plan_params = miopen::fusion::FusionInvokeParams(params, - bn_infer_data.input.desc, - bn_infer_data.in_dev.get(), - bn_infer_data.output.desc, - bn_infer_data.out_dev.get(), - false); + const auto plan_params = miopen::fusion::FusionInvokeParams( + params, input.desc, in_dev.get(), output.desc, out_dev.get(), false); RunSolver( fusePlanDesc, plan_params, bn_config, test_skipped); } @@ -146,5 +83,4 @@ TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused) INSTANTIATE_TEST_SUITE_P(BNActivInferHalfSuite, BNActivInferHalf, testing::Combine(testing::Values(miopenActivationRELU), - testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + testing::ValuesIn(Network1()))); diff --git a/test/gtest/test_fusion_plan_base.hpp b/test/gtest/test_operations.hpp similarity index 88% rename from test/gtest/test_fusion_plan_base.hpp rename to test/gtest/test_operations.hpp index e4b46ca1f6..398aaf406a 100644 --- a/test/gtest/test_fusion_plan_base.hpp +++ b/test/gtest/test_operations.hpp @@ -25,7 +25,37 @@ *******************************************************************************/ #pragma once -#include "bn_test_base.hpp" +namespace test{ +template +void ComputeCPUBNInference(DLModule& dl_module) +{ + batchNormSpatialHostInference(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); +} + +template +void BnCmpare(const tensor& output, const tensor& ref_out) +{ + EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; + EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; + EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) + << "Non finite number found in the GPU data"; + EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); + const double tolerance = 80; + double threshold = std::numeric_limits::epsilon() * tolerance; + auto error = miopen::rms_range(ref_out, output); + EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; +} +} + namespace test { namespace FusionPlan { template @@ -76,48 +106,5 @@ bool Skip(miopen::Handle& handle) return skip_test; } - -template -void ComputeRefBN(DLModule& dl_module) -{ - if(dl_module.bn_mode == miopenBNPerActivation) - { - batchNormPerActivHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); - } - else - { - batchNormSpatialHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); - } -} - -template -void BnCmpare(const tensor& output, const tensor& ref_out) -{ - EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; - EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; - EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) - << "Non finite number found in the GPU data"; - EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - const double tolerance = 80; - double threshold = std::numeric_limits::epsilon() * tolerance; - auto error = miopen::rms_range(ref_out, output); - EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) - << "Non finite number found in the CPU data"; - EXPECT_TRUE(error < threshold) - << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; -} - } // namespace FusionPlan } // namespace test From 4002995786f35784a6c57410246122c5f9e5781a Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sun, 3 Sep 2023 01:08:30 +0000 Subject: [PATCH 07/28] bg/LWPMIOPEN-194 : add all data types supported by CK --- src/solver/batchnorm/forward_inference_ck.cpp | 63 ++++++++++++------- test/fusionHost.hpp | 16 ++--- test/gtest/bn.hpp | 22 +++++-- test/gtest/bn_infer.cpp | 35 ++++++++++- test/gtest/bn_test_data.hpp | 57 +++++++++-------- test/gtest/test_operations.hpp | 4 +- 6 files changed, 131 insertions(+), 66 deletions(-) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 1493f46d51..13ee6e2fd5 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -47,12 +47,9 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; constexpr index_t Rank = 4; constexpr index_t NumBatchNormReduceDim = 3; -template -using DeviceOp = ck::tensor_operation::device::DeviceElementwise< - ck::Tuple, - ck::Tuple, - Normalize, - Rank>; +using F16 = ck::half_t; +using F32 = float; +using F64 = double; struct CKArgsBNormFwd { @@ -81,17 +78,26 @@ struct CKArgsBNormFwd std::array aligned_scaleBiasMeanVarStrides{3}; - double epsilon = 0.0001; std::array reduceDims{0, 1, 2}; }; -template +template int CheckCKApplicability( const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); assert(!bn_fwd_ptrs.empty()); int count = 0; for(const auto& it : bn_fwd_ptrs) @@ -114,17 +120,28 @@ int CheckCKApplicability( return -1; } -template +template static void RunCKSolution(const Handle& handle, const AnyInvokeParams& primitive_parameters, const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, + ck::Tuple, + Normalize, + Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); - int kernel_index = CheckCKApplicability(problem); + int kernel_index = CheckCKApplicability(problem); assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size()); auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); const auto& params = primitive_parameters.CastTo(); @@ -174,13 +191,13 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, switch(bn_problem.GetXDesc().GetType()) { - case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); - case miopenInt8: - case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); + case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); + case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); + case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); + case miopenBFloat16: case miopenInt32: + case miopenInt8: case miopenInt8x4: - case miopenBFloat16: - case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); } return false; @@ -205,16 +222,18 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem { case miopenHalf: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution(handle, primitive_parameters, bn_problem); break; - case miopenInt8: case miopenFloat: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution(handle, primitive_parameters, bn_problem); break; + case miopenDouble: + RunCKSolution(handle, primitive_parameters, bn_problem); + break; + case miopenBFloat16: + case miopenInt8: case miopenInt32: case miopenInt8x4: - case miopenBFloat16: - case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); } }; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 5d2b08f1bb..be27234c08 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -135,22 +135,22 @@ void convHostForward(const tensor& input, } } -template +template void batchNormSpatialHostInference(const tensor& input, tensor& output, const tensor& scale, const tensor& bias, double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance) + const tensor& estimatedMean, + const tensor& estimatedVariance) { int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); par_for(channels, 1, [&](int cidx) { // via channel - double mean = estimatedMean(0, cidx, 0, 0); - double variance = estimatedVariance(0, cidx, 0, 0); - double invertVar = 1.0 / sqrt(variance + epsilon); + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + V invertVar = 1.0 / sqrt(variance + epsilon); // process the batch per channel for(int row = 0; row < height; row++) { // via rows @@ -158,8 +158,8 @@ void batchNormSpatialHostInference(const tensor& input, { // via columns for(int bidx = 0; bidx < n_batches; bidx++) { // via mini_batch - double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - double inhat = elemStd * invertVar; + V elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + V inhat = elemStd * invertVar; output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index fd9fcc8f85..86adf49f7b 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -31,7 +31,8 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" -template +template struct BNInferTest : public ::testing::TestWithParam> { protected: @@ -57,6 +58,8 @@ struct BNInferTest : public ::testing::TestWithParam::quiet_NaN()); } void TearDown() override @@ -65,12 +68,23 @@ struct BNInferTest : public ::testing::TestWithParam(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); + handle.Read(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); test::ComputeCPUBNInference(bn_infer_test_data); - test::BnCmpare(bn_infer_test_data.output, bn_infer_test_data.ref_out); + + if constexpr(std::is_same_v) + { + // tolerance for CK solver + test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); + } + else{ + test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out); + } + } + BNTestCase bn_config; bool test_skipped = false; - BNInferTestData bn_infer_test_data; + BNInferTestData bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 6df3d547a5..8785724884 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -26,17 +26,48 @@ #include "bn.hpp" -struct BNInferTestFloat : BNInferTest +struct BNInferTestHalf : BNInferTest { }; -TEST_P(BNInferTestFloat, BnFwdInferCK) +struct BNInferTestFloat : BNInferTest +{ +}; + +struct BNInferTestDouble : BNInferTest +{ +}; + + +TEST_P(BNInferTestHalf, BnFwdInferCKHalf) +{ + +} + +TEST_P(BNInferTestFloat, BnFwdInferCKFloat) +{ + +} + +TEST_P(BNInferTestDouble, BnFwdInferCKDouble) { } +INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, + BNInferTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, BNInferTestFloat, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNInferTestDoubleNHWCSuite, + BNInferTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 689cb1d058..9ddb30bfeb 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -60,6 +60,7 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, @@ -90,7 +91,7 @@ std::vector Network1() {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } -template +template struct BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) @@ -105,9 +106,9 @@ struct BNTestData } const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; } - tensor input; - tensor output; - tensor ref_out; + tensor input; + tensor output; + tensor ref_out; miopen::Allocator::ManageDataPtr in_dev; miopen::Allocator::ManageDataPtr out_dev; @@ -122,8 +123,8 @@ struct BNTestData private: void CreateTensors() { - input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; - output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; + input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; + output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; ref_out = output; } @@ -133,7 +134,7 @@ struct BNTestData std::mt19937 gen{rd()}; std::uniform_int_distribution<> d{0, 100}; auto gen_value = [&](auto...) { - return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); }; input.generate(gen_value); } @@ -148,21 +149,22 @@ struct BNTestData } }; -template -struct BNInferTestData : public BNTestData +template +struct BNInferTestData : public BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl(config, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); } - tensor scale; - tensor shift; - tensor estMean; - tensor estVariance; + tensor scale; + tensor shift; + tensor estMean; + tensor estVariance; miopen::Allocator::ManageDataPtr scale_dev; miopen::Allocator::ManageDataPtr shift_dev; // bias miopen::Allocator::ManageDataPtr estMean_dev; @@ -179,19 +181,19 @@ struct BNInferTestData : public BNTestData { auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - scale = tensor{miopen_type{}, - BNTestData::tensor_layout, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; - shift = tensor{miopen_type{}, - BNTestData::tensor_layout, + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; - estMean = tensor{miopen_type{}, - BNTestData::tensor_layout, + estMean = tensor{miopen_type{}, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; - estVariance = tensor{miopen_type{}, - BNTestData::tensor_layout, + estVariance = tensor{miopen_type{}, + BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; } @@ -201,12 +203,13 @@ struct BNInferTestData : public BNTestData std::mt19937 gen{rd()}; std::uniform_int_distribution<> d{0, 100}; auto gen_value = [&](auto...) { - return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); }; scale.generate(gen_value); shift.generate(gen_value); - estMean.generate(gen_value); - auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + estMean.generate(gen_var); estVariance.generate(gen_var); } void WriteToGPU() diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 398aaf406a..3ac2d40d5a 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -39,15 +39,13 @@ void ComputeCPUBNInference(DLModule& dl_module) } template -void BnCmpare(const tensor& output, const tensor& ref_out) +void CompareTensor(const tensor& output, const tensor& ref_out, const T threshold = std::numeric_limits::epsilon()) { EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) << "Non finite number found in the GPU data"; EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - const double tolerance = 80; - double threshold = std::numeric_limits::epsilon() * tolerance; auto error = miopen::rms_range(ref_out, output); EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) << "Non finite number found in the CPU data"; From bd7f9f6e2690e438882296d71a77178deedbeb94 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 4 Sep 2023 17:28:19 +0000 Subject: [PATCH 08/28] bg/LWPMIOPEN-194 : minor fixes --- test/gtest/bn.hpp | 2 +- test/gtest/bn_infer.cpp | 16 ---------------- test/gtest/bn_test_data.hpp | 2 +- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 86adf49f7b..e5ff25517b 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -73,7 +73,7 @@ struct BNInferTest : public ::testing::TestWithParam) { - // tolerance for CK solver + // tolerance for CK solver tolerance for test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); } else{ diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 8785724884..8d49d3c99e 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -36,12 +36,6 @@ struct BNInferTestFloat : BNInferTest -{ -}; - - TEST_P(BNInferTestHalf, BnFwdInferCKHalf) { @@ -52,11 +46,6 @@ TEST_P(BNInferTestFloat, BnFwdInferCKFloat) } -TEST_P(BNInferTestDouble, BnFwdInferCKDouble) -{ - -} - INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, BNInferTestHalf, testing::Combine(testing::ValuesIn(Network1()), @@ -66,8 +55,3 @@ INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, BNInferTestFloat, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); - -INSTANTIATE_TEST_SUITE_P(BNInferTestDoubleNHWCSuite, - BNInferTestDouble, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 9ddb30bfeb..4692db5443 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -207,9 +207,9 @@ struct BNInferTestData : public BNTestData }; scale.generate(gen_value); shift.generate(gen_value); + estMean.generate(gen_value); auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; - estMean.generate(gen_var); estVariance.generate(gen_var); } void WriteToGPU() From 291d7522c87559b0896ee27c626f9520f0dac627 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 5 Sep 2023 13:58:43 +0000 Subject: [PATCH 09/28] bg/LWPMIOPEN-194: fix clang format --- src/include/miopen/batchnorm/solvers.hpp | 1 - src/ocl/batchnormocl.cpp | 2 +- src/solver/batchnorm/forward_inference_ck.cpp | 80 ++++++++++--------- test/gtest/bn.hpp | 59 +++++++------- test/gtest/bn_infer.cpp | 17 ++-- test/gtest/bn_test_data.hpp | 39 +++++---- test/gtest/test_operations.hpp | 22 ++--- 7 files changed, 114 insertions(+), 106 deletions(-) diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 6cf494ce37..4a2275294f 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -145,7 +145,6 @@ struct BnCKFwdInference final : BatchnormSolver // ----------- end BnCKFwdInference --------------- - } // namespace batchnorm } // namespace solver diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 1025509da4..6c8a079a2a 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -223,7 +223,7 @@ void BatchNormForwardInference(Handle& handle, const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnCKFwdInference>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index e914b2bbf3..88968c2572 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -25,7 +25,6 @@ * *******************************************************************************/ - #include #include #include @@ -47,9 +46,9 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; constexpr index_t Rank = 4; constexpr index_t NumBatchNormReduceDim = 3; -using F16 = ck::half_t; -using F32 = float; -using F64 = double; +using F16 = ck::half_t; +using F32 = float; +using F64 = double; struct CKArgsBNormFwd { @@ -87,17 +86,16 @@ template -int CheckCKApplicability( - const miopen::batchnorm::ProblemDescription& problem) +int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { - const auto& args = CKArgsBNormFwd{problem}; - using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + const auto& args = CKArgsBNormFwd{problem}; + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< ck::Tuple, ck::Tuple, Normalize, Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); assert(!bn_fwd_ptrs.empty()); int count = 0; for(const auto& it : bn_fwd_ptrs) @@ -112,7 +110,8 @@ int CheckCKApplicability( {nullptr, nullptr, nullptr, nullptr, nullptr}, {nullptr}, Normalize{0.0}); - if(it->IsSupportedArgument(argument_ptr.get())){ + if(it->IsSupportedArgument(argument_ptr.get())) + { return count; } count++; @@ -127,8 +126,8 @@ template static void RunCKSolution(const Handle& handle, - const AnyInvokeParams& primitive_parameters, - const miopen::batchnorm::ProblemDescription& problem) + const AnyInvokeParams& primitive_parameters, + const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; @@ -138,28 +137,29 @@ static void RunCKSolution(const Handle& handle, Normalize, Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - int kernel_index = CheckCKApplicability(problem); + DeviceOp>::GetInstances(); + + int kernel_index = CheckCKApplicability(problem); assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size()); - auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); + auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); const auto& params = primitive_parameters.CastTo(); - auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, - {args.xyStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides}, - {args.xyStrides}, - {params.x, - params.estimatedMean, - params.estimatedVariance, - params.bnScale, - params.bnBias}, - {params.y}, - Normalize{params.epsilon}); + auto argument_ptr = bn_ptr->MakeArgumentPointer( + args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {params.x, params.estimatedMean, params.estimatedVariance, params.bnScale, params.bnBias}, + {params.y}, + Normalize{params.epsilon}); auto invoker_ptr = bn_ptr->MakeInvokerPointer(); const auto enable_profiling = handle.IsProfilingEnabled(); @@ -193,7 +193,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, { case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); - case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); + case miopenDouble: + return (CheckCKApplicability(bn_problem) != -1); case miopenBFloat16: case miopenInt32: case miopenInt8: @@ -204,8 +205,9 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, #endif } -ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& bn_problem) const +ConvSolution +BnCKFwdInference::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = context; @@ -218,17 +220,19 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem { case miopenHalf: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenFloat: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenDouble: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenBFloat16: case miopenInt8: diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index e5ff25517b..637c007e5d 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -31,8 +31,11 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" -template +template struct BNInferTest : public ::testing::TestWithParam> { protected: @@ -43,48 +46,50 @@ struct BNInferTest : public ::testing::TestWithParam::quiet_NaN()); + std::fill(bn_infer_test_data.output.begin(), + bn_infer_test_data.output.end(), + std::numeric_limits::quiet_NaN()); } void TearDown() override { if(test_skipped) return; - auto&& handle = get_handle(); - bn_infer_test_data.output.data = - handle.Read(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); + auto&& handle = get_handle(); + bn_infer_test_data.output.data = handle.Read( + bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); test::ComputeCPUBNInference(bn_infer_test_data); - + if constexpr(std::is_same_v) { // tolerance for CK solver tolerance for - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); + test::CompareTensor( + bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); } - else{ + else + { test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out); } - } BNTestCase bn_config; bool test_skipped = false; - BNInferTestData bn_infer_test_data; + BNInferTestData + bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 8d49d3c99e..e3ff28e776 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -26,25 +26,18 @@ #include "bn.hpp" -struct BNInferTestHalf : BNInferTest +struct BNInferTestHalf + : BNInferTest { }; -struct BNInferTestFloat : BNInferTest +struct BNInferTestFloat : BNInferTest { }; -TEST_P(BNInferTestHalf, BnFwdInferCKHalf) -{ - -} +TEST_P(BNInferTestHalf, BnFwdInferCKHalf) {} -TEST_P(BNInferTestFloat, BnFwdInferCKFloat) -{ - -} +TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {} INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, BNInferTestHalf, diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 4692db5443..a7aff78da3 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -149,8 +149,12 @@ struct BNTestData } }; -template +template struct BNInferTestData : public BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) @@ -170,8 +174,8 @@ struct BNInferTestData : public BNTestData miopen::Allocator::ManageDataPtr estMean_dev; miopen::Allocator::ManageDataPtr estVariance_dev; double epsilon = 1.0e-5; - float alpha = static_cast(1.0f); - float beta = static_cast(0); + float alpha = static_cast(1.0f); + float beta = static_cast(0); const float activ_alpha = static_cast(0.5f); const float activ_beta = static_cast(0.5f); const float activ_gamma = static_cast(0.5f); @@ -183,18 +187,19 @@ struct BNInferTestData : public BNTestData miopen::DeriveBNTensorDescriptor(derivedBnDesc, BNTestData::input.desc, BNTestData::bn_mode); - scale = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estMean = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estVariance = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() @@ -208,7 +213,7 @@ struct BNInferTestData : public BNTestData scale.generate(gen_value); shift.generate(gen_value); estMean.generate(gen_value); - + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; estVariance.generate(gen_var); } diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 3ac2d40d5a..d4e524e472 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -25,34 +25,36 @@ *******************************************************************************/ #pragma once -namespace test{ +namespace test { template void ComputeCPUBNInference(DLModule& dl_module) { batchNormSpatialHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); } template -void CompareTensor(const tensor& output, const tensor& ref_out, const T threshold = std::numeric_limits::epsilon()) +void CompareTensor(const tensor& output, + const tensor& ref_out, + const T threshold = std::numeric_limits::epsilon()) { EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) << "Non finite number found in the GPU data"; EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - auto error = miopen::rms_range(ref_out, output); + auto error = miopen::rms_range(ref_out, output); EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) << "Non finite number found in the CPU data"; EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } -} +} // namespace test namespace test { namespace FusionPlan { From f09c16ea19475b0dca85a1506fe648dce988f852 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 7 Sep 2023 00:14:07 +0000 Subject: [PATCH 10/28] bg/LWPMIOPEN-193_bn_back : first commit test working --- src/CMakeLists.txt | 1 + src/include/miopen/batchnorm/solvers.hpp | 12 + src/ocl/batchnormocl.cpp | 4 +- src/solver/batchnorm/backward_ck.cpp | 277 ++++++++++++++++++ src/solver/batchnorm/forward_inference_ck.cpp | 1 + test/gtest/bn.hpp | 79 ++++- test/gtest/bn_bwd.cpp | 65 ++++ test/gtest/bn_test_data.hpp | 175 +++++++++-- test/gtest/test_operations.hpp | 14 + 9 files changed, 594 insertions(+), 34 deletions(-) create mode 100644 src/solver/batchnorm/backward_ck.cpp create mode 100644 test/gtest/bn_bwd.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dc86c4136c..677a23b4fb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -146,6 +146,7 @@ set( MIOpen_Source solver/activ/bwd_1.cpp solver/activ/fwd_0.cpp solver/activ/fwd_1.cpp + solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp solver/batchnorm/backward_spatial_multiple.cpp diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 6cf494ce37..3cc42e36cf 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -145,6 +145,18 @@ struct BnCKFwdInference final : BatchnormSolver // ----------- end BnCKFwdInference --------------- +// ----------- start BnCKBwdBackward --------------- +struct BnCKBwdBackward final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + +// ----------- end BnCKBwdBackward --------------- } // namespace batchnorm diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 1025509da4..10d2bb8372 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -338,7 +338,6 @@ void BatchNormBackward(Handle& handle, tmp.dx = dx; tmp.bnScale = bnScale; tmp.resultBnScaleDiff = resultBnScaleDiff; - tmp.resultBnScaleDiff = resultBnScaleDiff; tmp.resultBnBiasDiff = resultBnBiasDiff; tmp.epsilon = epsilon; tmp.savedMean = savedMean; @@ -346,7 +345,8 @@ void BatchNormBackward(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp new file mode 100644 index 0000000000..8300a3de62 --- /dev/null +++ b/src/solver/batchnorm/backward_ck.cpp @@ -0,0 +1,277 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = bfloat16; + +struct CKArgsBNormFwd +{ + CKArgsBNormFwd(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + xyLengths.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + xyStrides.begin()); + arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + } + + std::array xyLengths; // inOutLengths + std::array xyStrides; // inOutStrides + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + double epsilon = 0.0001; + std::array reduceDims{0, 1, 2}; +}; + +template +int CheckCKApplicability( + const miopen::batchnorm::ProblemDescription& problem) +{ + const auto& args = CKArgsBNormFwd{problem}; + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + assert(!bn_bwd_ptrs.empty()); + int count = 0; + for(const auto& it : bn_bwd_ptrs) + { + auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, + args.xyStrides, + args.xyStrides, + args.xyStrides, + args.reduceDims, + args.arrScaleBiasMeanVarLengths, + args.arrScaleBiasMeanVarStrides, + args.arrScaleBiasMeanVarStrides, + args.arrScaleBiasMeanVarStrides, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + args.epsilon, + PassThrough{}, + nullptr, + nullptr, + nullptr); + if(it->IsSupportedArgument(argument_ptr.get())){ + return count; + } + count++; + } + return -1; +} + +template +static void RunCKSolution(const Handle& handle, + const AnyInvokeParams& primitive_parameters, + const miopen::batchnorm::ProblemDescription& problem) +{ + const auto& args = CKArgsBNormFwd{problem}; + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + int kernel_index = CheckCKApplicability(problem); + assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size()); + auto& bn_ptr = bn_bwd_ptrs.at(kernel_index); + const auto& params = primitive_parameters.CastTo(); + auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, + args.xyStrides, + args.xyStrides, + args.xyStrides, + args.reduceDims, + args.arrScaleBiasMeanVarLengths, + args.arrScaleBiasMeanVarStrides, + args.arrScaleBiasMeanVarStrides, + args.arrScaleBiasMeanVarStrides, + params.x, + params.dy, + params.bnScale, + params.savedMean, + params.savedInvVariance, + args.epsilon, + PassThrough{}, + params.dx, + params.resultBnScaleDiff, + params.resultBnBiasDiff); + auto invoker_ptr = bn_ptr->MakeInvokerPointer(); + const auto enable_profiling = handle.IsProfilingEnabled(); + + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } +} +#endif + +bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_BACK{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); + case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); + case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); + case miopenBFloat16: return (CheckCKApplicability(bn_problem) != -1); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + default: MIOPEN_THROW("Unsupported datatype"); + } + return false; +#endif +} + +ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = context; + std::ignore = bn_problem; + return {}; +#else + std::ignore = context; + + ConvSolution result; + result.invoker_factory = [=](const std::vector& kernels) { + std::ignore = kernels; + return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { + + switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem + { + case miopenFloat: + RunCKSolution(handle, primitive_parameters, bn_problem); + break; + case miopenDouble: + RunCKSolution(handle, primitive_parameters, bn_problem); + break; + case miopenHalf: + RunCKSolution(handle, primitive_parameters, bn_problem); + break; + case miopenBFloat16: + RunCKSolution(handle, primitive_parameters, bn_problem); + break; + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + default: MIOPEN_THROW("Unsupported datatype"); + } + }; + }; + return result; +#endif +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index e914b2bbf3..70f3854caf 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -50,6 +50,7 @@ constexpr index_t NumBatchNormReduceDim = 3; using F16 = ck::half_t; using F32 = float; using F64 = double; +using BF16 = ck::bhalf_t; struct CKArgsBNormFwd { diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index e5ff25517b..907241a3e5 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -26,8 +26,8 @@ #pragma once #include +#include #include - #include "bn_test_data.hpp" #include "test_operations.hpp" @@ -88,3 +88,80 @@ struct BNInferTest : public ::testing::TestWithParam bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; + + +template +struct BNBwdTest : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationBackward( + &handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, // const xDesc + bn_bwd_test_data.in_dev.get(), // const x + &bn_bwd_test_data.dy.desc, // const dyDesc + bn_bwd_test_data.dy_dev.get(), // const dy + + &bn_bwd_test_data.output.desc, // const dxDesc + bn_bwd_test_data.out_dev.get(), // dx (not -const) + + &bn_bwd_test_data.bnScale.desc, // const bnScale + bn_bwd_test_data.bnScale_dev.get(), // const bnScale + + bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const) + bn_bwd_test_data.dBias_dev.get(), // resultBnBiasDiff (not const) + + bn_bwd_test_data.epsilon, + + bn_bwd_test_data.savedMean_dev.get(), // const savedMean + bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance + + std::fill(bn_bwd_test_data.output.begin(), bn_bwd_test_data.output.end(), std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_bwd_test_data.output.data = + handle.Read(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size()); + bn_bwd_test_data.dScale.data = + handle.Read(bn_bwd_test_data.dScale_dev, bn_bwd_test_data.dScale.data.size()); + bn_bwd_test_data.dBias.data = + handle.Read(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); + test::ComputeCPUBNBwd(bn_bwd_test_data); + // using tolerance = 1e-4 since this the tolerance CK uses + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); + test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4); + test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 1e-4); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNBwdTestData bn_bwd_test_data; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp new file mode 100644 index 0000000000..e004b16734 --- /dev/null +++ b/test/gtest/bn_bwd.cpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "bn.hpp" + + +// struct BNBwdTestBFloat16 : BNBwdTest +// { +// }; + +struct BNBwdTestFloat : BNBwdTest +{ +}; + +// struct BNBwdTestDouble : BNBwdTest +// { +// }; + +TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat) +{ + +} + +// TEST_P(BNBwdTestBFloat16, BnFwdBwdCKBFloat16) +// { + +// } + +INSTANTIATE_TEST_SUITE_P(BNBwdTestFloatNHWCSuite, + BNBwdTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +// INSTANTIATE_TEST_SUITE_P(BNBwdTestBFloat16NHWCSuite, +// BNBwdTestBFloat16, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); + +// INSTANTIATE_TEST_SUITE_P(BNBwdTestDoubleNHWCSuite, +// BNBwdTestDouble, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 4692db5443..3e9cdf6a3d 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -60,35 +60,34 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } template @@ -125,7 +124,7 @@ struct BNTestData { input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; - ref_out = output; + ref_out = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; } void InitTensorsWithRandValue() @@ -136,7 +135,7 @@ struct BNTestData auto gen_value = [&](auto...) { return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); }; - input.generate(gen_value); + input.generate(gen_value); } void SetDirection() { direction = bn_config.Direction; } @@ -221,3 +220,117 @@ struct BNInferTestData : public BNTestData estVariance_dev = handle.Write(estVariance.data); } }; + + + +template +struct BNBwdTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor bnScale; + + tensor savedMean; + tensor savedInvVar; + + + tensor dy; + tensor dScale; + tensor dBias; + tensor dScale_ref; + tensor dBias_ref; + + miopen::Allocator::ManageDataPtr bnScale_dev; + miopen::Allocator::ManageDataPtr savedMean_dev; + miopen::Allocator::ManageDataPtr savedInvVar_dev; + + miopen::Allocator::ManageDataPtr dy_dev; + miopen::Allocator::ManageDataPtr dScale_dev; + miopen::Allocator::ManageDataPtr dBias_dev; + miopen::Allocator::ManageDataPtr dScale_ref_dev; + miopen::Allocator::ManageDataPtr dBias_ref_dev; + double epsilon = std::numeric_limits::epsilon(); + + float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); + float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + +private: + void CreateTensors() + { + dy = tensor{miopen_type{}, + BNTestData::tensor_layout, + BNTestData::bn_config.GetInput()}; + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + bnScale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedInvVar = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale_ref = dScale; + dBias_ref = dBias; + } + + void InitTensorsWithRandValue() + { + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_int_distribution<> d{0, 100}; + auto gen_value = [&](auto...) { + return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + }; + dy.generate(gen_value); + bnScale.generate(gen_value); + savedMean.generate(gen_value); + + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + savedInvVar.generate(gen_var); + + std::fill(dScale.begin(), dScale.end(), 0.); + std::fill(dBias.begin(), dBias.end(), 0.); + + std::fill(dScale_ref.begin(), dScale_ref.end(), 0.); + std::fill(dBias_ref.begin(), dBias_ref.end(), 0.); + } + void WriteToGPU() + { + auto&& handle = get_handle(); + + bnScale_dev = handle.Write(bnScale.data); + savedMean_dev = handle.Write(savedMean.data); + savedInvVar_dev = handle.Write(savedInvVar.data); + dy_dev = handle.Write(dy.data); + + dScale_dev = handle.Write(dScale.data); + dBias_dev = handle.Write(dBias.data); + + dScale_ref_dev = handle.Write(dScale.data); + dBias_ref_dev = handle.Write(dBias.data); + } +}; diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 3ac2d40d5a..c0a71d8c02 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -38,6 +38,20 @@ void ComputeCPUBNInference(DLModule& dl_module) dl_module.estVariance); } +template +void ComputeCPUBNBwd(DLModule& dl_module) +{ + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.ref_out, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); +} + + template void CompareTensor(const tensor& output, const tensor& ref_out, const T threshold = std::numeric_limits::epsilon()) { From e7667c3a49e6eda0593fa32ff0acb9935d58a77d Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 8 Sep 2023 14:50:26 +0000 Subject: [PATCH 11/28] bg/LWPMIOPEN-194 : add test for all types --- src/include/miopen/batchnorm/solvers.hpp | 3 --- src/solver/batchnorm/forward_inference_ck.cpp | 13 +++++++---- test/fusionHost.hpp | 10 ++++---- test/gtest/bn.hpp | 13 ++--------- test/gtest/bn_infer.cpp | 23 +++++++++++++++++++ test/gtest/test_operations.hpp | 2 +- 6 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 4a2275294f..c7d050abeb 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -132,7 +132,6 @@ struct BnFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; -// ----------- start BnCKFwdInference --------------- struct BnCKFwdInference final : BatchnormSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } @@ -143,8 +142,6 @@ struct BnCKFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; -// ----------- end BnCKFwdInference --------------- - } // namespace batchnorm } // namespace solver diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 88968c2572..ac6b542825 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -46,9 +46,10 @@ using Normalize = ck::tensor_operation::element_wise::NormalizeInInfer; constexpr index_t Rank = 4; constexpr index_t NumBatchNormReduceDim = 3; -using F16 = ck::half_t; -using F32 = float; -using F64 = double; +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; struct CKArgsBNormFwd { @@ -196,6 +197,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); case miopenBFloat16: + return (CheckCKApplicability(bn_problem) != -1); case miopenInt32: case miopenInt8: case miopenInt8x4: @@ -220,7 +222,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem + switch(bn_problem.GetXDesc().GetType()) { case miopenHalf: RunCKSolution( @@ -235,6 +237,9 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, handle, primitive_parameters, bn_problem); break; case miopenBFloat16: + RunCKSolution( + handle, primitive_parameters, bn_problem); + break; case miopenInt8: case miopenInt32: case miopenInt8x4: diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index be27234c08..cffefea0e2 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -148,9 +148,9 @@ void batchNormSpatialHostInference(const tensor& input, int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); par_for(channels, 1, [&](int cidx) { // via channel - V mean = estimatedMean(0, cidx, 0, 0); - V variance = estimatedVariance(0, cidx, 0, 0); - V invertVar = 1.0 / sqrt(variance + epsilon); + V mean = estimatedMean(0, cidx, 0, 0); + V variance = estimatedVariance(0, cidx, 0, 0); + double invertVar = 1.0 / sqrt(variance + epsilon); // process the batch per channel for(int row = 0; row < height; row++) { // via rows @@ -158,8 +158,8 @@ void batchNormSpatialHostInference(const tensor& input, { // via columns for(int bidx = 0; bidx < n_batches; bidx++) { // via mini_batch - V elemStd = static_cast(input(bidx, cidx, row, column)) - mean; - V inhat = elemStd * invertVar; + double elemStd = static_cast(input(bidx, cidx, row, column)) - mean; + double inhat = elemStd * invertVar; output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 637c007e5d..0b763da411 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -74,17 +74,8 @@ struct BNInferTest : public ::testing::TestWithParam( bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); test::ComputeCPUBNInference(bn_infer_test_data); - - if constexpr(std::is_same_v) - { - // tolerance for CK solver tolerance for - test::CompareTensor( - bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); - } - else - { - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out); - } + // 4e-3 is tolerance used by CK kernel. + test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 4e-3); } BNTestCase bn_config; diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index e3ff28e776..6598ef7169 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -35,10 +35,23 @@ struct BNInferTestFloat : BNInferTest { }; +struct BNInferTestDouble : BNInferTest +{ +}; + +struct BNInferTestBFloat16 : BNInferTest +{ +}; + TEST_P(BNInferTestHalf, BnFwdInferCKHalf) {} TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {} +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNInferTestDouble, DISABLED_BnFwdInferCKDouble) {} +TEST_P(BNInferTestBFloat16, DISABLED_BnFwdInferCKBFloat16) {} + INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, BNInferTestHalf, testing::Combine(testing::ValuesIn(Network1()), @@ -48,3 +61,13 @@ INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, BNInferTestFloat, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, + BNInferTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, + BNInferTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index d4e524e472..d1528fe2bb 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -41,7 +41,7 @@ void ComputeCPUBNInference(DLModule& dl_module) template void CompareTensor(const tensor& output, const tensor& ref_out, - const T threshold = std::numeric_limits::epsilon()) + const double threshold = std::numeric_limits::epsilon()) { EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; From ce4beef136b72a70956cbf299e6fc6905c107ef7 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 8 Sep 2023 18:03:13 +0000 Subject: [PATCH 12/28] bg/LWPMIOPEN-193_bn_back : add test for all types --- src/batch_norm_api.cpp | 7 - src/ocl/batchnormocl.cpp | 6 +- src/solver/batchnorm/backward_ck.cpp | 62 ++++--- src/solver/batchnorm/forward_inference_ck.cpp | 74 ++++---- test/gtest/bn.hpp | 137 +++++++------- test/gtest/bn_bwd.cpp | 23 ++- test/gtest/bn_infer.cpp | 32 ++-- test/gtest/bn_test_data.hpp | 174 +++++++++--------- test/gtest/test_operations.hpp | 23 +-- 9 files changed, 284 insertions(+), 254 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 22cf757627..6ca7366f9a 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -271,13 +271,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance) { - // bfloat16 not supported for batchnorm operation - if(miopen::deref(xDesc).GetType() == miopenBFloat16 || - miopen::deref(dyDesc).GetType() == miopenBFloat16 || - miopen::deref(dxDesc).GetType() == miopenBFloat16) - { - return miopenStatusNotImplemented; - } MIOPEN_LOG_FUNCTION(handle, bn_mode, diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 10d2bb8372..7a20038c55 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -223,7 +223,7 @@ void BatchNormForwardInference(Handle& handle, const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; const auto solvers = solver::SolverContainer{}; + solver::batchnorm::BnCKFwdInference>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } @@ -300,7 +300,7 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - if(dxDesc.GetType() != dyDesc.GetType() || dyDesc.GetType() != xDesc.GetType()) + if(dxDesc.GetType() != dyDesc.GetType()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -345,7 +345,7 @@ void BatchNormBackward(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 8300a3de62..492cb2a619 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -48,7 +48,7 @@ constexpr index_t NumBatchNormReduceDim = 3; using F16 = ck::half_t; using F32 = float; using F64 = double; -using BF16 = bfloat16; +using BF16 = ushort; struct CKArgsBNormFwd { @@ -63,9 +63,9 @@ struct CKArgsBNormFwd xyStrides.begin()); arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel arrScaleBiasMeanVarStrides[0] = 1; - + // prep for CK - std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); } @@ -87,11 +87,10 @@ template -int CheckCKApplicability( - const miopen::batchnorm::ProblemDescription& problem) +int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; - using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); assert(!bn_bwd_ptrs.empty()); int count = 0; for(const auto& it : bn_bwd_ptrs) @@ -126,7 +125,8 @@ int CheckCKApplicability( nullptr, nullptr, nullptr); - if(it->IsSupportedArgument(argument_ptr.get())){ + if(it->IsSupportedArgument(argument_ptr.get())) + { return count; } count++; @@ -142,12 +142,12 @@ template static void RunCKSolution(const Handle& handle, - const AnyInvokeParams& primitive_parameters, - const miopen::batchnorm::ProblemDescription& problem) + const AnyInvokeParams& primitive_parameters, + const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; - using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); int kernel_index = CheckCKApplicability(problem); assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size()); - auto& bn_ptr = bn_bwd_ptrs.at(kernel_index); - const auto& params = primitive_parameters.CastTo(); - auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, + auto& bn_ptr = bn_bwd_ptrs.at(kernel_index); + const auto& params = primitive_parameters.CastTo(); + auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, args.xyStrides, args.xyStrides, args.xyStrides, @@ -203,7 +203,7 @@ static void RunCKSolution(const Handle& handle, #endif bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, - const miopen::batchnorm::ProblemDescription& bn_problem) const + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = ctx; @@ -219,10 +219,14 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, switch(bn_problem.GetXDesc().GetType()) { - case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); - case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); - case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); - case miopenBFloat16: return (CheckCKApplicability(bn_problem) != -1); + case miopenFloat: + return (CheckCKApplicability(bn_problem) != -1); + case miopenDouble: + return (CheckCKApplicability(bn_problem) != -1); + case miopenHalf: + return (CheckCKApplicability(bn_problem) != -1); + case miopenBFloat16: + return (CheckCKApplicability(bn_problem) != -1); case miopenInt32: case miopenInt8: case miopenInt8x4: @@ -232,8 +236,9 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, #endif } -ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& bn_problem) const +ConvSolution +BnCKBwdBackward::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = context; @@ -246,20 +251,23 @@ ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem { case miopenFloat: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenDouble: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenHalf: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenBFloat16: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenInt8: case miopenInt32: diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 70f3854caf..ba6a3682c9 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -25,7 +25,6 @@ * *******************************************************************************/ - #include #include #include @@ -88,17 +87,16 @@ template -int CheckCKApplicability( - const miopen::batchnorm::ProblemDescription& problem) +int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { - const auto& args = CKArgsBNormFwd{problem}; - using DeviceOp = ck::tensor_operation::device::DeviceElementwise< + const auto& args = CKArgsBNormFwd{problem}; + using DeviceOp = ck::tensor_operation::device::DeviceElementwise< ck::Tuple, ck::Tuple, Normalize, Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); + DeviceOp>::GetInstances(); assert(!bn_fwd_ptrs.empty()); int count = 0; for(const auto& it : bn_fwd_ptrs) @@ -113,7 +111,8 @@ int CheckCKApplicability( {nullptr, nullptr, nullptr, nullptr, nullptr}, {nullptr}, Normalize{0.0}); - if(it->IsSupportedArgument(argument_ptr.get())){ + if(it->IsSupportedArgument(argument_ptr.get())) + { return count; } count++; @@ -128,8 +127,8 @@ template static void RunCKSolution(const Handle& handle, - const AnyInvokeParams& primitive_parameters, - const miopen::batchnorm::ProblemDescription& problem) + const AnyInvokeParams& primitive_parameters, + const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; @@ -139,28 +138,29 @@ static void RunCKSolution(const Handle& handle, Normalize, Rank>; const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - int kernel_index = CheckCKApplicability(problem); + DeviceOp>::GetInstances(); + + int kernel_index = CheckCKApplicability(problem); assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size()); - auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); + auto& bn_ptr = bn_fwd_ptrs.at(kernel_index); const auto& params = primitive_parameters.CastTo(); - auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, - {args.xyStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides, - args.aligned_scaleBiasMeanVarStrides}, - {args.xyStrides}, - {params.x, - params.estimatedMean, - params.estimatedVariance, - params.bnScale, - params.bnBias}, - {params.y}, - Normalize{params.epsilon}); + auto argument_ptr = bn_ptr->MakeArgumentPointer( + args.xyLengths, + {args.xyStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides, + args.aligned_scaleBiasMeanVarStrides}, + {args.xyStrides}, + {params.x, params.estimatedMean, params.estimatedVariance, params.bnScale, params.bnBias}, + {params.y}, + Normalize{params.epsilon}); auto invoker_ptr = bn_ptr->MakeInvokerPointer(); const auto enable_profiling = handle.IsProfilingEnabled(); @@ -194,7 +194,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, { case miopenHalf: return (CheckCKApplicability(bn_problem) != -1); case miopenFloat: return (CheckCKApplicability(bn_problem) != -1); - case miopenDouble: return (CheckCKApplicability(bn_problem) != -1); + case miopenDouble: + return (CheckCKApplicability(bn_problem) != -1); case miopenBFloat16: case miopenInt32: case miopenInt8: @@ -205,8 +206,9 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, #endif } -ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& bn_problem) const +ConvSolution +BnCKFwdInference::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = context; @@ -219,17 +221,19 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context, result.invoker_factory = [=](const std::vector& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem { case miopenHalf: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenFloat: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenDouble: - RunCKSolution(handle, primitive_parameters, bn_problem); + RunCKSolution( + handle, primitive_parameters, bn_problem); break; case miopenBFloat16: case miopenInt8: diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 907241a3e5..77d5f62d95 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -31,8 +31,11 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" -template +template struct BNInferTest : public ::testing::TestWithParam> { protected: @@ -43,53 +46,54 @@ struct BNInferTest : public ::testing::TestWithParam::quiet_NaN()); + miopenBatchNormalizationForwardInference(&handle, + bn_config.mode, + &bn_infer_test_data.alpha, + &bn_infer_test_data.beta, + &bn_infer_test_data.input.desc, + bn_infer_test_data.in_dev.get(), + &bn_infer_test_data.output.desc, + bn_infer_test_data.out_dev.get(), + &bn_infer_test_data.scale.desc, + bn_infer_test_data.scale_dev.get(), + bn_infer_test_data.shift_dev.get(), + bn_infer_test_data.estMean_dev.get(), + bn_infer_test_data.estVariance_dev.get(), + bn_infer_test_data.epsilon); + + std::fill(bn_infer_test_data.output.begin(), + bn_infer_test_data.output.end(), + std::numeric_limits::quiet_NaN()); } void TearDown() override { if(test_skipped) return; - auto&& handle = get_handle(); - bn_infer_test_data.output.data = - handle.Read(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); - test::ComputeCPUBNInference(bn_infer_test_data); - - if constexpr(std::is_same_v) - { - // tolerance for CK solver tolerance for - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8); - } - else{ - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out); - } + auto&& handle = get_handle(); + bn_infer_test_data.output.data = handle.Read( + bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); + // test::ComputeCPUBNInference(bn_infer_test_data); + // if constexpr(std::is_same_v) + // { + // // tolerance for CK solver tolerance for + // test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, + // 1e-8); + // } + // else{ + // test::CompareTensor(bn_infer_test_data.output, + // bn_infer_test_data.ref_out); + // } } BNTestCase bn_config; bool test_skipped = false; - BNInferTestData bn_infer_test_data; + BNInferTestData + bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; - template ::quiet_NaN()); + &handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, // const xDesc + bn_bwd_test_data.in_dev.get(), // const x + &bn_bwd_test_data.dy.desc, // const dyDesc + bn_bwd_test_data.dy_dev.get(), // const dy + + &bn_bwd_test_data.output.desc, // const dxDesc + bn_bwd_test_data.out_dev.get(), // dx (not -const) + + &bn_bwd_test_data.bnScale.desc, // const bnScale + bn_bwd_test_data.bnScale_dev.get(), // const bnScale + + bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const) + bn_bwd_test_data.dBias_dev.get(), // resultBnBiasDiff (not const) + + bn_bwd_test_data.epsilon, + + bn_bwd_test_data.savedMean_dev.get(), // const savedMean + bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance + + std::fill(bn_bwd_test_data.output.begin(), + bn_bwd_test_data.output.end(), + std::numeric_limits::quiet_NaN()); } void TearDown() override @@ -143,10 +149,11 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size()); - bn_bwd_test_data.dScale.data = - handle.Read(bn_bwd_test_data.dScale_dev, bn_bwd_test_data.dScale.data.size()); + bn_bwd_test_data.dScale.data = handle.Read(bn_bwd_test_data.dScale_dev, + bn_bwd_test_data.dScale.data.size()); bn_bwd_test_data.dBias.data = handle.Read(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); + test::ComputeCPUBNBwd(bn_bwd_test_data); // using tolerance = 1e-4 since this the tolerance CK uses test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); @@ -162,6 +169,8 @@ struct BNBwdTest : public ::testing::TestWithParam bn_bwd_test_data; + MeanVarDataType, + BNTestCase> + bn_bwd_test_data; miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index e004b16734..244be887c0 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -26,6 +26,10 @@ #include "bn.hpp" +struct BNBwdTestTestHalf + : BNBwdTest +{ +}; // struct BNBwdTestBFloat16 : BNBwdTest // { @@ -39,16 +43,25 @@ struct BNBwdTestFloat : BNBwdTest -{ -}; +// struct BNInferTestHalf : BNInferTest +// { +// }; -struct BNInferTestFloat : BNInferTest +struct BNInferTestFloat : BNInferTest { }; -TEST_P(BNInferTestHalf, BnFwdInferCKHalf) -{ - -} +// TEST_P(BNInferTestHalf, BnFwdInferCKHalf) +// { -TEST_P(BNInferTestFloat, BnFwdInferCKFloat) -{ - -} +// } -INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, - BNInferTestHalf, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {} + +// INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite, +// BNInferTestHalf, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite, BNInferTestFloat, diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 3e9cdf6a3d..394d5c251a 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -60,34 +60,34 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; + {1, 1, 4, 4, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } template @@ -135,7 +135,7 @@ struct BNTestData auto gen_value = [&](auto...) { return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); }; - input.generate(gen_value); + input.generate(gen_value); } void SetDirection() { direction = bn_config.Direction; } @@ -148,8 +148,12 @@ struct BNTestData } }; -template +template struct BNInferTestData : public BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) @@ -169,8 +173,8 @@ struct BNInferTestData : public BNTestData miopen::Allocator::ManageDataPtr estMean_dev; miopen::Allocator::ManageDataPtr estVariance_dev; double epsilon = 1.0e-5; - float alpha = static_cast(1.0f); - float beta = static_cast(0); + float alpha = static_cast(1.0f); + float beta = static_cast(0); const float activ_alpha = static_cast(0.5f); const float activ_beta = static_cast(0.5f); const float activ_gamma = static_cast(0.5f); @@ -182,18 +186,19 @@ struct BNInferTestData : public BNTestData miopen::DeriveBNTensorDescriptor(derivedBnDesc, BNTestData::input.desc, BNTestData::bn_mode); - scale = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estMean = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estVariance = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() @@ -207,7 +212,7 @@ struct BNInferTestData : public BNTestData scale.generate(gen_value); shift.generate(gen_value); estMean.generate(gen_value); - + auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; estVariance.generate(gen_var); } @@ -221,8 +226,6 @@ struct BNInferTestData : public BNTestData } }; - - template tensor bnScale; - tensor savedMean; - tensor savedInvVar; + tensor savedMean; + tensor savedInvVar; - - tensor dy; - tensor dScale; - tensor dBias; - tensor dScale_ref; - tensor dBias_ref; + tensor dy; + tensor dScale; + tensor dBias; + tensor dScale_ref; + tensor dBias_ref; miopen::Allocator::ManageDataPtr bnScale_dev; miopen::Allocator::ManageDataPtr savedMean_dev; @@ -262,7 +264,7 @@ struct BNBwdTestData : public BNTestData miopen::Allocator::ManageDataPtr dBias_dev; miopen::Allocator::ManageDataPtr dScale_ref_dev; miopen::Allocator::ManageDataPtr dBias_ref_dev; - double epsilon = std::numeric_limits::epsilon(); + double epsilon = std::numeric_limits::epsilon(); float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); @@ -270,29 +272,33 @@ struct BNBwdTestData : public BNTestData private: void CreateTensors() { - dy = tensor{miopen_type{}, - BNTestData::tensor_layout, + dy = tensor{miopen_type{}, + BNTestData::tensor_layout, BNTestData::bn_config.GetInput()}; auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, BNTestData::input.desc, BNTestData::bn_mode); - bnScale = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - savedMean = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - savedInvVar = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - dScale = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - dBias = tensor{miopen_type{}, - BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + bnScale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedMean = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedInvVar = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; dScale_ref = dScale; dBias_ref = dBias; } @@ -311,26 +317,26 @@ struct BNBwdTestData : public BNTestData auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; savedInvVar.generate(gen_var); - + std::fill(dScale.begin(), dScale.end(), 0.); std::fill(dBias.begin(), dBias.end(), 0.); - + std::fill(dScale_ref.begin(), dScale_ref.end(), 0.); std::fill(dBias_ref.begin(), dBias_ref.end(), 0.); } void WriteToGPU() { - auto&& handle = get_handle(); + auto&& handle = get_handle(); - bnScale_dev = handle.Write(bnScale.data); - savedMean_dev = handle.Write(savedMean.data); + bnScale_dev = handle.Write(bnScale.data); + savedMean_dev = handle.Write(savedMean.data); savedInvVar_dev = handle.Write(savedInvVar.data); dy_dev = handle.Write(dy.data); - - dScale_dev = handle.Write(dScale.data); - dBias_dev = handle.Write(dBias.data); - dScale_ref_dev = handle.Write(dScale.data); - dBias_ref_dev = handle.Write(dBias.data); + dScale_dev = handle.Write(dScale.data); + dBias_dev = handle.Write(dBias.data); + + dScale_ref_dev = handle.Write(dScale.data); + dBias_ref_dev = handle.Write(dBias.data); } }; diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index c0a71d8c02..08b119444b 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -25,17 +25,17 @@ *******************************************************************************/ #pragma once -namespace test{ +namespace test { template void ComputeCPUBNInference(DLModule& dl_module) { batchNormSpatialHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); } template @@ -51,22 +51,23 @@ void ComputeCPUBNBwd(DLModule& dl_module) dl_module.savedInvVar); } - template -void CompareTensor(const tensor& output, const tensor& ref_out, const T threshold = std::numeric_limits::epsilon()) +void CompareTensor(const tensor& output, + const tensor& ref_out, + const T threshold = std::numeric_limits::epsilon()) { EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) << "Non finite number found in the GPU data"; EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - auto error = miopen::rms_range(ref_out, output); + auto error = miopen::rms_range(ref_out, output); EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) << "Non finite number found in the CPU data"; EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } -} +} // namespace test namespace test { namespace FusionPlan { From 714b6367c9f63a6143422dd219f8f4af0c91f174 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 8 Sep 2023 21:13:09 +0000 Subject: [PATCH 13/28] bg/LWPMIOPEN-193_bn_back : fix bn backward host template --- test/fusionHost.hpp | 12 ++++++------ test/gtest/bn_test_data.hpp | 5 +---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index be27234c08..4b4bb0eeee 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -280,14 +280,14 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } template -void batchNormSpatialHostBwdTrain(const tensor& x_input, +void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 394d5c251a..01a46228f7 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -60,7 +60,6 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {1, 1, 4, 4, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, @@ -238,7 +237,7 @@ struct BNBwdTestData : public BNTestData { void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl(config, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -336,7 +335,5 @@ struct BNBwdTestData : public BNTestData dScale_dev = handle.Write(dScale.data); dBias_dev = handle.Write(dBias.data); - dScale_ref_dev = handle.Write(dScale.data); - dBias_ref_dev = handle.Write(dBias.data); } }; From e0792eedc41405eca2904f8ae87a5f4a8cf3a2e2 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 11 Sep 2023 02:11:18 +0000 Subject: [PATCH 14/28] bg/LWPMIOPEN-193_bn_back : clang format --- test/fusionHost.hpp | 2 +- test/gtest/bn.hpp | 2 +- test/gtest/bn_bwd.cpp | 8 +++----- test/gtest/bn_infer.cpp | 1 - test/gtest/bn_test_data.hpp | 1 - 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 35032890a5..eed9173541 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -283,7 +283,7 @@ template void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, tensor& dx_out, - const tensor& scale, + const tensor& scale, tensor& dscale, tensor& dbias, const tensor& savedMean, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 87a37ec044..57b7cec6e6 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -80,7 +80,7 @@ struct BNInferTest : public ::testing::TestWithParam + BNInferTestData bn_infer_test_data; miopenTensorLayout_t tensor_layout; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index 9a9ab2ad9d..76a96c883d 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -45,14 +45,12 @@ struct BNBwdTestDouble : BNBwdTest dScale_dev = handle.Write(dScale.data); dBias_dev = handle.Write(dBias.data); - } }; From 335c2127f32a88017aabedeff372c2448b231d8b Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 11 Sep 2023 02:43:00 +0000 Subject: [PATCH 15/28] bg/LWPMIOPEN-193_bn_back : add solver to registry --- src/solver.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver.cpp b/src/solver.cpp index 9d4f28ef5e..77835d7668 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -565,7 +565,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver( registry, ++id, ConvHipImplicitGemm3DGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); RegisterWithSolver(registry, ++id, ConvWinoFuryRxS<2, 3>{}, miopenConvolutionAlgoWinograd); - + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } From 8a4177ba408faab9198bb95bb26011ac72ca70da Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 11 Sep 2023 14:36:42 +0000 Subject: [PATCH 16/28] bg/LWPMIOPEN-193_bn_back : clean ups --- src/solver/batchnorm/backward_ck.cpp | 4 +-- test/gtest/bn.hpp | 49 ++++++++++++++-------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 3e9f597150..d6c1dc95db 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -87,7 +87,7 @@ template -int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormBwd{problem}; using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd& kernels) { std::ignore = kernels; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem + switch(bn_problem.GetXDesc().GetType()) { case miopenFloat: RunCKSolution( diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 57b7cec6e6..f5c2baac31 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -102,31 +102,30 @@ struct BNBwdTest : public ::testing::TestWithParam Date: Mon, 11 Sep 2023 14:48:56 +0000 Subject: [PATCH 17/28] bg/LWPMIOPEN-194 : add static to CheckCKApplicability function --- src/solver.cpp | 2 +- src/solver/batchnorm/forward_inference_ck.cpp | 2 +- test/gtest/bn_test_data.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 9d4f28ef5e..a4bdf5f0f1 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -561,10 +561,10 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId()); RegisterWithSolver( registry, ++id, ConvHipImplicitGemmGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); - Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); RegisterWithSolver( registry, ++id, ConvHipImplicitGemm3DGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); RegisterWithSolver(registry, ++id, ConvWinoFuryRxS<2, 3>{}, miopenConvolutionAlgoWinograd); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index ac6b542825..c074aa6a0d 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -87,7 +87,7 @@ template -int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { const auto& args = CKArgsBNormFwd{problem}; using DeviceOp = ck::tensor_operation::device::DeviceElementwise< diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index a7aff78da3..36d7813fd2 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -170,7 +170,7 @@ struct BNInferTestData : public BNTestData tensor estMean; tensor estVariance; miopen::Allocator::ManageDataPtr scale_dev; - miopen::Allocator::ManageDataPtr shift_dev; // bias + miopen::Allocator::ManageDataPtr shift_dev; miopen::Allocator::ManageDataPtr estMean_dev; miopen::Allocator::ManageDataPtr estVariance_dev; double epsilon = 1.0e-5; From 3ce76ffbca3f5e1a6dca14157e4e21ec7faac8fc Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 11 Sep 2023 17:02:49 +0000 Subject: [PATCH 18/28] bg/LWPMIOPEN-194: fix analyze error --- src/solver/batchnorm/forward_inference_ck.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index c074aa6a0d..b72b4c9ac2 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -175,11 +175,11 @@ static void RunCKSolution(const Handle& handle, } #endif -bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx, +bool BnCKFwdInference::IsApplicable(const ExecutionContext& context const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = ctx; + std::ignore = context; std::ignore = fdesc_problem; return false; #else From cdc4cb719c9a8d2803111015d8cffea000d3b32a Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 11 Sep 2023 17:13:02 +0000 Subject: [PATCH 19/28] bg/LWPMIOPEN-194: fix compile error --- src/solver/batchnorm/forward_inference_ck.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index b72b4c9ac2..5de1c43c3c 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -175,7 +175,7 @@ static void RunCKSolution(const Handle& handle, } #endif -bool BnCKFwdInference::IsApplicable(const ExecutionContext& context +bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL @@ -187,7 +187,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context return false; if(!bn_problem.IsLayoutNHWC()) return false; - if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + if(!ck_utility::is_ck_supported_hardware(context.GetStream())) return false; switch(bn_problem.GetXDesc().GetType()) From 83bd77f057efa09f036647506492ae87cec0d4e9 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 18 Sep 2023 13:50:33 +0000 Subject: [PATCH 20/28] bg/LWPMIOPEN-193_bn_back: fix review comments --- .../miopen/solver/implicitgemm_ck_util.hpp | 65 +++- src/solver/batchnorm/backward_ck.cpp | 280 ++++++++---------- test/fusionHost.hpp | 20 +- 3 files changed, 186 insertions(+), 179 deletions(-) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 8656bdbabc..ce64ee5109 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -41,8 +41,10 @@ typename ConvPtrsType::iterator FindConvPtrByID(ConvPtrsType& conv_ptrs, }); } -template -std::vector FillValidKernelsIDs(const ProblemDescription& problem) +template +std::vector FillValidKernelsIDs(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; const auto conv_ptrs = DeviceOpType::GetInstances(); @@ -59,8 +61,10 @@ std::vector FillValidKernelsIDs(const ProblemDescription& problem) return valid_kernels; } -template -bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& kernel_id) +template +bool IsCKArgsSupported(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -68,20 +72,25 @@ bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& ker return (ptr_iter != conv_ptrs.end()) && CKArgsType{problem}.IsSupportedBy(*ptr_iter); } -template -bool IsCKApplicable(const ProblemDescription& problem) +template +bool IsCKApplicable(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; - if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) - return false; + // if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) + // return false; const auto ptrs = DeviceOpType::GetInstances(); return std::any_of( ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); }); } -template -ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::string& kernel_id) +template +ConvSolution InitInvokerFactory(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -112,5 +121,41 @@ ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::st return result; } +template +ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, + const std::string& kernel_id) +{ + auto conv_ptrs = DeviceOpType::GetInstances(); + auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); + + if(ptr_iter == conv_ptrs.end()) + return ConvSolution(miopenStatusInvalidValue); + + ConvSolution result; + result.invoker_factory = + [ck_args = CKArgsType{problem}, + sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](const std::vector&) mutable { + return [ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr)]( + const Handle& handle, const AnyInvokeParams& primitive_parameters) { + const auto& data_ctx = primitive_parameters.CastTo(); + auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx); + auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer(); + + const auto enable_profiling = handle.IsProfilingEnabled(); + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } + }; + }; + return result; +} + } // namespace solver } // namespace miopen diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index d6c1dc95db..a0e3968a63 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -31,6 +31,7 @@ #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL #include #include +#include #endif MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK) @@ -50,27 +51,81 @@ using F32 = float; using F64 = double; using BF16 = ushort; +template +using DeviceOpBNBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormBwd>; + struct CKArgsBNormBwd { CKArgsBNormBwd(const miopen::batchnorm::ProblemDescription& problem) { std::copy(problem.GetXDesc().GetLengths().begin(), problem.GetXDesc().GetLengths().end(), - xyLengths.begin()); + lens.begin()); std::copy(problem.GetXDesc().GetStrides().begin(), problem.GetXDesc().GetStrides().end(), - xyStrides.begin()); - arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel + strides.begin()); + arrScaleBiasMeanVarLengths[0] = lens[1]; // get channel arrScaleBiasMeanVarStrides[0] = 1; // prep for CK - std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); - std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + std::sort(strides.begin(), strides.end(), std::greater<>()); + std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end()); + } + + CKArgsBNormBwd(const CKArgsBNormBwd&) = default; + CKArgsBNormBwd(CKArgsBNormBwd&&) = default; + CKArgsBNormBwd& operator=(const CKArgsBNormBwd&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(lens, + strides, + strides, + strides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.dy, + data_ctx.bnScale, + data_ctx.savedMean, + data_ctx.savedInvVariance, + epsilon, + PassThrough{}, + data_ctx.dx, + data_ctx.resultBnScaleDiff, + data_ctx.resultBnBiasDiff); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::BwdInvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); } - std::array xyLengths; // inOutLengths - std::array xyStrides; // inOutStrides + std::array lens; // inOutLengths + std::array strides; // inOutStrides std::vector invariantDims; std::array arrScaleBiasMeanVarLengths; @@ -87,119 +142,18 @@ template -static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) -{ - const auto& args = CKArgsBNormBwd{problem}; - using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; - const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - assert(!bn_bwd_ptrs.empty()); - int count = 0; - for(const auto& it : bn_bwd_ptrs) - { - auto argument_ptr = it->MakeArgumentPointer(args.xyLengths, - args.xyStrides, - args.xyStrides, - args.xyStrides, - args.reduceDims, - args.arrScaleBiasMeanVarLengths, - args.arrScaleBiasMeanVarStrides, - args.arrScaleBiasMeanVarStrides, - args.arrScaleBiasMeanVarStrides, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - args.epsilon, - PassThrough{}, - nullptr, - nullptr, - nullptr); - if(it->IsSupportedArgument(argument_ptr.get())) - { - return count; - } - count++; - } - return -1; -} - -template -static void RunCKSolution(const Handle& handle, - const AnyInvokeParams& primitive_parameters, - const miopen::batchnorm::ProblemDescription& problem) +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) { - const auto& args = CKArgsBNormBwd{problem}; - - using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; - const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - int kernel_index = CheckCKApplicability(problem); - assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size()); - auto& bn_ptr = bn_bwd_ptrs.at(kernel_index); - const auto& params = primitive_parameters.CastTo(); - auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths, - args.xyStrides, - args.xyStrides, - args.xyStrides, - args.reduceDims, - args.arrScaleBiasMeanVarLengths, - args.arrScaleBiasMeanVarStrides, - args.arrScaleBiasMeanVarStrides, - args.arrScaleBiasMeanVarStrides, - params.x, - params.dy, - params.bnScale, - params.savedMean, - params.savedInvVariance, - args.epsilon, - PassThrough{}, - params.dx, - params.resultBnScaleDiff, - params.resultBnBiasDiff); - auto invoker_ptr = bn_ptr->MakeInvokerPointer(); - const auto enable_profiling = handle.IsProfilingEnabled(); - - float elapsed_time = - invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); - if(enable_profiling) - { - handle.ResetKernelTime(); - handle.AccumKernelTime(elapsed_time); - } + MeanVarDataType>, + CKArgsBNormBwd>(problem); } + #endif bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, @@ -216,17 +170,16 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, return false; if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) return false; + if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) + return false; switch(bn_problem.GetXDesc().GetType()) { - case miopenFloat: - return (CheckCKApplicability(bn_problem) != -1); - case miopenDouble: - return (CheckCKApplicability(bn_problem) != -1); - case miopenHalf: - return (CheckCKApplicability(bn_problem) != -1); + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case miopenHalf: return CheckCKApplicability(bn_problem); case miopenBFloat16: - return (CheckCKApplicability(bn_problem) != -1); + return CheckCKApplicability(bn_problem); case miopenInt32: case miopenInt8: case miopenInt8x4: @@ -236,48 +189,57 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, #endif } -ConvSolution -BnCKBwdBackward::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& bn_problem) const +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) { -#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = context; - std::ignore = bn_problem; - return {}; -#else - std::ignore = context; + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormBwd>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormBwd, + miopen::batchnorm::BwdInvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKBwdBackward::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { - ConvSolution result; - result.invoker_factory = [=](const std::vector& kernels) { - std::ignore = kernels; - return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { - switch(bn_problem.GetXDesc().GetType()) - { - case miopenFloat: - RunCKSolution( - handle, primitive_parameters, bn_problem); - break; - case miopenDouble: - RunCKSolution( - handle, primitive_parameters, bn_problem); - break; - case miopenHalf: - RunCKSolution( - handle, primitive_parameters, bn_problem); - break; - case miopenBFloat16: - RunCKSolution( - handle, primitive_parameters, bn_problem); - break; - case miopenInt8: - case miopenInt32: - case miopenInt8x4: - default: MIOPEN_THROW("Unsupported datatype"); - } - }; - }; - return result; + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: + return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type"); + } #endif + return {}; } } // namespace batchnorm diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index eed9173541..219b18682c 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -279,15 +279,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -335,7 +335,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) From d6b19eca4314677a23123a4c80cf2952e6b8bf0f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 18 Sep 2023 17:01:16 +0000 Subject: [PATCH 21/28] bg/LWPMIOPEN-193_bn_back : fix tidy error --- src/include/miopen/solver/implicitgemm_ck_util.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index ce64ee5109..318d970170 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -132,7 +132,7 @@ ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); if(ptr_iter == conv_ptrs.end()) - return ConvSolution(miopenStatusInvalidValue); + return {miopenStatusInvalidValue}; ConvSolution result; result.invoker_factory = From 87fb611e643df601dc747e7b54eea62636c96c8c Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 21 Sep 2023 13:48:32 +0000 Subject: [PATCH 22/28] bg/LWPMIOPEN-193_bn_back : make solver epsilon and test driver epsilon same --- src/solver/batchnorm/backward_ck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index a0e3968a63..53f647f483 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -131,7 +131,7 @@ struct CKArgsBNormBwd std::array arrScaleBiasMeanVarLengths; std::array arrScaleBiasMeanVarStrides; - double epsilon = 0.0001; + double epsilon = 1e-5; std::array reduceDims{0, 1, 2}; }; From b77d93b0ebfb02e20a5b7a98a13acc3298a44bb1 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Mon, 25 Sep 2023 14:11:30 +0000 Subject: [PATCH 23/28] bg/LWPMIOPEN-193_bn_back : add CK's cpu ref for batch norm backward test --- test/fusionHost.hpp | 105 +++++++++++++++++++++++++++++++++ test/gtest/bn.hpp | 9 ++- test/gtest/test_operations.hpp | 25 +++++--- 3 files changed, 129 insertions(+), 10 deletions(-) diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 219b18682c..b446835b4b 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -42,6 +42,14 @@ #include "verify.hpp" #include +// add ck guard +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp" + + + + template void convHostForward(const tensor& input, tensor& output, @@ -342,6 +350,103 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, }); // for (channel) } +template +void batchNormSpatialHostBwdTrainCK(DLModule& dl_module) +{ + // batchNormSpatialHostBwdTrain(dl_module.input, + // dl_module.x_input, + // dl_module.ref_out, + // dl_module.bnScale, + // dl_module.resBnScaleDiff, + // dl_module.resBnBiasDiff, + // dl_module.savedMean, + // dl_module.savedInvVariance); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + constexpr ck::index_t Rank = 4; + constexpr ck::index_t NumReduceDim = 3; + + using ReferenceBatchNormBwdInstance = + ck::tensor_operation::host::ReferenceBatchNormBwd; + + auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{}; + std::array arrReduceDims{0, 1, 2}; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + arrScaleBiasMeanVarLengths[0] = dl_module.input.desc.GetLengths()[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + dl_module.input.desc, + dl_module.bn_mode); + + std::array xyLengths; // inOutLengths + std::array xyStrides; + + std::copy(dl_module.input.desc.GetLengths().begin(), + dl_module.input.desc.GetLengths().end(), + xyLengths.begin()); + + std::copy(dl_module.input.desc.GetStrides().begin(), + dl_module.input.desc.GetStrides().end(), + xyStrides.begin()); + + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + + auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer( + xyLengths, + xyStrides, + xyStrides, + xyStrides, + arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + dl_module.input.data.data(), + dl_module.dy.data.data(), + dl_module.bnScale.data.data(), + dl_module.savedMean.data.data(), + dl_module.savedInvVar.data.data(), + dl_module.epsilon, + PassThroughOp{}, + dl_module.ref_out.data.data(), + dl_module.dScale_ref.data.data(), + dl_module.dBias_ref.data.data()); + + if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cerr << "The runtime parameters not supported by the reference instance, exiting!" + << std::endl; + exit(1); + }; + + auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); +} + template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index fb1bf69171..2f0f67af16 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -139,7 +139,14 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); - test::ComputeCPUBNBwd(bn_bwd_test_data); + test::ComputeCPUBNBwd(bn_bwd_test_data); + // using tolerance = 1e-4 since this the tolerance CK uses test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 66c306651b..ad33833da2 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -38,17 +38,24 @@ void ComputeCPUBNInference(DLModule& dl_module) dl_module.estVariance); } -template +template void ComputeCPUBNBwd(DLModule& dl_module) { - batchNormSpatialHostBwdTrain(dl_module.input, - dl_module.dy, - dl_module.ref_out, - dl_module.bnScale, - dl_module.dScale_ref, - dl_module.dBias_ref, - dl_module.savedMean, - dl_module.savedInvVar); + batchNormSpatialHostBwdTrainCK(dl_module); } template From 8929dbb03da624737fd80a30857412889ef36ffe Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 27 Sep 2023 14:32:32 +0000 Subject: [PATCH 24/28] bg/LWPMIOPEN-193_bn_back: moved bn_spatial_nhwc_test.cpp to gtest --- test/bn_spatial_nhwc_test.cpp | 749 ---------------------------------- test/fusionHost.hpp | 10 +- 2 files changed, 1 insertion(+), 758 deletions(-) delete mode 100644 test/bn_spatial_nhwc_test.cpp diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp deleted file mode 100644 index abca57e7ce..0000000000 --- a/test/bn_spatial_nhwc_test.cpp +++ /dev/null @@ -1,749 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#include "driver.hpp" -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "test.hpp" -#include "verify.hpp" -#include "random.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIO_BN_TEST_EXPAVGFACTOR 0.1 -#define MIO_BN_TEST_EPSILON 1e-5 -#define MIO_BN_USE_MIX_PREC 1 -#if MIO_BN_USE_MIX_PREC == 1 -#define PREC_TYPE float -#else -#define PREC_TYPE T -#endif - -template -struct verify_forward_train_bn_spatial -{ - const tensor input; - const tensor scale; - const tensor shift; - - std::tuple, tensor, tensor, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto out = input; - std::fill(out.begin(), out.end(), 0); - - const auto nhw = double(height * width * n_batch); - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean_accum += mean_accum_arr[i]; - - mean_accum /= nhw; - - elemStd = 0.; - variance_accum = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - out(bidx, cidx, row, column) = elemStd = - input(bidx, cidx, row, column) - mean_accum; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance_accum += variance_accum_arr[i]; - - variance_accum /= nhw; - invVar = 1.0 / sqrt(variance_accum + epsilon); - - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - out(bidx, cidx, row, column) = - scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) + - shift(0, 0, 0, cidx); - } - } - } - - saveMean(0, 0, 0, cidx) = mean_accum; - saveInvVar(0, 0, 0, cidx) = invVar; - - newRunMean = runMean(0, 0, 0, cidx) * (1 - expAvgFactor); - runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean; - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, 0, 0, cidx) = - (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust; - }); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - std::tuple, tensor, tensor, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - auto out = input; - std::fill(out.begin(), out.end(), 0); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - auto in_dev = handle.Write(input.data); - auto scale_dev = handle.Write(scale.data); - auto shift_dev = handle.Write(shift.data); - - auto runMean_dev = handle.Write(runMean.data); - auto runVar_dev = handle.Write(runVar.data); - auto saveMean_dev = handle.Create(channels); - auto saveInvVar_dev = handle.Create(channels); - auto out_dev = handle.Create(n_batch * channels * height * width); - - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - float alpha = 1.0; - float beta = 0.0; - - miopen::BatchNormForwardTraining(handle, - miopenBNSpatial, - &alpha, - &beta, - input.desc, - in_dev.get(), - out.desc, - out_dev.get(), - scale.desc, - scale_dev.get(), - shift_dev.get(), - expAvgFactor, - runMean_dev.get(), - runVar_dev.get(), - epsilon, - saveMean_dev.get(), - saveInvVar_dev.get()); - - saveMean.data = handle.Read(saveMean_dev, saveMean.data.size()); - saveInvVar.data = handle.Read(saveInvVar_dev, saveInvVar.data.size()); - runMean.data = handle.Read(runMean_dev, runMean.data.size()); - runVar.data = handle.Read(runVar_dev, runVar.data.size()); - out.data = handle.Read(out_dev, out.data.size()); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - void fail(int badtensor) const - { - std::cout << "Forward Train Spatial Batch Normalization: " << std::endl; - std::cout << "Input tensor: " << input.desc.ToString() << std::endl; - - switch(badtensor) - { - case(0): std::cout << "Output tensor output failed verification." << std::endl; break; - case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break; - case(2): - std::cout << "Running Variance output tensor failed verification." << std::endl; - break; - case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break; - case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_recalc -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - - std::tuple, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double invVar = 0.; - double dyelem = 0.; - double variance = 0.; - - std::vector xhat(height * width * n_batch, 0.0); - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += x_input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean += mean_accum_arr[i]; - - mean /= nhw; - - elemStd = 0.; - variance = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - elemStd = x_input(bidx, cidx, row, column) - mean; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance += variance_accum_arr[i]; - - variance /= nhw; - invVar = 1. / double(sqrt(variance + epsilon)); - - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - nullptr, - nullptr); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_use_saved -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - const tensor savedMean; - const tensor savedInvVar; - std::tuple, tensor, tensor> cpu() const - { - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = savedMean(0, 0, 0, cidx); - double invVar = savedInvVar(0, 0, 0, cidx); - double dyelem = 0.; - - std::vector xhat(n_batch * height * width, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - auto savedMean_dev = handle.Write(savedMean.data); - auto savedInvVar_dev = handle.Write(savedInvVar.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - savedMean_dev.get(), - savedInvVar_dev.get()); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: " - << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct batch_norm_spatial_nhwc_driver : test_driver -{ - tensor input; - tensor scale; - tensor shift; - batch_norm_spatial_nhwc_driver() - { - this->batch_factor = 4; - add(input, - "input", - get_bn_spatial_input_tensor( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 5 : 17})); - } - - void run() - { - std::size_t n, c, h, w; - std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t ssn, ssc, ssh, ssw; - auto derivedBnDesc = miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, c}, - std::vector{c, c, c, 1}); - std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths()); - - std::vector new_len = input.desc.GetLengths(); - std::vector new_str; - miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str); - input.desc = miopen::TensorDescriptor(miopen_type{}, new_len, new_str); - - if(input.desc.GetType() == miopenFloat) - { - scale = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - shift = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - } - else - { - scale = tensor{ssn, ssh, ssw, ssc}; - shift = tensor{ssn, ssh, ssw, ssc}; - - const PREC_TYPE Data_scale = static_cast(1e-4); - for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++) - { - scale[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - shift[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - } - for(std::size_t i = 0; i < input.desc.GetElementSize(); i++) - { - input[i] = prng::gen_descreet_uniform_sign(static_cast(1e-5), 100); - } - } - - auto outpair = verify(verify_forward_train_bn_spatial{input, scale, shift}); - - auto dy_input = std::get<0>(outpair.second); - for(std::size_t bidx = 0; bidx < n; bidx++) - { - for(std::size_t cidx = 0; cidx < c; cidx++) - { - for(std::size_t row = 0; row < h; row++) - { - for(std::size_t column = 0; column < w; column++) - { - dy_input(bidx, cidx, row, column) *= 0.1; - } - } - } - } - this->tolerance = 80 * input.desc.GetElementSize(); - verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); - - auto savedMean = std::get<3>(outpair.second); - auto savedInvVar = std::get<4>(outpair.second); - verify(verify_backward_bn_spatial_use_saved{ - input, dy_input, scale, savedMean, savedInvVar}); - } -}; - -int main(int argc, const char* argv[]) -{ - test_drive(argc, argv); - return 0; -} diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index b446835b4b..95d3645c72 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -360,15 +360,7 @@ template void batchNormSpatialHostBwdTrainCK(DLModule& dl_module) { - // batchNormSpatialHostBwdTrain(dl_module.input, - // dl_module.x_input, - // dl_module.ref_out, - // dl_module.bnScale, - // dl_module.resBnScaleDiff, - // dl_module.resBnBiasDiff, - // dl_module.savedMean, - // dl_module.savedInvVariance); - + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; constexpr ck::index_t Rank = 4; From f348749eaed24fb404cfc82fdd92b1edaecfa7ca Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 27 Sep 2023 14:56:49 +0000 Subject: [PATCH 25/28] bg/LWPMIOPEN-193_bn_back : add new rand --- test/gtest/bn_test_data.hpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index bc77447434..b4950d6af0 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -24,8 +24,7 @@ * *******************************************************************************/ #pragma once - -#include +#include "random.hpp" #include #include @@ -60,7 +59,7 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, @@ -307,17 +306,16 @@ struct BNBwdTestData : public BNTestData void InitTensorsWithRandValue() { - std::random_device rd{}; - std::mt19937 gen{rd()}; - std::uniform_int_distribution<> d{0, 100}; - auto gen_value = [&](auto...) { - return 1e-2 * static_cast(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1); + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); }; dy.generate(gen_value); bnScale.generate(gen_value); savedMean.generate(gen_value); - auto gen_var = [&](auto...) { return 1e-2 * (static_cast(d(gen)) + 1); }; + auto gen_var = [](auto...) { + return static_cast(1e-2) * static_cast(prng::gen_0_to_B(100) + 1); + }; savedInvVar.generate(gen_var); std::fill(dScale.begin(), dScale.end(), 0.); From 6874015cad5fceef6d9b7067c31f74af819843cd Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 27 Sep 2023 15:19:40 +0000 Subject: [PATCH 26/28] bg/LWPMIOPEN-193_bn_back: handle all type in switch case of solver --- src/solver/batchnorm/backward_ck.cpp | 4 ++ test/fusionHost.hpp | 98 ---------------------------- test/gtest/test_operations.hpp | 18 ++--- 3 files changed, 13 insertions(+), 107 deletions(-) diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 53f647f483..fba8724990 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -183,6 +183,8 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, case miopenInt32: case miopenInt8: case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: default: MIOPEN_THROW("Unsupported datatype"); } return false; @@ -235,6 +237,8 @@ ConvSolution BnCKBwdBackward::GetSolution( case miopenInt8: case miopenInt32: case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type"); } diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 95d3645c72..f3219048fb 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -36,20 +36,11 @@ #include #include #include -// #include "driver.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" #include "verify.hpp" #include -// add ck guard -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" - -#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp" - - - - template void convHostForward(const tensor& input, tensor& output, @@ -350,95 +341,6 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, }); // for (channel) } -template -void batchNormSpatialHostBwdTrainCK(DLModule& dl_module) -{ - - using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; - - constexpr ck::index_t Rank = 4; - constexpr ck::index_t NumReduceDim = 3; - - using ReferenceBatchNormBwdInstance = - ck::tensor_operation::host::ReferenceBatchNormBwd; - - auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{}; - std::array arrReduceDims{0, 1, 2}; - - std::array arrScaleBiasMeanVarLengths; - std::array arrScaleBiasMeanVarStrides; - - arrScaleBiasMeanVarLengths[0] = dl_module.input.desc.GetLengths()[1]; // get channel - arrScaleBiasMeanVarStrides[0] = 1; - - auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - dl_module.input.desc, - dl_module.bn_mode); - - std::array xyLengths; // inOutLengths - std::array xyStrides; - - std::copy(dl_module.input.desc.GetLengths().begin(), - dl_module.input.desc.GetLengths().end(), - xyLengths.begin()); - - std::copy(dl_module.input.desc.GetStrides().begin(), - dl_module.input.desc.GetStrides().end(), - xyStrides.begin()); - - std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); - std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); - - auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer( - xyLengths, - xyStrides, - xyStrides, - xyStrides, - arrReduceDims, - arrScaleBiasMeanVarLengths, - arrScaleBiasMeanVarStrides, - arrScaleBiasMeanVarStrides, - arrScaleBiasMeanVarStrides, - dl_module.input.data.data(), - dl_module.dy.data.data(), - dl_module.bnScale.data.data(), - dl_module.savedMean.data.data(), - dl_module.savedInvVar.data.data(), - dl_module.epsilon, - PassThroughOp{}, - dl_module.ref_out.data.data(), - dl_module.dScale_ref.data.data(), - dl_module.dBias_ref.data.data()); - - if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get())) - { - std::cerr << "The runtime parameters not supported by the reference instance, exiting!" - << std::endl; - exit(1); - }; - - auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer(); - - (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); -} - template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index ad33833da2..5d1e62a5b8 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -44,18 +44,18 @@ template void ComputeCPUBNBwd(DLModule& dl_module) { - batchNormSpatialHostBwdTrainCK(dl_module); + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.ref_out, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); } template From 11f66e92a0cec484e171e5ace64e0b3c5ee7aaab Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 27 Sep 2023 15:25:52 +0000 Subject: [PATCH 27/28] bg/LWPMIOPEN-193_bn_back: clang format --- test/gtest/bn.hpp | 14 +++++++------- test/gtest/bn_test_data.hpp | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 2f0f67af16..a89e94718e 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -140,13 +140,13 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); test::ComputeCPUBNBwd(bn_bwd_test_data); - + DxDataType, + DyDataType, + AccDataType, + ScaleDataType, + DscaleDbiasDataType, + MeanVarDataType>(bn_bwd_test_data); + // using tolerance = 1e-4 since this the tolerance CK uses test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index b4950d6af0..822369725e 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -314,7 +314,8 @@ struct BNBwdTestData : public BNTestData savedMean.generate(gen_value); auto gen_var = [](auto...) { - return static_cast(1e-2) * static_cast(prng::gen_0_to_B(100) + 1); + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); }; savedInvVar.generate(gen_var); From d5eb31cb97a0c170ccf3e70678d7340d73487ae2 Mon Sep 17 00:00:00 2001 From: mentat <108366729+bghimireamd@users.noreply.github.com> Date: Thu, 5 Oct 2023 00:08:47 -0500 Subject: [PATCH 28/28] bg/LWPMIOPEN-192: Integrate CK's batch norm forward training into non-tunable MIOpen solver (#2386) * bg/LWPMIOPEN-192: add batch norm foward CK kernel * bg/LWPMIOPEN-192 : analyze cleanup * fix a typo * bg/LWPMIOPEN-192: fix review comments * bg/LWPMIOPEN-192 : fix compile error * bg/LWPMIOPEN-192 : fix clang tidy --------- Co-authored-by: Jun Liu --- src/CMakeLists.txt | 1 + src/include/miopen/batchnorm/solvers.hpp | 10 + src/ocl/batchnormocl.cpp | 3 +- src/solver.cpp | 1 + src/solver/batchnorm/forward_training_ck.cpp | 239 +++++++++++++++++++ test/fusionHost.hpp | 10 +- test/gtest/bn.hpp | 91 ++++++- test/gtest/bn_fwd_train.cpp | 73 ++++++ test/gtest/bn_test_data.hpp | 103 ++++++++ test/gtest/test_operations.hpp | 15 ++ 10 files changed, 539 insertions(+), 7 deletions(-) create mode 100644 src/solver/batchnorm/forward_training_ck.cpp create mode 100644 test/gtest/bn_fwd_train.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 554c8ff4ec..abc0679a8a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -164,6 +164,7 @@ set( MIOpen_Source solver/batchnorm/forward_per_activation_fused.cpp solver/batchnorm/forward_spatial_multiple.cpp solver/batchnorm/forward_spatial_single.cpp + solver/batchnorm/forward_training_ck.cpp solver/conv_asm_1x1u.cpp solver/conv_asm_1x1u_bias_activ_fused.cpp solver/conv_asm_1x1u_stride2.cpp diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 51fa1d1643..70d64bb204 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -152,6 +152,16 @@ struct BnCKBwdBackward final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; +struct BnCKFwdTraining final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + } // namespace batchnorm } // namespace solver diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 7a20038c55..6147a827b8 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -131,7 +131,8 @@ void BatchNormForwardTraining(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; diff --git a/src/solver.cpp b/src/solver.cpp index ef90f508b2..4cd680dd9c 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -570,6 +570,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) registry, ++id, ConvHipImplicitGemm3DGroupBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdTraining{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp new file mode 100644 index 0000000000..a65cec14a9 --- /dev/null +++ b/src/solver/batchnorm/forward_training_ck.cpp @@ -0,0 +1,239 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; + +template +using DeviceOpBNFwdTrainingPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormFwd>; + +struct CKArgsBNormFwdTraining +{ + CKArgsBNormFwdTraining(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + xyLengths.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + xyStrides.begin()); + arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + } + + CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default; + CKArgsBNormFwdTraining(CKArgsBNormFwdTraining&&) = default; + CKArgsBNormFwdTraining& operator=(const CKArgsBNormFwdTraining&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.bnScale, + data_ctx.bnBias, + data_ctx.epsilon, + PassThroughOp{}, + data_ctx.y, + data_ctx.resultSaveMean, + data_ctx.resultSaveInvVariance, + data_ctx.expAvgFactor, + data_ctx.resultRunningMean, + data_ctx.resultRunningVariance); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::InvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); + } + + std::array xyLengths; + std::array xyStrides; + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + std::array reduceDims{0, 1, 2}; +}; + +template +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +{ + return IsCKApplicable, + CKArgsBNormFwdTraining>(problem); +} +#endif + +bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = context; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(context.GetStream())) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case miopenBFloat16: return CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type"); + } + return false; +#endif +} + +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) +{ + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormFwdTraining>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormFwdTraining, + miopen::batchnorm::InvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKFwdTraining::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { + + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKFwdTraining operation not for this data type"); + } +#endif + return {}; +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index f3219048fb..5374abd1fa 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -202,17 +202,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index a89e94718e..22f8391fe6 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -84,7 +84,6 @@ struct BNInferTest : public ::testing::TestWithParam +struct BNFwdTrainTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationForwardTraining(&handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + + std::fill(bn_fwd_train_test_data.output.begin(), + bn_fwd_train_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveMean_ref.begin(), + bn_fwd_train_test_data.saveMean_ref.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(), + bn_fwd_train_test_data.saveVariance_ref.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_fwd_train_test_data.output.data = handle.Read( + bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); + + bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); + bn_fwd_train_test_data.saveVariance.data = + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( + bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); + bn_fwd_train_test_data.runVariance.data = + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); + test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); + + // 4e-3 is tolerance used by CK kernel. + test::CompareTensor( + bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNFwdTrainTestData + bn_fwd_train_test_data; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp new file mode 100644 index 0000000000..4a4dd4c728 --- /dev/null +++ b/test/gtest/bn_fwd_train.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNFwdTrainTestHalf + : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestFloat : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestDouble : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestBFloat16 : BNFwdTrainTest +{ +}; + +TEST_P(BNFwdTrainTestHalf, BnFwdTrainCKHalf) {} + +TEST_P(BNFwdTrainTestFloat, BnFwdTrainCKFloat) {} + +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNFwdTrainTestDouble, DISABLED_BnFwdTrainCKDouble) {} +TEST_P(BNFwdTrainTestBFloat16, DISABLED_BnFwdTrainCKBFloat16) {} + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestHalfNHWCSuite, + BNFwdTrainTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite, + BNFwdTrainTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite, + BNFwdTrainTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite, + BNFwdTrainTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 822369725e..f65c694281 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -338,3 +338,106 @@ struct BNBwdTestData : public BNTestData dBias_dev = handle.Write(dBias.data); } }; + +template +struct BNFwdTrainTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor scale; + tensor shift; + tensor saveMean; + tensor saveVariance; + tensor runMean; + tensor runVariance; + + tensor saveMean_ref; + tensor saveVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; + + miopen::Allocator::ManageDataPtr scale_dev; + miopen::Allocator::ManageDataPtr shift_dev; // bias + miopen::Allocator::ManageDataPtr saveMean_dev; + miopen::Allocator::ManageDataPtr saveVariance_dev; + miopen::Allocator::ManageDataPtr runMean_dev; + miopen::Allocator::ManageDataPtr runVariance_dev; + double epsilon = 1.0e-5; + double averageFactor = 0.1; + float alpha = static_cast(1.0f); + float beta = static_cast(0); + const float activ_alpha = static_cast(0.5f); + const float activ_beta = static_cast(0.5f); + const float activ_gamma = static_cast(0.5f); + +private: + void CreateTensors() + { + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + } + + void InitTensorsWithRandValue() + { + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); + }; + scale.generate(gen_value); + shift.generate(gen_value); + + auto gen_var = [](auto...) { + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); + }; + runMean.generate(gen_var); + runVariance.generate(gen_var); + + saveMean_ref = saveMean; + saveVariance_ref = saveVariance; + runMean_ref = runMean; + runVariance_ref = runVariance; + } + void WriteToGPU() + { + auto&& handle = get_handle(); + scale_dev = handle.Write(scale.data); + shift_dev = handle.Write(shift.data); + saveMean_dev = handle.Write(saveMean.data); + saveVariance_dev = handle.Write(saveVariance.data); + runMean_dev = handle.Write(runMean.data); + runVariance_dev = handle.Write(runVariance.data); + } +}; diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 5d1e62a5b8..da41212302 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -58,6 +58,21 @@ void ComputeCPUBNBwd(DLModule& dl_module) dl_module.savedInvVar); } +template +void ComputeCPUBNFwdTrain(DLModule& dl_module) +{ + batchNormSpatialHostFwdTrain(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); +} + template void CompareTensor(const tensor& output, const tensor& ref_out,