From d5eb31cb97a0c170ccf3e70678d7340d73487ae2 Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Thu, 5 Oct 2023 00:08:47 -0500
Subject: [PATCH] bg/LWPMIOPEN-192: Integrate CK's batch norm forward training
 into non-tunable MIOpen solver (#2386)

* bg/LWPMIOPEN-192: add batch norm forward CK kernel

* bg/LWPMIOPEN-192: analyze cleanup

* fix a typo

* bg/LWPMIOPEN-192: fix review comments

* bg/LWPMIOPEN-192: fix compile error

* bg/LWPMIOPEN-192: fix clang tidy

---------

Co-authored-by: Jun Liu
---
 src/CMakeLists.txt                           |   1 +
 src/include/miopen/batchnorm/solvers.hpp     |  10 +
 src/ocl/batchnormocl.cpp                     |   3 +-
 src/solver.cpp                               |   1 +
 src/solver/batchnorm/forward_training_ck.cpp | 239 +++++++++++++++++++
 test/fusionHost.hpp                          |  10 +-
 test/gtest/bn.hpp                            |  91 ++++++-
 test/gtest/bn_fwd_train.cpp                  |  73 ++++++
 test/gtest/bn_test_data.hpp                  | 103 ++++++++
 test/gtest/test_operations.hpp               |  15 ++
 10 files changed, 539 insertions(+), 7 deletions(-)
 create mode 100644 src/solver/batchnorm/forward_training_ck.cpp
 create mode 100644 test/gtest/bn_fwd_train.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 554c8ff4ec..abc0679a8a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -164,6 +164,7 @@ set( MIOpen_Source
     solver/batchnorm/forward_per_activation_fused.cpp
     solver/batchnorm/forward_spatial_multiple.cpp
     solver/batchnorm/forward_spatial_single.cpp
+    solver/batchnorm/forward_training_ck.cpp
     solver/conv_asm_1x1u.cpp
     solver/conv_asm_1x1u_bias_activ_fused.cpp
     solver/conv_asm_1x1u_stride2.cpp
diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 51fa1d1643..70d64bb204 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -152,6 +152,16 @@ struct BnCKBwdBackward final : BatchnormSolver
                       const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
+struct BnCKFwdTraining final : BatchnormSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<BnCKFwdTraining>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::batchnorm::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& problem) const override;
+};
+
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 7a20038c55..6147a827b8 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -131,7 +131,8 @@ void BatchNormForwardTraining(Handle& handle,
         return tmp;
     }();
 
     const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdTrainingSpatialSingle,
                                                  solver::batchnorm::BnFwdTrainingSpatialMultiple,
-                                                 solver::batchnorm::BnFwdTrainingPerActivation>{};
+                                                 solver::batchnorm::BnFwdTrainingPerActivation,
+                                                 solver::batchnorm::BnCKFwdTraining>{};
 
diff --git a/src/solver.cpp b/src/solver.cpp
index ef90f508b2..4cd680dd9c 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -570,6 +570,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
         registry, ++id, ConvHipImplicitGemm3DGroupBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
     Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId());
     Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdTraining{}.SolverDbId());
 
     // IMPORTANT: New solvers should be added to the end of the function!
 }
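A note on how the pieces above fit together: batch-norm solvers in MIOpen are non-tunable, so a new backend only has to provide the IsApplicable/GetSolution pair declared in solvers.hpp, register a database id in solver.cpp, and be listed in the SolverContainer that BatchNormForwardTraining consults. The sketch below distills the first-applicable-wins dispatch this enables; Context, Problem, Solution, and Solver are illustrative stand-ins, not MIOpen's real classes, and MIOpen's actual container logic is more involved.

    #include <cstdio>
    #include <memory>
    #include <optional>
    #include <vector>

    // Illustrative stand-ins for MIOpen's ExecutionContext,
    // ProblemDescription, and ConvSolution -- not the real types.
    struct Context {};
    struct Problem { bool is_nhwc = true; };
    struct Solution { const char* solver = ""; };

    struct Solver // mirrors the interface declared in solvers.hpp above
    {
        virtual ~Solver() = default;
        virtual bool IsApplicable(const Context&, const Problem&) const = 0;
        virtual Solution GetSolution(const Context&, const Problem&) const = 0;
    };

    // Mimics BnCKFwdTraining's gating: applicable only to NHWC problems.
    struct CkLikeSolver final : Solver
    {
        bool IsApplicable(const Context&, const Problem& p) const override { return p.is_nhwc; }
        Solution GetSolution(const Context&, const Problem&) const override { return {"ck"}; }
    };

    // The first applicable solver wins, so the order of the template
    // arguments to SolverContainer is the priority order.
    std::optional<Solution> Solve(const std::vector<std::unique_ptr<Solver>>& solvers,
                                  const Context& ctx,
                                  const Problem& prob)
    {
        for(const auto& s : solvers)
            if(s->IsApplicable(ctx, prob))
                return s->GetSolution(ctx, prob);
        return std::nullopt;
    }

    int main()
    {
        std::vector<std::unique_ptr<Solver>> solvers;
        solvers.emplace_back(std::make_unique<CkLikeSolver>());
        const auto sol = Solve(solvers, Context{}, Problem{});
        std::printf("picked: %s\n", sol ? sol->solver : "none");
    }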
diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp
new file mode 100644
index 0000000000..a65cec14a9
--- /dev/null
+++ b/src/solver/batchnorm/forward_training_ck.cpp
@@ -0,0 +1,239 @@
+
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/batchnorm/solvers.hpp>
+#include <miopen/batchnorm/invoke_params.hpp>
+#include <miopen/batch_norm.hpp>
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+#include <miopen/solver/ck_utility_common.hpp>
+#include <ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp>
+#include <miopen/solver/implicitgemm_ck_util.hpp>
+#endif
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING)
+
+namespace miopen {
+namespace solver {
+namespace batchnorm {
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+
+using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
+using index_t       = int32_t;
+
+constexpr index_t Rank                  = 4;
+constexpr index_t NumBatchNormReduceDim = 3;
+
+using F16  = ck::half_t;
+using F32  = float;
+using F64  = double;
+using BF16 = ushort;
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+using DeviceOpBNFwdTrainingPtrs =
+    ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
+                                                         YDataType,
+                                                         AccDataType,
+                                                         ScaleDataType,
+                                                         BiasDataType,
+                                                         MeanVarDataType,
+                                                         PassThroughOp,
+                                                         Rank,
+                                                         NumBatchNormReduceDim>>;
+
+struct CKArgsBNormFwdTraining
+{
+    CKArgsBNormFwdTraining(const miopen::batchnorm::ProblemDescription& problem)
+    {
+        std::copy(problem.GetXDesc().GetLengths().begin(),
+                  problem.GetXDesc().GetLengths().end(),
+                  xyLengths.begin());
+
+        std::copy(problem.GetXDesc().GetStrides().begin(),
+                  problem.GetXDesc().GetStrides().end(),
+                  xyStrides.begin());
+        arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel
+        arrScaleBiasMeanVarStrides[0] = 1;
+
+        // prep for CK
+        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
+        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+    }
+
+    CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default;
+    CKArgsBNormFwdTraining(CKArgsBNormFwdTraining&&)      = default;
+    CKArgsBNormFwdTraining& operator=(const CKArgsBNormFwdTraining&) = default;
+
+    template <typename InvokerPtr, typename InvokerParams>
+    auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const
+    {
+        return invoker_ptr->MakeArgumentPointer(xyLengths,
+                                                xyStrides,
+                                                xyStrides,
+                                                reduceDims,
+                                                arrScaleBiasMeanVarLengths,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                data_ctx.x,
+                                                data_ctx.bnScale,
+                                                data_ctx.bnBias,
+                                                data_ctx.epsilon,
+                                                PassThroughOp{},
+                                                data_ctx.y,
+                                                data_ctx.resultSaveMean,
+                                                data_ctx.resultSaveInvVariance,
+                                                data_ctx.expAvgFactor,
+                                                data_ctx.resultRunningMean,
+                                                data_ctx.resultRunningVariance);
+    }
+
+    template <typename ConvPtr>
+    bool IsSupportedBy(const ConvPtr& invoker_ptr) const
+    {
+        auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::InvokeParams{});
+        return invoker_ptr->IsSupportedArgument(arg_ptr.get());
+    }
+
+    std::array<ck::index_t, Rank> xyLengths;
+    std::array<ck::index_t, Rank> xyStrides;
+    std::vector<int> invariantDims;
+
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
+
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+};
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
+{
+    return IsCKApplicable<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                    YDataType,
+                                                    AccDataType,
+                                                    ScaleDataType,
+                                                    BiasDataType,
+                                                    MeanVarDataType>,
+                          CKArgsBNormFwdTraining>(problem);
+}
+#endif
+
+bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context,
+                                   const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = context;
+    std::ignore = bn_problem;
+    return false;
+#else
+    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{}))
+        return false;
+    if(!bn_problem.IsLayoutNHWC())
+        return false;
+    if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
+        return false;
+
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenHalf: return CheckCKApplicability<F16, F16, F32, F16, F16, F32>(bn_problem);
+    case miopenFloat: return CheckCKApplicability<F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenBFloat16: return CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
+    case miopenInt32:
+    case miopenInt8:
+    case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
+    default: MIOPEN_THROW("BnCKFwdTraining operation does not support this data type");
+    }
+    return false;
+#endif
+}
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem)
+{
+    const auto& valid_kernel_ids = FillValidKernelsIDs<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                                                 YDataType,
+                                                                                 AccDataType,
+                                                                                 ScaleDataType,
+                                                                                 BiasDataType,
+                                                                                 MeanVarDataType>,
+                                                       CKArgsBNormFwdTraining>(bn_problem);
+    assert(!valid_kernel_ids.empty());
+    const auto& kernel_id = valid_kernel_ids[0];
+    return InitAnyInvokerFactory<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                           YDataType,
+                                                           AccDataType,
+                                                           ScaleDataType,
+                                                           BiasDataType,
+                                                           MeanVarDataType>,
+                                 CKArgsBNormFwdTraining,
+                                 miopen::batchnorm::InvokeParams>(bn_problem, kernel_id);
+}
+
+ConvSolution BnCKFwdTraining::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenFloat: return MakeAnyInvokerFactory<F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return MakeAnyInvokerFactory<F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenHalf: return MakeAnyInvokerFactory<F16, F16, F32, F16, F16, F32>(bn_problem);
+    case miopenBFloat16: return MakeAnyInvokerFactory<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
+    case miopenInt8:
+    case miopenInt32:
+    case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
+    default:
+        MIOPEN_THROW(miopenStatusInternalError,
+                     "BnCKFwdTraining operation does not support this data type");
+    }
+#endif
+    return {};
+}
+
+} // namespace batchnorm
+} // namespace solver
+} // namespace miopen
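The least obvious part of the new solver is the constructor of CKArgsBNormFwdTraining: MIOpen tensor descriptors keep lengths and strides in NCHW order even for NHWC tensors, while the CK batch-norm instances expect dimensions in memory order. Sorting the strides in descending order and rotating the lengths to {N, H, W, C} produces that view, and reduceDims = {0, 1, 2} then reduces over N, H, W, leaving only the channel dimension. A small standalone example of the permutation, with values chosen purely for illustration:

    #include <algorithm>
    #include <array>
    #include <cstdio>
    #include <functional>

    int main()
    {
        // NHWC problem with N=2, C=8, H=4, W=4, described NCHW-style:
        std::array<int, 4> lengths{2, 8, 4, 4};             // {N, C, H, W}
        std::array<int, 4> strides{4 * 4 * 8, 1, 4 * 8, 8}; // {H*W*C, 1, W*C, C}

        // The same two statements as CKArgsBNormFwdTraining's constructor:
        std::sort(strides.begin(), strides.end(), std::greater<>());
        std::rotate(lengths.begin() + 1, lengths.begin() + 2, lengths.end());

        for(std::size_t i = 0; i < lengths.size(); ++i)
            std::printf("dim %zu: length=%d stride=%d\n", i, lengths[i], strides[i]);
    }

After the two calls, lengths are {2, 4, 4, 8} and strides are {128, 32, 8, 1}: a dense NHWC view with the channel dimension innermost, which is the layout the CK instances are generated for.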
runMean, + tensor& runVar) { int height, width, n_batch, channels; diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index a89e94718e..22f8391fe6 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -84,7 +84,6 @@ struct BNInferTest : public ::testing::TestWithParam +struct BNFwdTrainTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationForwardTraining(&handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + + std::fill(bn_fwd_train_test_data.output.begin(), + bn_fwd_train_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveMean_ref.begin(), + bn_fwd_train_test_data.saveMean_ref.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(), + bn_fwd_train_test_data.saveVariance_ref.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_fwd_train_test_data.output.data = handle.Read( + bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); + + bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); + bn_fwd_train_test_data.saveVariance.data = + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( + bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); + bn_fwd_train_test_data.runVariance.data = + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); + test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); + + // 4e-3 is tolerance used by CK kernel. 
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index a89e94718e..22f8391fe6 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -84,7 +84,6 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<TConfig, miopenTensorLayout_t>>
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename BNTestCase>
+struct BNFwdTrainTest
+    : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
+{
+protected:
+    void SetUp() override
+    {
+        test_skipped = false;
+        std::tie(bn_config, tensor_layout) = GetParam();
+        bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout);
+
+        auto&& handle = get_handle();
+        miopenBatchNormalizationForwardTraining(&handle,
+                                                bn_config.mode,
+                                                &bn_fwd_train_test_data.alpha,
+                                                &bn_fwd_train_test_data.beta,
+                                                &bn_fwd_train_test_data.input.desc,
+                                                bn_fwd_train_test_data.in_dev.get(),
+                                                &bn_fwd_train_test_data.output.desc,
+                                                bn_fwd_train_test_data.out_dev.get(),
+                                                &bn_fwd_train_test_data.scale.desc,
+                                                bn_fwd_train_test_data.scale_dev.get(),
+                                                bn_fwd_train_test_data.shift_dev.get(),
+                                                bn_fwd_train_test_data.averageFactor,
+                                                bn_fwd_train_test_data.runMean_dev.get(),
+                                                bn_fwd_train_test_data.runVariance_dev.get(),
+                                                bn_fwd_train_test_data.epsilon,
+                                                bn_fwd_train_test_data.saveMean_dev.get(),
+                                                bn_fwd_train_test_data.saveVariance_dev.get());
+
+        std::fill(bn_fwd_train_test_data.output.begin(),
+                  bn_fwd_train_test_data.output.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
+        std::fill(bn_fwd_train_test_data.saveMean_ref.begin(),
+                  bn_fwd_train_test_data.saveMean_ref.end(),
+                  std::numeric_limits<MeanVarDataType>::quiet_NaN());
+        std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(),
+                  bn_fwd_train_test_data.saveVariance_ref.end(),
+                  std::numeric_limits<MeanVarDataType>::quiet_NaN());
+    }
+
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+        auto&& handle = get_handle();
+        bn_fwd_train_test_data.output.data = handle.Read<YDataType>(
+            bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size());
+
+        bn_fwd_train_test_data.saveMean.data = handle.Read<MeanVarDataType>(
+            bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size());
+        bn_fwd_train_test_data.saveVariance.data =
+            handle.Read<MeanVarDataType>(bn_fwd_train_test_data.saveVariance_dev,
+                                         bn_fwd_train_test_data.saveVariance_ref.data.size());
+        bn_fwd_train_test_data.runMean.data = handle.Read<MeanVarDataType>(
+            bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size());
+        bn_fwd_train_test_data.runVariance.data =
+            handle.Read<MeanVarDataType>(bn_fwd_train_test_data.runVariance_dev,
+                                         bn_fwd_train_test_data.runVariance_ref.data.size());
+        test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data);
+
+        // 4e-3 is the tolerance used by the CK kernel.
+        test::CompareTensor<YDataType>(
+            bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3);
+    }
+
+    BNTestCase bn_config;
+    bool test_skipped = false;
+    BNFwdTrainTestData<XDataType,
+                       YDataType,
+                       ScaleDataType,
+                       BiasDataType,
+                       MeanVarDataType,
+                       BNTestCase>
+        bn_fwd_train_test_data;
+    miopenTensorLayout_t tensor_layout;
+};
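One detail of the fixture worth calling out: SetUp launches the kernel first and then overwrites the host-side output and reference buffers with quiet_NaN. TearDown's handle.Read calls must overwrite every one of those sentinels, so a failed readback, or a kernel that silently wrote nothing, surfaces as NaN mismatches in CompareTensor instead of a spurious pass against stale host data. A minimal sketch of the idea, with a hypothetical helper that is not part of the test:

    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Hypothetical helper illustrating the sentinel idea; the test itself
    // simply relies on CompareTensor failing when a NaN survives readback.
    bool AllOverwritten(const std::vector<float>& readback)
    {
        for(float v : readback)
            if(std::isnan(v)) // still the quiet_NaN pre-fill: never written
                return false;
        return true;
    }

    int main()
    {
        std::vector<float> buf(4, std::numeric_limits<float>::quiet_NaN());
        std::printf("%d\n", AllOverwritten(buf)); // prints 0: nothing read back
    }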
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
new file mode 100644
index 0000000000..4a4dd4c728
--- /dev/null
+++ b/test/gtest/bn_fwd_train.cpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "bn.hpp"
+
+struct BNFwdTrainTestHalf : BNFwdTrainTest<half_float::half,
+                                           half_float::half,
+                                           half_float::half,
+                                           half_float::half,
+                                           float,
+                                           BNTestCase>
+{
+};
+
+struct BNFwdTrainTestFloat : BNFwdTrainTest<float, float, float, float, float, BNTestCase>
+{
+};
+
+struct BNFwdTrainTestDouble : BNFwdTrainTest<double, double, double, double, double, BNTestCase>
+{
+};
+
+struct BNFwdTrainTestBFloat16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, float, BNTestCase>
+{
+};
+
+TEST_P(BNFwdTrainTestHalf, BnFwdTrainCKHalf) {}
+
+TEST_P(BNFwdTrainTestFloat, BnFwdTrainCKFloat) {}
+
+// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig
+// only supports half and float
+TEST_P(BNFwdTrainTestDouble, DISABLED_BnFwdTrainCKDouble) {}
+TEST_P(BNFwdTrainTestBFloat16, DISABLED_BnFwdTrainCKBFloat16) {}
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestHalfNHWCSuite,
+                         BNFwdTrainTestHalf,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite,
+                         BNFwdTrainTestFloat,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestDoubleNHWCSuite,
+                         BNFwdTrainTestDouble,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestBFloat16NHWCSuite,
+                         BNFwdTrainTestBFloat16,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
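The suites can be exercised individually with standard GoogleTest flags; the binary name and path below are assumptions that depend on the build tree:

    # run only the float NHWC suite
    ./bin/test_bn_fwd_train --gtest_filter='BNFwdTrainTestFloatNHWCSuite/*'
    # the DISABLED_ double/bfloat16 tests can still be forced to run
    ./bin/test_bn_fwd_train --gtest_also_run_disabled_tests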
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 822369725e..f65c694281 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -338,3 +338,106 @@ struct BNBwdTestData : public BNTestData<XDataType, YDataType, TConfig>
         dBias_dev = handle.Write(dBias.data);
     }
 };
+
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename TConfig>
+struct BNFwdTrainTestData : public BNTestData<XDataType, YDataType, TConfig>
+{
+    void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
+    {
+        BNTestData<XDataType, YDataType, TConfig>::SetUpImpl(config, t_layout);
+        CreateTensors();
+        InitTensorsWithRandValue();
+        WriteToGPU();
+    }
+
+    tensor<ScaleDataType> scale;
+    tensor<BiasDataType> shift;
+    tensor<MeanVarDataType> saveMean;
+    tensor<MeanVarDataType> saveVariance;
+    tensor<MeanVarDataType> runMean;
+    tensor<MeanVarDataType> runVariance;
+
+    tensor<MeanVarDataType> saveMean_ref;
+    tensor<MeanVarDataType> saveVariance_ref;
+    tensor<MeanVarDataType> runMean_ref;
+    tensor<MeanVarDataType> runVariance_ref;
+
+    miopen::Allocator::ManageDataPtr scale_dev;
+    miopen::Allocator::ManageDataPtr shift_dev; // bias
+    miopen::Allocator::ManageDataPtr saveMean_dev;
+    miopen::Allocator::ManageDataPtr saveVariance_dev;
+    miopen::Allocator::ManageDataPtr runMean_dev;
+    miopen::Allocator::ManageDataPtr runVariance_dev;
+    double epsilon          = 1.0e-5;
+    double averageFactor    = 0.1;
+    float alpha             = static_cast<float>(1.0f);
+    float beta              = static_cast<float>(0);
+    const float activ_alpha = static_cast<float>(0.5f);
+    const float activ_beta  = static_cast<float>(0.5f);
+    const float activ_gamma = static_cast<float>(0.5f);
+
+private:
+    void CreateTensors()
+    {
+        auto derivedBnDesc = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
+                                         BNTestData<XDataType, YDataType, TConfig>::input.desc,
+                                         BNTestData<XDataType, YDataType, TConfig>::bn_mode);
+        scale    = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                                         BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                         derivedBnDesc.GetLengths()};
+        shift    = tensor<BiasDataType>{miopen_type<BiasDataType>{},
+                                        BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                        derivedBnDesc.GetLengths()};
+        saveMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                           BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                           derivedBnDesc.GetLengths()};
+        saveVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+        runMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                          derivedBnDesc.GetLengths()};
+        runVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+    }
+
+    void InitTensorsWithRandValue()
+    {
+        auto gen_value = [](auto...) {
+            return prng::gen_descreet_uniform_sign(static_cast<ScaleDataType>(1e-2), 100);
+        };
+        scale.generate(gen_value);
+        shift.generate(gen_value);
+
+        auto gen_var = [](auto...) {
+            return static_cast<MeanVarDataType>(1e-2) *
+                   static_cast<MeanVarDataType>(prng::gen_0_to_B(100) + 1);
+        };
+        runMean.generate(gen_var);
+        runVariance.generate(gen_var);
+
+        saveMean_ref     = saveMean;
+        saveVariance_ref = saveVariance;
+        runMean_ref      = runMean;
+        runVariance_ref  = runVariance;
+    }
+    void WriteToGPU()
+    {
+        auto&& handle    = get_handle();
+        scale_dev        = handle.Write(scale.data);
+        shift_dev        = handle.Write(shift.data);
+        saveMean_dev     = handle.Write(saveMean.data);
+        saveVariance_dev = handle.Write(saveVariance.data);
+        runMean_dev      = handle.Write(runMean.data);
+        runVariance_dev  = handle.Write(runVariance.data);
+    }
+};
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 5d1e62a5b8..da41212302 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -58,6 +58,21 @@ void ComputeCPUBNBwd(DLModule& dl_module)
                          dl_module.savedInvVar);
 }
 
+template <typename DLModule>
+void ComputeCPUBNFwdTrain(DLModule& dl_module)
+{
+    batchNormSpatialHostFwdTrain(dl_module.input,
+                                 dl_module.ref_out,
+                                 dl_module.scale,
+                                 dl_module.shift,
+                                 dl_module.epsilon,
+                                 dl_module.averageFactor,
+                                 dl_module.saveMean_ref,
+                                 dl_module.saveVariance_ref,
+                                 dl_module.runMean_ref,
+                                 dl_module.runVariance_ref);
+}
+
 template <typename T>
 void CompareTensor(const tensor<T>& output,
                    const tensor<T>& ref_out,