From e39aa0444843f1d63b31e4c93cd8079455072e29 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Thu, 6 Jul 2023 06:40:57 +0000
Subject: [PATCH 01/28] bg/LWPMIOPEN-194 : ck batch norm test pass

---
 src/CMakeLists.txt                            |   1 +
 .../miopen/batchnorm/problem_description.hpp  |   2 +
 src/include/miopen/batchnorm/solvers.hpp      |  13 +
 src/include/miopen/fusion/solvers.hpp         |  54 +++
 src/solver/batchnorm/forward_inference_ck.cpp | 420 ++++++++++++++++++
 test/gtest/bn.hpp                             |  67 +++
 test/gtest/bn_test_base.hpp                   | 219 +++++++++
 test/gtest/log_test_helper.cpp                |   4 +-
 test/gtest/na.hpp                             | 185 ++------
 test/gtest/na_infer.cpp                       |  77 +++-
 test/gtest/test_fusion_plan_base.hpp          | 123 +++++
 11 files changed, 997 insertions(+), 168 deletions(-)
 create mode 100644 src/solver/batchnorm/forward_inference_ck.cpp
 create mode 100644 test/gtest/bn.hpp
 create mode 100644 test/gtest/bn_test_base.hpp
 create mode 100644 test/gtest/test_fusion_plan_base.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ea9a82d120..08290b928d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -149,6 +149,7 @@ set( MIOpen_Source
     solver/batchnorm/backward_spatial_multiple.cpp
     solver/batchnorm/backward_spatial_single.cpp
     solver/batchnorm/forward_inference.cpp
+    solver/batchnorm/forward_inference_ck.cpp
     solver/batchnorm/forward_inference_fused.cpp
     solver/batchnorm/forward_per_activation.cpp
     solver/batchnorm/forward_per_activation_fused.cpp
diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp
index b3af4e1938..224ec0c92d 100644
--- a/src/include/miopen/batchnorm/problem_description.hpp
+++ b/src/include/miopen/batchnorm/problem_description.hpp
@@ -84,6 +84,8 @@ struct ProblemDescription : ProblemDescriptionBase
           scaleBiasDesc(bnScaleBiasMeanVarDesc_),
           epsilon(epsilon_)
     {
+        in_layout  = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW");
+        out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW");
     }
 
     // Backward
diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 64f1e237bf..c7b91fb697 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -132,6 +132,19 @@ struct BnFwdInference final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
+struct CKBnFwdInference final : BatchnormSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<BnFwdTrainingSpatialSingle>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::batchnorm::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& problem) const override;
+};
+
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp
index f5621e72dc..c0b4ca3659 100644
--- a/src/include/miopen/fusion/solvers.hpp
+++ b/src/include/miopen/fusion/solvers.hpp
@@ -317,6 +317,60 @@ struct BnBwdTrgActivationFused final : FusionSolverBase
                              const FusionDescription& problem) const override;
 };
 
+struct PerformanceConfigCKBnFwdInference : PerfConfigBase<PerformanceConfigCKBnFwdInference>
+{
+    int index;
+    std::string kernel_id;
+    std::vector<std::string> valid_kernels;
+    PerformanceConfigCKBnFwdInference(int idx, std::string kernl_id)
+        : index(idx), kernel_id(kernl_id)
+    {
+    }
+    PerformanceConfigCKBnFwdInference() : PerformanceConfigCKBnFwdInference(0, "") {}
+    PerformanceConfigCKBnFwdInference(bool) : PerformanceConfigCKBnFwdInference(0, "") {}
+    void HeuristicInit(const FusionDescription& fdesc_problem);
+    bool SetNextValue(const FusionDescription& fdesc_problem);
+    bool IsValidValue() const;
+    bool IsValid(const FusionContext&, const FusionDescription& fdesc_problem) const;
+
+    template <typename Self, typename F>
+    static void Visit(Self&& s, F f)
+    {
+        f(s.kernel_id, "kernel_id");
+    }
+    bool operator==(const PerformanceConfigCKBnFwdInference& other) const;
+
+private:
+    template <typename DataType>
+    void Init(const miopen::batchnorm::ProblemDescription&);
+    template <typename DataType>
+    bool CheckIsSupportCKArgs(const miopen::batchnorm::ProblemDescription&) const;
+};
+
+struct CKBnFwdInference final : FusionTunableSolver<PerformanceConfigCKBnFwdInference>
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<CKBnFwdInference>(); }
+
+    PerformanceConfigCKBnFwdInference
+    GetDefaultPerformanceConfig(const FusionContext& ctx,
+                                const FusionDescription& fdesc_problem) const override;
+    bool IsValidPerformanceConfig(const FusionContext& ctx,
+                                  const FusionDescription& fdesc_problem,
+                                  const PerformanceConfigCKBnFwdInference& config) const override;
+    PerformanceConfigCKBnFwdInference Search(const FusionContext& ctx,
+                                             const FusionDescription& fdesc_problem,
+                                             const AnyInvokeParams& invoke_ctx) const override;
+    bool IsApplicable(const FusionContext& ctx,
+                      const FusionDescription& fdesc_problem) const override;
+    ConvSolution GetSolution(const FusionContext& ctx,
+                             const FusionDescription& fdesc_problem,
+                             const PerformanceConfigCKBnFwdInference& config) const override;
+
+private:
+    template <typename DataType>
+    bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription&) const;
+};
+
 } // namespace fusion
 } // namespace solver
 } // namespace miopen
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
new file mode 100644
index 0000000000..53765a3e85
--- /dev/null
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -0,0 +1,420 @@
+
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <vector>
+#include <cstdint>
+
+#include <miopen/check_numerics.hpp>
+#include <miopen/solver.hpp>
+#include <miopen/fusion/solvers.hpp>
+#include <miopen/generic_search.hpp>
+#include <miopen/batchnorm/invoke_params.hpp>
+#include <miopen/solver/problem_description_interpreter.hpp>
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+#include <ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp>
+#endif
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_INFER)
+
+namespace miopen {
+namespace solver {
+namespace fusion {
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using index_t     = int32_t;
+using Normalize   = ck::tensor_operation::element_wise::NormalizeInInfer;
+
+constexpr index_t Rank                  = 4;
+constexpr index_t NumBatchNormReduceDim = 3;
+
+template <typename DataType>
+using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+    ck::Tuple<DataType, DataType, DataType, DataType, DataType>,
+    ck::Tuple<DataType>,
+    Normalize,
+    Rank>;
+
+struct CKArgsBNormFwd
+{
+    CKArgsBNormFwd(const miopen::batchnorm::ProblemDescription& problem)
+    {
+        std::copy(problem.GetXDesc().GetLengths().begin(),
+                  problem.GetXDesc().GetLengths().end(),
+                  xyLengths.begin());
+
+        std::copy(problem.GetXDesc().GetStrides().begin(),
+                  problem.GetXDesc().GetStrides().end(),
+                  xyStrides.begin());
+
+        aligned_scaleBiasMeanVarStrides[0] = 0;
+        aligned_scaleBiasMeanVarStrides[1] = 1;
+        aligned_scaleBiasMeanVarStrides[2] = 0;
+        aligned_scaleBiasMeanVarStrides[3] = 0;
+    }
+
+    std::array<ck::index_t, Rank> xyLengths; // inOutLengths
+    std::array<ck::index_t, Rank> xyStrides; // inOutStrides
+    std::vector<int> invariantDims;
+
+    std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{3};
+
+    double epsilon = 0.0001;
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+};
+
+template <typename DataType>
+void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDescription& problem)
+{
+    const auto& args       = CKArgsBNormFwd{problem};
+    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp<DataType>>::GetInstances();
+    assert(!bn_fwd_ptrs.empty());
+    for(const auto& it : bn_fwd_ptrs)
+    {
+        auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
+                                                    {args.xyStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides},
+                                                    {args.xyStrides},
+                                                    {nullptr, nullptr, nullptr, nullptr, nullptr},
+                                                    {nullptr},
+                                                    Normalize{0.0});
+        if(it->IsSupportedArgument(argument_ptr.get()))
+        {
+            valid_kernels.push_back(it->GetTypeString());
+        }
+    }
+
+    assert(!valid_kernels.empty());
+    this->index     = 0;
+    this->kernel_id = valid_kernels[0];
+}
+
+template <typename DataType>
+bool PerformanceConfigCKBnFwdInference::CheckIsSupportCKArgs(
+    const miopen::batchnorm::ProblemDescription& problem) const
+{
+    const auto& args       = CKArgsBNormFwd{problem};
+    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp<DataType>>::GetInstances();
+
+    int i = 0;
+    for(; i < bn_fwd_ptrs.size(); i++)
+    {
+        if(bn_fwd_ptrs[i]->GetTypeString() == this->kernel_id)
+        {
+            break;
+        }
+    }
+    if(i == valid_kernels.size())
+    {
+        return false;
+    }
+    auto argument_ptr =
+        bn_fwd_ptrs[i]->MakeArgumentPointer(args.xyLengths,
+                                            {args.xyStrides,
+                                             args.aligned_scaleBiasMeanVarStrides,
+                                             args.aligned_scaleBiasMeanVarStrides,
+                                             args.aligned_scaleBiasMeanVarStrides,
+                                             args.aligned_scaleBiasMeanVarStrides},
+                                            {args.xyStrides},
+                                            {nullptr, nullptr, nullptr, nullptr, nullptr},
+                                            {nullptr},
+                                            Normalize{0.0});
+    return bn_fwd_ptrs[i]->IsSupportedArgument(argument_ptr.get());
+}
+
+template <typename DataType>
+bool CKBnFwdInference::CheckCKApplicability(
+    const miopen::batchnorm::ProblemDescription& problem) const
+{
+    const auto& args       = CKArgsBNormFwd{problem};
+    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp<DataType>>::GetInstances();
+    assert(!bn_fwd_ptrs.empty());
+
+    for(const auto& it : bn_fwd_ptrs)
+    {
+        auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
+                                                    {args.xyStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides},
+                                                    {args.xyStrides},
+                                                    {nullptr, nullptr, nullptr, nullptr, nullptr},
+                                                    {nullptr},
+                                                    Normalize{0.0});
+        if(it->IsSupportedArgument(argument_ptr.get()))
+            return true;
+    }
+    return false;
+}
+
+template <typename DataType>
+void RunCKSolution(const Handle& handle,
+                   const AnyInvokeParams& primitive_parameters,
+                   const miopen::batchnorm::ProblemDescription& problem,
+                   const PerformanceConfigCKBnFwdInference& config)
+{
+    const auto& args = CKArgsBNormFwd{problem};
+
+    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp<DataType>>::GetInstances();
+
+    int index = 0;
+    for(; index < bn_fwd_ptrs.size(); index++)
+    {
+        if(bn_fwd_ptrs[index]->GetTypeString() == config.kernel_id)
+        {
+            break;
+        }
+    }
+    assert(index < bn_fwd_ptrs.size());
+    auto& bn_ptr           = bn_fwd_ptrs.at(index);
+    const auto& invoke_ctx = primitive_parameters.CastTo<miopen::fusion::FusionInvokeParams>();
+    assert(invoke_ctx.op_args.params[0] != nullptr);
+    const auto& params = dynamic_cast<miopen::fusion::BatchNormInferenceOpInvokeParam&>(
+        *invoke_ctx.op_args.params[0]);
+
+    auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
+                                                    {args.xyStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides,
+                                                     args.aligned_scaleBiasMeanVarStrides},
+                                                    {args.xyStrides},
+                                                    {invoke_ctx.in,
+                                                     params.estimatedMean,
+                                                     params.estimatedVariance,
+                                                     params.bnScale,
+                                                     params.bnBias},
+                                                    {invoke_ctx.out},
+                                                    Normalize{params.epsilon});
+
+    auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
+    const auto enable_profiling = handle.IsProfilingEnabled();
+
+    float elapsed_time =
+        invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling});
+    if(enable_profiling)
+    {
+        handle.ResetKernelTime();
+        handle.AccumKernelTime(elapsed_time);
+    }
+}
+#endif
+
+void PerformanceConfigCKBnFwdInference::HeuristicInit(const FusionDescription& fdesc_problem)
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = fdesc_problem;
+#else
+    const auto& bn_problem =
+        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenHalf: Init<ck::half_t>(bn_problem); break;
+    case miopenInt8:
+    case miopenFloat: Init<float>(bn_problem); break;
+    case miopenInt32:
+    case miopenInt8x4:
+    case miopenBFloat16:
+    case miopenDouble:
+    default: MIOPEN_THROW("Unsupported datatype");
+    }
+
+#endif
+}
+
+bool PerformanceConfigCKBnFwdInference::SetNextValue(const FusionDescription& fdesc_problem)
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = fdesc_problem;
+    return false;
+#else
+    if(this->valid_kernels.empty())
+    {
+        this->HeuristicInit(fdesc_problem);
+        assert(!valid_kernels.empty());
+        return true;
+    }
+    if((this->index + 1) < valid_kernels.size())
+    {
+        ++this->index;
+        this->kernel_id = this->valid_kernels[index];
+        return true;
+    }
+    else
+        return false;
+#endif
+}
+
+bool PerformanceConfigCKBnFwdInference::IsValidValue() const
+{
+    return this->index >= 0 && this->index < valid_kernels.size();
+}
+
+bool PerformanceConfigCKBnFwdInference::IsValid(const FusionContext&,
+                                                const FusionDescription& fdesc_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = fdesc_problem;
+    return false;
+#else
+    // Extract convolution problem from the fusion context.
+    const auto& bn_problem =
+        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
+    switch(bn_problem.GetDXDesc().GetType())
+    {
+    case miopenHalf: return CheckIsSupportCKArgs<ck::half_t>(bn_problem);
+    case miopenInt8:
+    case miopenFloat: return CheckIsSupportCKArgs<float>(bn_problem);
+    case miopenInt32:
+    case miopenInt8x4:
+    case miopenBFloat16:
+    case miopenDouble:
+    default: MIOPEN_THROW("Unsupported datatype");
+    }
+    return false;
+#endif
+}
+
+bool PerformanceConfigCKBnFwdInference::operator==(
+    const PerformanceConfigCKBnFwdInference& other) const
+{
+    return this->kernel_id == other.kernel_id;
+}
+PerformanceConfigCKBnFwdInference
+CKBnFwdInference::GetDefaultPerformanceConfig(const FusionContext&,
+                                              const FusionDescription& fdesc_problem) const
+{
+    PerformanceConfigCKBnFwdInference pp;
+    pp.HeuristicInit(fdesc_problem);
+    MIOPEN_LOG_I(pp.ToString());
+    return pp;
+}
+
+bool CKBnFwdInference::IsValidPerformanceConfig(
+    const FusionContext& ctx,
+    const FusionDescription& fdesc_problem,
+    const PerformanceConfigCKBnFwdInference& config) const
+{
+    return config.IsValid(ctx, fdesc_problem);
+}
+
+PerformanceConfigCKBnFwdInference CKBnFwdInference::Search(const FusionContext& ctx,
+                                                           const FusionDescription& fdesc_problem,
+                                                           const AnyInvokeParams& invoke_ctx) const
+{
+    return GenericSearch(*this, ctx, fdesc_problem, invoke_ctx);
+}
+
+bool CKBnFwdInference::IsApplicable(const FusionContext& ctx,
+                                    const FusionDescription& fdesc_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = ctx;
+    std::ignore = fdesc_problem;
+    return false;
+#else
+    const auto& desc = *fdesc_problem.fusion_plan_desc;
+    if(desc.op_map.empty())
+        MIOPEN_THROW(miopenStatusInternalError, "desc.op_map.empty()");
+    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{}))
+        return false;
+    if(desc.op_map.size() != 1)
+        return false;
+    const auto& bn_op = dynamic_cast<BatchNormInferenceFusionOpDescriptor&>(*desc.op_map[0]);
+    if(bn_op.kind() != miopenFusionOpBatchNormInference)
+        return false;
+    const auto& bn_problem =
+        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
+    if(!bn_problem.IsLayoutNHWC())
+        return false;
+
+    const std::string arch = ctx.GetStream().GetDeviceName();
+    if(arch != "gfx908" && arch != "gfx90a")
+        return false;
+
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenHalf: return CheckCKApplicability<ck::half_t>(bn_problem);
+    case miopenInt8:
+    case miopenFloat: return CheckCKApplicability<float>(bn_problem);
+    case miopenInt32:
+    case miopenInt8x4:
+    case miopenBFloat16:
+    case miopenDouble:
+    default: MIOPEN_THROW("Unsupported datatype");
+    }
+    return false;
+#endif
+}
+
+ConvSolution CKBnFwdInference::GetSolution(const FusionContext&,
+                                           const FusionDescription& fdesc_problem,
+                                           const PerformanceConfigCKBnFwdInference& config) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = fdesc_problem;
+    std::ignore = config;
+    return {};
+#else
+    const auto& bn_problem =
+        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
+
+    ConvSolution result;
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        std::ignore = kernels;
+        return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
+            switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
+            {
+            case miopenHalf:
+                RunCKSolution<ck::half_t>(handle, primitive_parameters, bn_problem, config);
+                break;
+            case miopenInt8:
+            case miopenFloat:
+                RunCKSolution<float>(handle, primitive_parameters, bn_problem, config);
+                break;
+            case miopenInt32:
+            case miopenInt8x4:
+            case miopenBFloat16:
+            case miopenDouble:
+            default: MIOPEN_THROW("Unsupported datatype");
+            }
+        };
+    };
+    return result;
+#endif
+}
+
+} // namespace fusion
+} // namespace solver
+} // namespace miopen
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
new file mode 100644
index 0000000000..eaa374db48
--- /dev/null
+++ b/test/gtest/bn.hpp
@@ -0,0 +1,67 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include "bn_test_base.hpp"
+#include "test_fusion_plan_base.hpp"
+
+template <typename T>
+struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
+{
+protected:
+    void SetUp() override
+    {
+        test_skipped                       = false;
+        std::tie(bn_config, tensor_layout) = GetParam();
+        bn_infer_data.SetUpImpl(bn_config, tensor_layout);
+
+        test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data);
+        test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data);
+    }
+
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+        test::FusionPlan::ComputeRefBN(bn_infer_data);
+        auto&& handle = get_handle();
+        bn_infer_data.output.data =
+            handle.Read<T>(bn_infer_data.out_dev, bn_infer_data.output.data.size());
+        test::FusionPlan::ComputeRefBN(bn_infer_data);
+        test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out);
+    }
+    BNTestCase bn_config;
+
+    bool test_skipped = false;
+    miopen::FusionPlanDescriptor fusePlanDesc;
+    miopen::OperatorArgs params;
+
+    BNInferSolverTest<T, BNTestCase> bn_infer_data;
+
+    miopenTensorLayout_t tensor_layout;
+};
diff --git a/test/gtest/bn_test_base.hpp b/test/gtest/bn_test_base.hpp
new file mode 100644
index 0000000000..2ae1e0aa16
--- /dev/null
+++ b/test/gtest/bn_test_base.hpp
@@ -0,0 +1,219 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <random>
+
+#include <miopen/miopen.h>
+#include <miopen/solver_id.hpp>
+#include <serialize.hpp>
+#include <fusionHost.hpp>
+
+#include "tensor_util.hpp"
+#include "get_handle.hpp"
+
+struct BNTestCase
+{
+    size_t N;
+    size_t C;
+    size_t H;
+    size_t W;
+    miopenBatchNormMode_t mode;
+    miopen::batchnorm::Direction Direction;
+    bool save;
+    bool keepRunning;
+
+    friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc)
+    {
+        return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W
+                  << " mode: " << tc.mode << " Direction: " << static_cast<int>(tc.Direction)
+                  << " save: " << tc.save << " keepRunning: " << tc.keepRunning;
+    }
+    std::vector<size_t> GetInput() const { return {N, C, H, W}; }
+};
+
+std::vector<BNTestCase> Network1()
+{
+    // pyt_mlperf_resnet50v1.5
+    return {
+        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
+}
+
+template <typename T, typename TConfig>
+struct BNSolverTestBase
+{
+    void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
+    {
+        bn_config     = config;
+        tensor_layout = t_layout;
+        CreateTensors();
+        InitTensorsWithRandValue();
+        SetDirection();
+        SetBNMode();
+        WriteToGPU();
+    }
+    const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; }
+
+    tensor<T> input;
+    tensor<T> output;
+    tensor<T> ref_out;
+    miopen::Allocator::ManageDataPtr in_dev;
+    miopen::Allocator::ManageDataPtr out_dev;
+
+    miopen::FusionPlanDescriptor fusePlanDesc;
+    miopen::OperatorArgs params;
+    miopen::TensorDescriptor bn_desc;
+    miopenBatchNormMode_t bn_mode;
+    miopen::batchnorm::Direction direction;
+    miopenTensorLayout_t tensor_layout;
+    TConfig bn_config;
+
+private:
+    void CreateTensors()
+    {
+        input   = tensor<T>{miopen_type<T>{}, tensor_layout, bn_config.GetInput()};
+        output  = tensor<T>{miopen_type<T>{}, tensor_layout, bn_config.GetInput()};
+        ref_out = output;
+    }
+
+    void InitTensorsWithRandValue()
+    {
+        std::random_device rd{};
+        std::mt19937 gen{rd()};
+        std::uniform_int_distribution<> d{0, 100};
+        auto gen_value = [&](auto...) {
+            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+        };
+        input.generate(gen_value);
+    }
+
+    void SetDirection() { direction = bn_config.Direction; }
+    void SetBNMode() { bn_mode = bn_config.mode; }
+    void WriteToGPU()
+    {
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        out_dev       = handle.Write(output.data);
+    }
+};
+
+template <typename T, typename TConfig>
+struct BNInferSolverTest : public BNSolverTestBase<T, TConfig>
+{
+    void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
+    {
+        BNSolverTestBase<T, TConfig>::SetUpImpl(config, t_layout);
+        CreateTensors();
+        InitTensorsWithRandValue();
+        WriteToGPU();
+    }
+
+    tensor<T> scale;
+    tensor<T> shift;
+    tensor<T> estMean;
+    tensor<T> estVariance;
+    miopen::Allocator::ManageDataPtr scale_dev;
+    miopen::Allocator::ManageDataPtr shift_dev; // bias
+    miopen::Allocator::ManageDataPtr estMean_dev;
+    miopen::Allocator::ManageDataPtr estVariance_dev;
+    double epsilon          = 1.0e-5;
+    const float alpha       = static_cast<float>(1.0f);
+    const float beta        = static_cast<float>(0);
+    const float activ_alpha = static_cast<double>(0.5f);
+    const float activ_beta  = static_cast<double>(0.5f);
+    const float activ_gamma = static_cast<double>(0.5f);
+
+private:
+    void CreateTensors()
+    {
+        auto derivedBnDesc = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
+                                         BNSolverTestBase<T, TConfig>::input.desc,
+                                         BNSolverTestBase<T, TConfig>::bn_mode);
+        scale       = tensor<T>{miopen_type<T>{},
+                          BNSolverTestBase<T, TConfig>::tensor_layout,
+                          derivedBnDesc.GetLengths()};
+        shift       = tensor<T>{miopen_type<T>{},
+                          BNSolverTestBase<T, TConfig>::tensor_layout,
+                          derivedBnDesc.GetLengths()};
+        estMean     = tensor<T>{miopen_type<T>{},
+                            BNSolverTestBase<T, TConfig>::tensor_layout,
+                            derivedBnDesc.GetLengths()};
+        estVariance = tensor<T>{miopen_type<T>{},
+                                BNSolverTestBase<T, TConfig>::tensor_layout,
+                                derivedBnDesc.GetLengths()};
+    }
+
+    void InitTensorsWithRandValue()
+    {
+        std::random_device rd{};
+        std::mt19937 gen{rd()};
+        std::uniform_int_distribution<> d{0, 100};
+        auto gen_value = [&](auto...) {
+            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+        };
+        scale.generate(gen_value);
+        shift.generate(gen_value);
+        estMean.generate(gen_value);
+        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<T>(d(gen)) + 1); };
+        estVariance.generate(gen_var);
+    }
+    void WriteToGPU()
+    {
+        auto&& handle   = get_handle();
+        scale_dev       = handle.Write(scale.data);
+        shift_dev       = handle.Write(shift.data);
+        estMean_dev     = handle.Write(estMean.data);
+        estVariance_dev = handle.Write(estVariance.data);
+    }
+};
diff --git a/test/gtest/log_test_helper.cpp b/test/gtest/log_test_helper.cpp
index 1c3f521dc4..bcc5e07919 100644
--- a/test/gtest/log_test_helper.cpp
+++ b/test/gtest/log_test_helper.cpp
@@ -194,7 +194,7 @@ void TestLogFun(std::function<void(const miopenTensorDescriptor_t&,
     std::string str = capture_cerr.getString();
     // now do the assertions
     if(set_env)
-        ASSERT_TRUE(isSubStr(str, sub_str)) << "str     : " << str << "str_sub : " << sub_str;
+        ASSERT_TRUE(isSubStr(str, sub_str)) << "str     : " << str << "\nstr_sub : " << sub_str;
     else
-        ASSERT_FALSE(isSubStr(str, sub_str)) << "str     : " << str << "str_sub : " << sub_str;
+        ASSERT_FALSE(isSubStr(str, sub_str)) << "str     : " << str << "\nstr_sub : " << sub_str;
 }
diff --git a/test/gtest/na.hpp b/test/gtest/na.hpp
index 02bc98d0bc..1377a93038 100644
--- a/test/gtest/na.hpp
+++ b/test/gtest/na.hpp
@@ -25,186 +25,51 @@
  *******************************************************************************/
 #pragma once
 
-#include <random>
-
 #include <gtest/gtest.h>
-#include <miopen/miopen.h>
-#include <miopen/solver_id.hpp>
-#include <serialize.hpp>
-#include <fusionHost.hpp>
-
-#include "tensor_util.hpp"
-#include "get_handle.hpp"
-
-struct BNTestCase
-{
-    size_t N;
-    size_t C;
-    size_t H;
-    size_t W;
-    miopenBatchNormMode_t mode;
-    miopen::batchnorm::Direction Direction;
-    bool save;
-    bool keepRunning;
 
-    friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc)
-    {
-        return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W
-                  << " mode: " << tc.mode << " Direction: " << static_cast<int>(tc.Direction)
-                  << " save: " << tc.save << " keepRunning: " << tc.keepRunning;
-    }
-    std::vector<size_t> GetInput() { return {N, C, H, W}; }
-};
-
-std::vector<BNTestCase> Network1()
-{
-    // pyt_mlperf_resnet50v1.5
-    return {
-        {64, 128, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
-}
+#include "bn_test_base.hpp"
+#include "test_fusion_plan_base.hpp"
 
 template <typename T>
-struct BNActivInferTest
-    : public ::testing::TestWithParam<std::tuple<miopenActivationMode_t, BNTestCase>>
+struct BNActivInferTest : public ::testing::TestWithParam<
+                              std::tuple<miopenActivationMode_t, BNTestCase, miopenTensorLayout_t>>
 {
 protected:
     void SetUp() override
     {
-        test_skipped                    = false;
-        std::tie(activ_mode, bn_config) = GetParam();
-        bn_mode                         = bn_config.mode;
-        input                           = tensor<T>{bn_config.GetInput()};
-        auto derivedBnDesc              = miopen::TensorDescriptor{};
-        miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, bn_mode);
-        scale       = tensor<T>{derivedBnDesc.GetLengths()};
-        shift       = tensor<T>{derivedBnDesc.GetLengths()};
-        estMean     = tensor<T>{derivedBnDesc.GetLengths()};
-        estVariance = tensor<T>{derivedBnDesc.GetLengths()};
-        std::random_device rd{};
-        std::mt19937 gen{rd()};
-        std::uniform_int_distribution<> d{0, 100};
-        auto gen_value = [&](auto...) {
-            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
-        };
-        input.generate(gen_value);
-        scale.generate(gen_value);
-        shift.generate(gen_value);
-        estMean.generate(gen_value);
-        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<T>(d(gen)) + 1); };
-        estVariance.generate(gen_var);
-        activ_desc    = {activ_mode, activ_alpha, activ_beta, activ_gamma};
-        output        = tensor<T>{bn_config.GetInput()};
-        auto&& handle = get_handle();
-        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
-        in_dev          = handle.Write(input.data);
-        scale_dev       = handle.Write(scale.data);
-        shift_dev       = handle.Write(shift.data);
-        estMean_dev     = handle.Write(estMean.data);
-        estVariance_dev = handle.Write(estVariance.data);
-        out_dev         = handle.Write(output.data);
+        test_skipped                                   = false;
+        std::tie(activ_mode, bn_config, tensor_layout) = GetParam();
+        bn_infer_data.SetUpImpl(bn_config, tensor_layout);
 
-        // Setup the Fusionplan
-        fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, input.desc);
-        auto bnOp =
-            std::make_shared<miopen::BatchNormInferenceFusionOpDescriptor>(bn_mode, bn_desc);
-        EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess);
-        bnOp->SetArgs(params,
-                      &alpha,
-                      &beta,
-                      scale_dev.get(),
-                      shift_dev.get(),
-                      estMean_dev.get(),
-                      estVariance_dev.get(),
-                      epsilon);
-        auto activOp = std::make_shared<miopen::ActivFwdFusionOpDescriptor>(activ_desc.GetMode());
-        EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess);
-        activOp->SetArgs(params, &alpha, &beta, activ_alpha, activ_beta, activ_gamma);
+        test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data);
+        test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data);
+        test::FusionPlan::AddActiv(fusePlanDesc, params, bn_infer_data, activ_mode);
     }
 
     void TearDown() override
     {
         if(test_skipped)
             return;
-        ref_out = tensor<T>{bn_config.GetInput()};
-        if(bn_mode == miopenBNPerActivation)
-        {
-            batchNormPerActivHostInference(
-                input, ref_out, scale, shift, epsilon, estMean, estVariance);
-        }
-        else
-        {
-            batchNormSpatialHostInference(
-                input, ref_out, scale, shift, epsilon, estMean, estVariance);
-        }
-        activationHostInfer(
-            activ_mode, activ_gamma, activ_beta, activ_alpha, ref_out.data, ref_out.data);
+        test::FusionPlan::ComputeRefBN(bn_infer_data);
+        activationHostInfer(activ_mode,
+                            bn_infer_data.activ_gamma,
+                            bn_infer_data.activ_beta,
+                            bn_infer_data.activ_alpha,
+                            bn_infer_data.ref_out.data,
+                            bn_infer_data.ref_out.data);
         auto&& handle = get_handle();
-        output.data   = handle.Read<T>(out_dev, output.data.size());
-        EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
-        EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
-        EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
-            << "Non finite number found in the GPU data";
-        EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
-        const double tolerance = 80;
-        double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
-        auto error             = miopen::rms_range(ref_out, output);
-        EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
-            << "Non finite number found in the CPU data";
-        EXPECT_TRUE(error < threshold)
-            << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
+        bn_infer_data.output.data =
+            handle.Read<T>(bn_infer_data.out_dev, bn_infer_data.output.data.size());
+        test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out);
     }
     BNTestCase bn_config;
-    miopen::TensorDescriptor bn_desc;
-    miopen::ActivationDescriptor activ_desc;
-    miopenBatchNormMode_t bn_mode;
-    tensor<T> input;
-    tensor<T> output;
-    tensor<T> ref_out;
-    tensor<T> scale;
-    tensor<T> shift;
-    tensor<T> estMean;
-    tensor<T> estVariance;
-    miopen::Allocator::ManageDataPtr in_dev;
-    miopen::Allocator::ManageDataPtr out_dev;
-    miopen::Allocator::ManageDataPtr scale_dev;
-    miopen::Allocator::ManageDataPtr shift_dev;
-    miopen::Allocator::ManageDataPtr estMean_dev;
-    miopen::Allocator::ManageDataPtr estVariance_dev;
+
     bool test_skipped = false;
     miopenActivationMode_t activ_mode;
     miopen::FusionPlanDescriptor fusePlanDesc;
     miopen::OperatorArgs params;
-    const float alpha       = static_cast<float>(1.0f);
-    const float beta        = static_cast<float>(0);
-    const float activ_alpha = static_cast<double>(0.5f);
-    const float activ_beta  = static_cast<double>(0.5f);
-    const float activ_gamma = static_cast<double>(0.5f);
-    double epsilon          = 1.0e-5;
+
+    BNInferSolverTest<T, BNTestCase> bn_infer_data;
+
+    miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp
index 8716a03348..b1a536ded0 100644
--- a/test/gtest/na_infer.cpp
+++ b/test/gtest/na_infer.cpp
@@ -28,6 +28,7 @@
 #include <miopen/fusion/fusion_invoke_params.hpp>
 
 #include "na.hpp"
+#include "bn.hpp"
 
 struct BNActivInferFloat : BNActivInferTest<float>
 {
@@ -37,6 +38,14 @@ struct BNActivInferHalf : BNActivInferTest<half_float::half>
 {
 };
 
+struct BNInferFloat : BNInferTest<float>
+{
+};
+
+struct BNInferHalf : BNInferTest<half_float::half>
+{
+};
+
 template <typename Solver, typename TestCase>
 void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
                const miopen::fusion::FusionInvokeParams& plan_params,
@@ -61,22 +70,77 @@ void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
     (invoker)(handle, plan_params);
     handle.Finish();
 }
+
+template <typename Solver, typename TestCase>
+void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
+                      const std::unique_ptr<miopen::fusion::FusionInvokeParams>& plan_params,
+                      const TestCase& config,
+                      bool& test_skipped)
+{
+    auto& handle = get_handle();
+    Solver solv{};
+    const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc};
+    auto fusion_ctx           = miopen::FusionContext{handle};
+    fusion_ctx.DetectRocm();
+    if(!solv.IsApplicable(fusion_ctx, fusion_problem))
+    {
+        test_skipped = true;
+        GTEST_SKIP() << solv.SolverDbId() << " Not Applicable" << config;
+    }
+    ASSERT_TRUE(solv.IsApplicable(fusion_ctx, fusion_problem));
+    auto sol = solv.GetSolution(
+        fusion_ctx, fusion_problem, solv.GetDefaultPerformanceConfig(fusion_ctx, fusion_problem));
+    ASSERT_TRUE(sol.Succeeded());
+    ASSERT_TRUE(sol.invoker_factory);
+    const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
+    (invoker)(handle, *(plan_params.get()));
+    handle.Finish();
+}
+
 TEST_P(BNActivInferFloat, BnFwdInferActivationFused)
 {
-    const auto plan_params = miopen::fusion::FusionInvokeParams(
-        params, input.desc, in_dev.get(), output.desc, out_dev.get(), false);
+    const auto plan_params = miopen::fusion::FusionInvokeParams(params,
+                                                                bn_infer_data.input.desc,
+                                                                bn_infer_data.in_dev.get(),
+                                                                bn_infer_data.output.desc,
+                                                                bn_infer_data.out_dev.get(),
+                                                                false);
     RunSolver<miopen::solver::fusion::BnFwdInferActivationFused>(
         fusePlanDesc, plan_params, bn_config, test_skipped);
 }
 
+TEST_P(BNInferFloat, CKBnFwdInference)
+{
+    const auto plan_params =
+        std::make_unique<miopen::fusion::FusionInvokeParams>(params,
+                                                             bn_infer_data.input.desc,
+                                                             bn_infer_data.in_dev.get(),
+                                                             bn_infer_data.output.desc,
+                                                             bn_infer_data.out_dev.get(),
+                                                             false);
+    RunTunableSolver<miopen::solver::fusion::CKBnFwdInference>(
+        fusePlanDesc, plan_params, bn_config, test_skipped);
+}
+
+INSTANTIATE_TEST_SUITE_P(BNInferFloatSuite,
+                         BNInferFloat,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
 INSTANTIATE_TEST_SUITE_P(BNActivInferFloatSuite,
                          BNActivInferFloat,
                          testing::Combine(testing::Values(miopenActivationRELU),
-                                          testing::ValuesIn(Network1())));
+                                          testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNCHW)));
+
 TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused)
 {
-    const auto plan_params = miopen::fusion::FusionInvokeParams(
-        params, input.desc, in_dev.get(), output.desc, out_dev.get(), false);
+    const auto plan_params = miopen::fusion::FusionInvokeParams(params,
+                                                                bn_infer_data.input.desc,
+                                                                bn_infer_data.in_dev.get(),
+                                                                bn_infer_data.output.desc,
+                                                                bn_infer_data.out_dev.get(),
+                                                                false);
     RunSolver<miopen::solver::fusion::BnFwdInferActivationFused>(
         fusePlanDesc, plan_params, bn_config, test_skipped);
 }
@@ -84,4 +148,5 @@ TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused)
 INSTANTIATE_TEST_SUITE_P(BNActivInferHalfSuite,
                          BNActivInferHalf,
                          testing::Combine(testing::Values(miopenActivationRELU),
-                                          testing::ValuesIn(Network1())));
+                                          testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/test_fusion_plan_base.hpp b/test/gtest/test_fusion_plan_base.hpp
new file mode 100644
index 0000000000..e4b46ca1f6
--- /dev/null
+++ b/test/gtest/test_fusion_plan_base.hpp
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "bn_test_base.hpp"
+namespace test {
+namespace FusionPlan {
+template <typename DLModule>
+void InitFusionPlan(miopen::FusionPlanDescriptor& fusePlanDesc, DLModule& dl_module)
+{
+    fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, dl_module.GetInputDesc());
+}
+
+template <typename DLModule>
+void AddBnInfer(miopen::FusionPlanDescriptor& fusePlanDesc,
+                miopen::OperatorArgs& params,
+                DLModule& dl_module)
+{
+    auto bnOp = std::make_shared<miopen::BatchNormInferenceFusionOpDescriptor>(dl_module.bn_mode,
+                                                                               dl_module.bn_desc);
+    EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess);
+    bnOp->SetArgs(params,
+                  &dl_module.alpha,
+                  &dl_module.beta,
+                  dl_module.scale_dev.get(),
+                  dl_module.shift_dev.get(),
+                  dl_module.estMean_dev.get(),
+                  dl_module.estVariance_dev.get(),
+                  dl_module.epsilon);
+}
+
+template <typename DLModule>
+void AddActiv(miopen::FusionPlanDescriptor& fusePlanDesc,
+              miopen::OperatorArgs& params,
+              DLModule& dl_module,
+              miopenActivationMode_t activ_mode)
+{
+    auto activOp = std::make_shared<miopen::ActivFwdFusionOpDescriptor>(activ_mode);
+    EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess);
+    EXPECT_EQ(activOp->SetArgs(params,
+                               &dl_module.alpha,
+                               &dl_module.beta,
+                               dl_module.activ_alpha,
+                               dl_module.activ_beta,
+                               dl_module.activ_gamma),
+              miopenStatusSuccess);
+}
+
+bool Skip(miopen::Handle& handle)
+{
+    const std::string arch = handle.GetDeviceName();
+    bool skip_test         = (arch != "gfx908" && arch != "gfx90a");
+
+    return skip_test;
+}
+
+template <typename DLModule>
+void ComputeRefBN(DLModule& dl_module)
+{
+    if(dl_module.bn_mode == miopenBNPerActivation)
+    {
+        batchNormPerActivHostInference(dl_module.input,
+                                       dl_module.ref_out,
+                                       dl_module.scale,
+                                       dl_module.shift,
+                                       dl_module.epsilon,
+                                       dl_module.estMean,
+                                       dl_module.estVariance);
+    }
+    else
+    {
+        batchNormSpatialHostInference(dl_module.input,
+                                      dl_module.ref_out,
+                                      dl_module.scale,
+                                      dl_module.shift,
+                                      dl_module.epsilon,
+                                      dl_module.estMean,
+                                      dl_module.estVariance);
+    }
+}
+
+template <typename T>
+void BnCmpare(const tensor<T>& output, const tensor<T>& ref_out)
+{
+    EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
+    EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
+    EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
+        << "Non finite number found in the GPU data";
+    EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
+    const double tolerance = 80;
+    double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
+    auto error             = miopen::rms_range(ref_out, output);
+    EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
+        << "Non finite number found in the CPU data";
+    EXPECT_TRUE(error < threshold)
+        << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
+}
+
+} // namespace FusionPlan
+} // namespace test

From 766d06014dfd0eabe6a3e9732e2c03a3f4352703 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 24 Jul 2023 14:45:12 +0000
Subject: [PATCH 02/28] bg/LWPMIOPEN-194 : removed stale code

---
 src/include/miopen/batchnorm/solvers.hpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index c7b91fb697..64f1e237bf 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -132,19 +132,6 @@ struct BnFwdInference final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
-struct CKBnFwdInference final : BatchnormSolver
-{
-    const std::string& SolverDbId() const override
-    {
-        return GetSolverDbId<BnFwdTrainingSpatialSingle>();
-    }
-
-    bool IsApplicable(const ExecutionContext& context,
-                      const miopen::batchnorm::ProblemDescription& problem) const override;
-    ConvSolution GetSolution(const ExecutionContext& context,
-                             const miopen::batchnorm::ProblemDescription& problem) const override;
-};
-
 } // namespace batchnorm
 
 } // namespace solver

From 75d8d95a185cef1f15e8a32fdd0892ecad307c24 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 24 Jul 2023 14:53:24 +0000
Subject: [PATCH 03/28] bg/LWPMIOPEN-194 : minor mixes

---
 test/gtest/bn.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index eaa374db48..87f504bef2 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -48,7 +48,6 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
     {
         if(test_skipped)
             return;
-        test::FusionPlan::ComputeRefBN(bn_infer_data);
         auto&& handle = get_handle();
         bn_infer_data.output.data =
             handle.Read<T>(bn_infer_data.out_dev, bn_infer_data.output.data.size());

From 171fa3a7f7b963f352981dfd2dc8d4f7f4734570 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 29 Aug 2023 15:24:46 +0000
Subject: [PATCH 04/28] bg/LWPMIOPEN-194 : fix rotate lens to pass NHWC layout
 is CK

---
 src/solver/batchnorm/forward_inference_ck.cpp | 16 +++++++++-------
 test/gtest/na_infer.cpp                       |  1 -
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index 53765a3e85..c0f7909018 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -33,6 +33,7 @@
 #include <miopen/fusion/solvers.hpp>
 #include <miopen/generic_search.hpp>
 #include <miopen/batchnorm/invoke_params.hpp>
+#include <miopen/solver/ck_utility_common.hpp>
 #include <miopen/solver/problem_description_interpreter.hpp>
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
 #include <ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp>
@@ -69,15 +70,18 @@ struct CKArgsBNormFwd
         std::copy(problem.GetXDesc().GetStrides().begin(),
                   problem.GetXDesc().GetStrides().end(),
                   xyStrides.begin());
+        // prep for CK
+        std::sort(xyStrides.begin(), xyStrides.end(),  std::greater<>());
+        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
 
         aligned_scaleBiasMeanVarStrides[0] = 0;
-        aligned_scaleBiasMeanVarStrides[1] = 1;
+        aligned_scaleBiasMeanVarStrides[1] = 0;
         aligned_scaleBiasMeanVarStrides[2] = 0;
-        aligned_scaleBiasMeanVarStrides[3] = 0;
+        aligned_scaleBiasMeanVarStrides[3] = 1;
     }
 
-    std::array<ck::index_t, Rank> xyLengths; // inOutLengths
-    std::array<ck::index_t, Rank> xyStrides; // inOutStrides
+    std::array<ck::index_t, Rank> xyLengths; 
+    std::array<ck::index_t, Rank> xyStrides; 
     std::vector<int> invariantDims;
 
     std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{3};
@@ -358,9 +362,7 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx,
         fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
     if(!bn_problem.IsLayoutNHWC())
         return false;
-
-    const std::string arch = ctx.GetStream().GetDeviceName();
-    if(arch != "gfx908" && arch != "gfx90a")
+    if(!ck_utility::is_ck_supported_hardware(ctx.GetStream()))
         return false;
 
     switch(bn_problem.GetXDesc().GetType())
diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp
index 61c2c51021..2cc0f5f118 100644
--- a/test/gtest/na_infer.cpp
+++ b/test/gtest/na_infer.cpp
@@ -80,7 +80,6 @@ void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
     Solver solv{};
     const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc};
     auto fusion_ctx           = miopen::FusionContext{handle};
-    fusion_ctx.DetectRocm();
     if(!solv.IsApplicable(fusion_ctx, fusion_problem))
     {
         test_skipped = true;

From 619fe9ee445b0bf75010432ab725ce8451d37b64 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Tue, 29 Aug 2023 19:12:37 -0700
Subject: [PATCH 05/28] fix clang format issue

---
 src/solver/batchnorm/forward_inference_ck.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index c0f7909018..78f2a5cf7b 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -71,7 +71,7 @@ struct CKArgsBNormFwd
                   problem.GetXDesc().GetStrides().end(),
                   xyStrides.begin());
         // prep for CK
-        std::sort(xyStrides.begin(), xyStrides.end(),  std::greater<>());
+        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
         std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
 
         aligned_scaleBiasMeanVarStrides[0] = 0;
@@ -80,8 +80,8 @@ struct CKArgsBNormFwd
         aligned_scaleBiasMeanVarStrides[3] = 1;
     }
 
-    std::array<ck::index_t, Rank> xyLengths; 
-    std::array<ck::index_t, Rank> xyStrides; 
+    std::array<ck::index_t, Rank> xyLengths;
+    std::array<ck::index_t, Rank> xyStrides;
     std::vector<int> invariantDims;
 
     std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{3};

From 279664b20383cd785dbdd845fecfc7645b560cb4 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Sun, 3 Sep 2023 00:06:18 +0000
Subject: [PATCH 06/28] bg/LWPMIOPEN-194: inhert from non-tunable solver

---
 src/include/miopen/batchnorm/solvers.hpp      |  14 +
 src/include/miopen/fusion/solvers.hpp         |  54 ----
 src/ocl/batchnormocl.cpp                      |   3 +-
 src/solver.cpp                                |   1 +
 src/solver/batchnorm/forward_inference.cpp    |   6 +-
 src/solver/batchnorm/forward_inference_ck.cpp | 258 +++---------------
 test/gtest/bn.hpp                             |  40 ++-
 test/gtest/bn_infer.cpp                       |  42 +++
 .../{bn_test_base.hpp => bn_test_data.hpp}    |  23 +-
 test/gtest/na.hpp                             | 185 +++++++++++--
 test/gtest/na_infer.cpp                       |  76 +-----
 ...sion_plan_base.hpp => test_operations.hpp} |  75 +++--
 12 files changed, 330 insertions(+), 447 deletions(-)
 create mode 100644 test/gtest/bn_infer.cpp
 rename test/gtest/{bn_test_base.hpp => bn_test_data.hpp} (92%)
 rename test/gtest/{test_fusion_plan_base.hpp => test_operations.hpp} (88%)

diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 64f1e237bf..6cf494ce37 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -132,6 +132,20 @@ struct BnFwdInference final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
+// ----------- start BnCKFwdInference ---------------
+struct BnCKFwdInference final : BatchnormSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<BnCKFwdInference>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::batchnorm::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& problem) const override;
+};
+
+// ----------- end BnCKFwdInference ---------------
+
+
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp
index c0b4ca3659..f5621e72dc 100644
--- a/src/include/miopen/fusion/solvers.hpp
+++ b/src/include/miopen/fusion/solvers.hpp
@@ -317,60 +317,6 @@ struct BnBwdTrgActivationFused final : FusionSolverBase
                              const FusionDescription& problem) const override;
 };
 
-struct PerformanceConfigCKBnFwdInference : PerfConfigBase<PerformanceConfigCKBnFwdInference>
-{
-    int index;
-    std::string kernel_id;
-    std::vector<std::string> valid_kernels;
-    PerformanceConfigCKBnFwdInference(int idx, std::string kernl_id)
-        : index(idx), kernel_id(kernl_id)
-    {
-    }
-    PerformanceConfigCKBnFwdInference() : PerformanceConfigCKBnFwdInference(0, "") {}
-    PerformanceConfigCKBnFwdInference(bool) : PerformanceConfigCKBnFwdInference(0, "") {}
-    void HeuristicInit(const FusionDescription& fdesc_problem);
-    bool SetNextValue(const FusionDescription& fdesc_problem);
-    bool IsValidValue() const;
-    bool IsValid(const FusionContext&, const FusionDescription& fdesc_problem) const;
-
-    template <typename Self, typename F>
-    static void Visit(Self&& s, F f)
-    {
-        f(s.kernel_id, "kernel_id");
-    }
-    bool operator==(const PerformanceConfigCKBnFwdInference& other) const;
-
-private:
-    template <typename DataType>
-    void Init(const miopen::batchnorm::ProblemDescription&);
-    template <typename DataType>
-    bool CheckIsSupportCKArgs(const miopen::batchnorm::ProblemDescription&) const;
-};
-
-struct CKBnFwdInference final : FusionTunableSolver<PerformanceConfigCKBnFwdInference>
-{
-    const std::string& SolverDbId() const override { return GetSolverDbId<CKBnFwdInference>(); }
-
-    PerformanceConfigCKBnFwdInference
-    GetDefaultPerformanceConfig(const FusionContext& ctx,
-                                const FusionDescription& fdesc_problem) const override;
-    bool IsValidPerformanceConfig(const FusionContext& ctx,
-                                  const FusionDescription& fdesc_problem,
-                                  const PerformanceConfigCKBnFwdInference& config) const override;
-    PerformanceConfigCKBnFwdInference Search(const FusionContext& ctx,
-                                             const FusionDescription& fdesc_problem,
-                                             const AnyInvokeParams& invoke_ctx) const override;
-    bool IsApplicable(const FusionContext& ctx,
-                      const FusionDescription& fdesc_problem) const override;
-    ConvSolution GetSolution(const FusionContext& ctx,
-                             const FusionDescription& fdesc_problem,
-                             const PerformanceConfigCKBnFwdInference& config) const override;
-
-private:
-    template <typename DataType>
-    bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription&) const;
-};
-
 } // namespace fusion
 } // namespace solver
 } // namespace miopen
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 5c8aac5386..1025509da4 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -222,7 +222,8 @@ void BatchNormForwardInference(Handle& handle,
         }();
 
         const auto algo    = AlgorithmName{"miopenBatchNormalizationForwardInference"};
-        const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdInference>{};
+        const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdInference,
+        solver::batchnorm::BnCKFwdInference>{};
 
         solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
     }
diff --git a/src/solver.cpp b/src/solver.cpp
index 89423b495d..48020d11b7 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -561,6 +561,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId());
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemmGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
+    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 
diff --git a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp
index 505349bedd..3a26673301 100644
--- a/src/solver/batchnorm/forward_inference.cpp
+++ b/src/solver/batchnorm/forward_inference.cpp
@@ -41,7 +41,11 @@ namespace batchnorm {
 bool BnFwdInference::IsApplicable(const ExecutionContext&,
                                   const miopen::batchnorm::ProblemDescription& problem) const
 {
-    return problem.GetDirection() == miopen::batchnorm::Direction::ForwardInference;
+    if(problem.IsLayoutNHWC())
+        return false;
+    if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference)
+        return false;
+    return true;
 }
 
 ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context,
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index c0f7909018..1493f46d51 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -25,24 +25,19 @@
  *
  *******************************************************************************/
 
-#include <vector>
-#include <cstdint>
 
-#include <miopen/check_numerics.hpp>
-#include <miopen/solver.hpp>
-#include <miopen/fusion/solvers.hpp>
-#include <miopen/generic_search.hpp>
+#include <miopen/batchnorm/solvers.hpp>
 #include <miopen/batchnorm/invoke_params.hpp>
-#include <miopen/solver/ck_utility_common.hpp>
-#include <miopen/solver/problem_description_interpreter.hpp>
+#include <miopen/batch_norm.hpp>
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+#include <miopen/solver/ck_utility_common.hpp>
 #include <ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp>
 #endif
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_INFER)
 
 namespace miopen {
 namespace solver {
-namespace fusion {
+namespace batchnorm {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -91,12 +86,14 @@ struct CKArgsBNormFwd
 };
 
 template <typename DataType>
-void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDescription& problem)
+int CheckCKApplicability(
+    const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args       = CKArgsBNormFwd{problem};
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp<DataType>>::GetInstances();
     assert(!bn_fwd_ptrs.empty());
+    int count = 0;
     for(const auto& it : bn_fwd_ptrs)
     {
         auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
@@ -109,103 +106,28 @@ void PerformanceConfigCKBnFwdInference::Init(const miopen::batchnorm::ProblemDes
                                                     {nullptr, nullptr, nullptr, nullptr, nullptr},
                                                     {nullptr},
                                                     Normalize{0.0});
-        if(it->IsSupportedArgument(argument_ptr.get()))
-        {
-            valid_kernels.push_back(it->GetTypeString());
-        }
-    }
-
-    assert(!valid_kernels.empty());
-    this->index     = 0;
-    this->kernel_id = valid_kernels[0];
-}
-
-template <typename DataType>
-bool PerformanceConfigCKBnFwdInference::CheckIsSupportCKArgs(
-    const miopen::batchnorm::ProblemDescription& problem) const
-{
-    const auto& args       = CKArgsBNormFwd{problem};
-    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp<DataType>>::GetInstances();
-
-    int i = 0;
-    for(; i < bn_fwd_ptrs.size(); i++)
-    {
-        if(bn_fwd_ptrs[i]->GetTypeString() == this->kernel_id)
-        {
-            break;
+        if(it->IsSupportedArgument(argument_ptr.get())){
+            return count;
         }
+        count++;
     }
-    if(i == valid_kernels.size())
-    {
-        return false;
-    }
-    auto argument_ptr =
-        bn_fwd_ptrs[i]->MakeArgumentPointer(args.xyLengths,
-                                            {args.xyStrides,
-                                             args.aligned_scaleBiasMeanVarStrides,
-                                             args.aligned_scaleBiasMeanVarStrides,
-                                             args.aligned_scaleBiasMeanVarStrides,
-                                             args.aligned_scaleBiasMeanVarStrides},
-                                            {args.xyStrides},
-                                            {nullptr, nullptr, nullptr, nullptr, nullptr},
-                                            {nullptr},
-                                            Normalize{0.0});
-    return bn_fwd_ptrs[i]->IsSupportedArgument(argument_ptr.get());
-}
-
-template <typename DataType>
-bool CKBnFwdInference::CheckCKApplicability(
-    const miopen::batchnorm::ProblemDescription& problem) const
-{
-    const auto& args       = CKArgsBNormFwd{problem};
-    const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp<DataType>>::GetInstances();
-    assert(!bn_fwd_ptrs.empty());
-
-    for(const auto& it : bn_fwd_ptrs)
-    {
-        auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
-                                                    {args.xyStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides},
-                                                    {args.xyStrides},
-                                                    {nullptr, nullptr, nullptr, nullptr, nullptr},
-                                                    {nullptr},
-                                                    Normalize{0.0});
-        if(it->IsSupportedArgument(argument_ptr.get()))
-            return true;
-    }
-    return false;
+    return -1;
 }
 
 template <typename DataType>
-void RunCKSolution(const Handle& handle,
+static void RunCKSolution(const Handle& handle,
                    const AnyInvokeParams& primitive_parameters,
-                   const miopen::batchnorm::ProblemDescription& problem,
-                   const PerformanceConfigCKBnFwdInference& config)
+                   const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
 
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
         DeviceOp<DataType>>::GetInstances();
 
-    int index = 0;
-    for(; index < bn_fwd_ptrs.size(); index++)
-    {
-        if(bn_fwd_ptrs[index]->GetTypeString() == config.kernel_id)
-        {
-            break;
-        }
-    }
-    assert(index < bn_fwd_ptrs.size());
-    auto& bn_ptr           = bn_fwd_ptrs.at(index);
-    const auto& invoke_ctx = primitive_parameters.CastTo<miopen::fusion::FusionInvokeParams>();
-    assert(invoke_ctx.op_args.params[0] != nullptr);
-    const auto& params = dynamic_cast<miopen::fusion::BatchNormInferenceOpInvokeParam&>(
-        *invoke_ctx.op_args.params[0]);
+    int kernel_index = CheckCKApplicability<DataType>(problem);
+    assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size());
+    auto& bn_ptr           = bn_fwd_ptrs.at(kernel_index);
+    const auto& params = primitive_parameters.CastTo<miopen::batchnorm::InfInvokeParams>();
 
     auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
                                                     {args.xyStrides,
@@ -214,12 +136,12 @@ void RunCKSolution(const Handle& handle,
                                                      args.aligned_scaleBiasMeanVarStrides,
                                                      args.aligned_scaleBiasMeanVarStrides},
                                                     {args.xyStrides},
-                                                    {invoke_ctx.in,
+                                                    {params.x,
                                                      params.estimatedMean,
                                                      params.estimatedVariance,
                                                      params.bnScale,
                                                      params.bnBias},
-                                                    {invoke_ctx.out},
+                                                    {params.y},
                                                     Normalize{params.epsilon});
 
     auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
@@ -235,131 +157,16 @@ void RunCKSolution(const Handle& handle,
 }
 #endif
 
-void PerformanceConfigCKBnFwdInference::HeuristicInit(const FusionDescription& fdesc_problem)
-{
-#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = fdesc_problem;
-#else
-    const auto& bn_problem =
-        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
-    switch(bn_problem.GetXDesc().GetType())
-    {
-    case miopenHalf: Init<ck::half_t>(bn_problem); break;
-    case miopenInt8:
-    case miopenFloat: Init<float>(bn_problem); break;
-    case miopenInt32:
-    case miopenInt8x4:
-    case miopenBFloat16:
-    case miopenDouble:
-    default: MIOPEN_THROW("Unsupported datatype");
-    }
-
-#endif
-}
-
-bool PerformanceConfigCKBnFwdInference::SetNextValue(const FusionDescription& fdesc_problem)
-{
-#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = fdesc_problem;
-    return false;
-#else
-    if(this->valid_kernels.empty())
-    {
-        this->HeuristicInit(fdesc_problem);
-        assert(!valid_kernels.empty());
-        return true;
-    }
-    if((this->index + 1) < valid_kernels.size())
-    {
-        ++this->index;
-        this->kernel_id = this->valid_kernels[index];
-        return true;
-    }
-    else
-        return false;
-#endif
-}
-
-bool PerformanceConfigCKBnFwdInference::IsValidValue() const
-{
-    return this->index >= 0 && this->index < valid_kernels.size();
-}
-
-bool PerformanceConfigCKBnFwdInference::IsValid(const FusionContext&,
-                                                const FusionDescription& fdesc_problem) const
-{
-#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = fdesc_problem;
-    return false;
-#else
-    // Extract convolution problem from the fusion context.
-    const auto& bn_problem =
-        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
-    switch(bn_problem.GetDXDesc().GetType())
-    {
-    case miopenHalf: return CheckIsSupportCKArgs<ck::half_t>(bn_problem);
-    case miopenInt8:
-    case miopenFloat: return CheckIsSupportCKArgs<float>(bn_problem);
-    case miopenInt32:
-    case miopenInt8x4:
-    case miopenBFloat16:
-    case miopenDouble:
-    default: MIOPEN_THROW("Unsupported datatype");
-    }
-    return false;
-#endif
-}
-
-bool PerformanceConfigCKBnFwdInference::operator==(
-    const PerformanceConfigCKBnFwdInference& other) const
-{
-    return this->kernel_id == other.kernel_id;
-}
-PerformanceConfigCKBnFwdInference
-CKBnFwdInference::GetDefaultPerformanceConfig(const FusionContext&,
-                                              const FusionDescription& fdesc_problem) const
-{
-    PerformanceConfigCKBnFwdInference pp;
-    pp.HeuristicInit(fdesc_problem);
-    MIOPEN_LOG_I(pp.ToString());
-    return pp;
-}
-
-bool CKBnFwdInference::IsValidPerformanceConfig(
-    const FusionContext& ctx,
-    const FusionDescription& fdesc_problem,
-    const PerformanceConfigCKBnFwdInference& config) const
-{
-    return config.IsValid(ctx, fdesc_problem);
-}
-
-PerformanceConfigCKBnFwdInference CKBnFwdInference::Search(const FusionContext& ctx,
-                                                           const FusionDescription& fdesc_problem,
-                                                           const AnyInvokeParams& invoke_ctx) const
-{
-    return GenericSearch(*this, ctx, fdesc_problem, invoke_ctx);
-}
-
-bool CKBnFwdInference::IsApplicable(const FusionContext& ctx,
-                                    const FusionDescription& fdesc_problem) const
+bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
+                                    const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
     std::ignore = ctx;
     std::ignore = fdesc_problem;
     return false;
 #else
-    const auto& desc = *fdesc_problem.fusion_plan_desc;
-    if(desc.op_map.empty())
-        MIOPEN_THROW(miopenStatusInternalError, "desc.op_map.empty()");
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{}))
         return false;
-    if(desc.op_map.size() != 1)
-        return false;
-    const auto& bn_op = dynamic_cast<BatchNormInferenceFusionOpDescriptor&>(*desc.op_map[0]);
-    if(bn_op.kind() != miopenFusionOpBatchNormInference)
-        return false;
-    const auto& bn_problem =
-        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
     if(!bn_problem.IsLayoutNHWC())
         return false;
     if(!ck_utility::is_ck_supported_hardware(ctx.GetStream()))
@@ -367,9 +174,9 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx,
 
     switch(bn_problem.GetXDesc().GetType())
     {
-    case miopenHalf: return CheckCKApplicability<ck::half_t>(bn_problem);
+    case miopenHalf: return (CheckCKApplicability<ck::half_t>(bn_problem) != -1);
     case miopenInt8:
-    case miopenFloat: return CheckCKApplicability<float>(bn_problem);
+    case miopenFloat: return (CheckCKApplicability<float>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8x4:
     case miopenBFloat16:
@@ -380,30 +187,29 @@ bool CKBnFwdInference::IsApplicable(const FusionContext& ctx,
 #endif
 }
 
-ConvSolution CKBnFwdInference::GetSolution(const FusionContext&,
-                                           const FusionDescription& fdesc_problem,
-                                           const PerformanceConfigCKBnFwdInference& config) const
+ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
+                                           const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = fdesc_problem;
-    std::ignore = config;
+    std::ignore = context;
+    std::ignore = bn_problem;
     return {};
 #else
-    const auto& bn_problem =
-        fdesc_problem.GetBnProblem(0, miopen::batchnorm::Direction::ForwardInference);
+    std::ignore = context;
 
     ConvSolution result;
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
+            
             switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
             {
             case miopenHalf:
-                RunCKSolution<ck::half_t>(handle, primitive_parameters, bn_problem, config);
+                RunCKSolution<ck::half_t>(handle, primitive_parameters, bn_problem);
                 break;
             case miopenInt8:
             case miopenFloat:
-                RunCKSolution<float>(handle, primitive_parameters, bn_problem, config);
+                RunCKSolution<float>(handle, primitive_parameters, bn_problem);
                 break;
             case miopenInt32:
             case miopenInt8x4:
@@ -417,6 +223,6 @@ ConvSolution CKBnFwdInference::GetSolution(const FusionContext&,
 #endif
 }
 
-} // namespace fusion
+} // namespace batchnorm
 } // namespace solver
 } // namespace miopen
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 87f504bef2..fd9fcc8f85 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -25,10 +25,11 @@
  *******************************************************************************/
 #pragma once
 
+#include <miopen/miopen.h>
 #include <gtest/gtest.h>
 
-#include "bn_test_base.hpp"
-#include "test_fusion_plan_base.hpp"
+#include "bn_test_data.hpp"
+#include "test_operations.hpp"
 
 template <typename T>
 struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
@@ -38,10 +39,24 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
     {
         test_skipped                       = false;
         std::tie(bn_config, tensor_layout) = GetParam();
-        bn_infer_data.SetUpImpl(bn_config, tensor_layout);
+        bn_infer_test_data.SetUpImpl(bn_config, tensor_layout);
 
-        test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data);
-        test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data);
+        auto&& handle = get_handle();
+        miopenBatchNormalizationForwardInference(
+        &handle,
+        bn_config.mode,
+        &bn_infer_test_data.alpha,
+        &bn_infer_test_data.beta,
+        &bn_infer_test_data.input.desc,
+        bn_infer_test_data.in_dev.get(),
+        &bn_infer_test_data.output.desc,
+        bn_infer_test_data.out_dev.get(),
+        &bn_infer_test_data.scale.desc,
+        bn_infer_test_data.scale_dev.get(),
+        bn_infer_test_data.shift_dev.get(),
+        bn_infer_test_data.estMean_dev.get(),
+        bn_infer_test_data.estVariance_dev.get(),
+        bn_infer_test_data.epsilon);
     }
 
     void TearDown() override
@@ -49,18 +64,13 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         if(test_skipped)
             return;
         auto&& handle = get_handle();
-        bn_infer_data.output.data =
-            handle.Read<T>(bn_infer_data.out_dev, bn_infer_data.output.data.size());
-        test::FusionPlan::ComputeRefBN(bn_infer_data);
-        test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out);
+        bn_infer_test_data.output.data =
+            handle.Read<T>(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
+        test::ComputeCPUBNInference(bn_infer_test_data);
+        test::BnCmpare(bn_infer_test_data.output, bn_infer_test_data.ref_out);
     }
     BNTestCase bn_config;
-
     bool test_skipped = false;
-    miopen::FusionPlanDescriptor fusePlanDesc;
-    miopen::OperatorArgs params;
-
-    BNInferSolverTest<T, BNTestCase> bn_infer_data;
-
+    BNInferTestData<T, BNTestCase> bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
new file mode 100644
index 0000000000..6df3d547a5
--- /dev/null
+++ b/test/gtest/bn_infer.cpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "bn.hpp"
+
+struct BNInferTestFloat : BNInferTest<float>
+{
+};
+
+TEST_P(BNInferTestFloat, BnFwdInferCK)
+{
+    
+}
+
+
+INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
+                         BNInferTestFloat,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/bn_test_base.hpp b/test/gtest/bn_test_data.hpp
similarity index 92%
rename from test/gtest/bn_test_base.hpp
rename to test/gtest/bn_test_data.hpp
index 2ae1e0aa16..689cb1d058 100644
--- a/test/gtest/bn_test_base.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -29,6 +29,7 @@
 
 #include <miopen/miopen.h>
 #include <miopen/solver_id.hpp>
+#include <miopen/batchnorm/problem_description.hpp>
 #include <serialize.hpp>
 #include <fusionHost.hpp>
 
@@ -90,7 +91,7 @@ std::vector<BNTestCase> Network1()
 }
 
 template <typename T, typename TConfig>
-struct BNSolverTestBase
+struct BNTestData
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
     {
@@ -148,11 +149,11 @@ struct BNSolverTestBase
 };
 
 template <typename T, typename TConfig>
-struct BNInferSolverTest : public BNSolverTestBase<T, TConfig>
+struct BNInferTestData : public BNTestData<T, TConfig>
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
     {
-        BNSolverTestBase<T, TConfig>::SetUpImpl(config, t_layout);
+        BNTestData<T, TConfig>::SetUpImpl(config, t_layout);
         CreateTensors();
         InitTensorsWithRandValue();
         WriteToGPU();
@@ -167,8 +168,8 @@ struct BNInferSolverTest : public BNSolverTestBase<T, TConfig>
     miopen::Allocator::ManageDataPtr estMean_dev;
     miopen::Allocator::ManageDataPtr estVariance_dev;
     double epsilon          = 1.0e-5;
-    const float alpha       = static_cast<float>(1.0f);
-    const float beta        = static_cast<float>(0);
+    float alpha       = static_cast<float>(1.0f);
+    float beta        = static_cast<float>(0);
     const float activ_alpha = static_cast<double>(0.5f);
     const float activ_beta  = static_cast<double>(0.5f);
     const float activ_gamma = static_cast<double>(0.5f);
@@ -178,19 +179,19 @@ struct BNInferSolverTest : public BNSolverTestBase<T, TConfig>
     {
         auto derivedBnDesc = miopen::TensorDescriptor{};
         miopen::DeriveBNTensorDescriptor(derivedBnDesc,
-                                         BNSolverTestBase<T, TConfig>::input.desc,
-                                         BNSolverTestBase<T, TConfig>::bn_mode);
+                                         BNTestData<T, TConfig>::input.desc,
+                                         BNTestData<T, TConfig>::bn_mode);
         scale       = tensor<T>{miopen_type<T>{},
-                          BNSolverTestBase<T, TConfig>::tensor_layout,
+                          BNTestData<T, TConfig>::tensor_layout,
                           derivedBnDesc.GetLengths()};
         shift       = tensor<T>{miopen_type<T>{},
-                          BNSolverTestBase<T, TConfig>::tensor_layout,
+                          BNTestData<T, TConfig>::tensor_layout,
                           derivedBnDesc.GetLengths()};
         estMean     = tensor<T>{miopen_type<T>{},
-                            BNSolverTestBase<T, TConfig>::tensor_layout,
+                            BNTestData<T, TConfig>::tensor_layout,
                             derivedBnDesc.GetLengths()};
         estVariance = tensor<T>{miopen_type<T>{},
-                                BNSolverTestBase<T, TConfig>::tensor_layout,
+                                BNTestData<T, TConfig>::tensor_layout,
                                 derivedBnDesc.GetLengths()};
     }
 
diff --git a/test/gtest/na.hpp b/test/gtest/na.hpp
index 1377a93038..02bc98d0bc 100644
--- a/test/gtest/na.hpp
+++ b/test/gtest/na.hpp
@@ -25,51 +25,186 @@
  *******************************************************************************/
 #pragma once
 
+#include <random>
+
 #include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/solver_id.hpp>
+#include <serialize.hpp>
+#include <fusionHost.hpp>
+
+#include "tensor_util.hpp"
+#include "get_handle.hpp"
+
+struct BNTestCase
+{
+    size_t N;
+    size_t C;
+    size_t H;
+    size_t W;
+    miopenBatchNormMode_t mode;
+    miopen::batchnorm::Direction Direction;
+    bool save;
+    bool keepRunning;
 
-#include "bn_test_base.hpp"
-#include "test_fusion_plan_base.hpp"
+    friend std::ostream& operator<<(std::ostream& ss, const BNTestCase& tc)
+    {
+        return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W
+                  << " mode: " << tc.mode << " Direction: " << static_cast<int>(tc.Direction)
+                  << " save: " << tc.save << " keepRunning: " << tc.keepRunning;
+    }
+    std::vector<size_t> GetInput() { return {N, C, H, W}; }
+};
+
+std::vector<BNTestCase> Network1()
+{
+    // pyt_mlperf_resnet50v1.5
+    return {
+        {64, 128, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
+}
 
 template <typename T>
-struct BNActivInferTest : public ::testing::TestWithParam<
-                              std::tuple<miopenActivationMode_t, BNTestCase, miopenTensorLayout_t>>
+struct BNActivInferTest
+    : public ::testing::TestWithParam<std::tuple<miopenActivationMode_t, BNTestCase>>
 {
 protected:
     void SetUp() override
     {
-        test_skipped                                   = false;
-        std::tie(activ_mode, bn_config, tensor_layout) = GetParam();
-        bn_infer_data.SetUpImpl(bn_config, tensor_layout);
+        test_skipped                    = false;
+        std::tie(activ_mode, bn_config) = GetParam();
+        bn_mode                         = bn_config.mode;
+        input                           = tensor<T>{bn_config.GetInput()};
+        auto derivedBnDesc              = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, bn_mode);
+        scale       = tensor<T>{derivedBnDesc.GetLengths()};
+        shift       = tensor<T>{derivedBnDesc.GetLengths()};
+        estMean     = tensor<T>{derivedBnDesc.GetLengths()};
+        estVariance = tensor<T>{derivedBnDesc.GetLengths()};
+        std::random_device rd{};
+        std::mt19937 gen{rd()};
+        std::uniform_int_distribution<> d{0, 100};
+        auto gen_value = [&](auto...) {
+            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+        };
+        input.generate(gen_value);
+        scale.generate(gen_value);
+        shift.generate(gen_value);
+        estMean.generate(gen_value);
+        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<T>(d(gen)) + 1); };
+        estVariance.generate(gen_var);
+        activ_desc    = {activ_mode, activ_alpha, activ_beta, activ_gamma};
+        output        = tensor<T>{bn_config.GetInput()};
+        auto&& handle = get_handle();
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+        in_dev          = handle.Write(input.data);
+        scale_dev       = handle.Write(scale.data);
+        shift_dev       = handle.Write(shift.data);
+        estMean_dev     = handle.Write(estMean.data);
+        estVariance_dev = handle.Write(estVariance.data);
+        out_dev         = handle.Write(output.data);
 
-        test::FusionPlan::InitFusionPlan(fusePlanDesc, bn_infer_data);
-        test::FusionPlan::AddBnInfer(fusePlanDesc, params, bn_infer_data);
-        test::FusionPlan::AddActiv(fusePlanDesc, params, bn_infer_data, activ_mode);
+        // Setup the Fusionplan
+        fusePlanDesc = miopen::FusionPlanDescriptor(miopenVerticalFusion, input.desc);
+        auto bnOp =
+            std::make_shared<miopen::BatchNormInferenceFusionOpDescriptor>(bn_mode, bn_desc);
+        EXPECT_EQ(fusePlanDesc.AddOp(bnOp), miopenStatusSuccess);
+        bnOp->SetArgs(params,
+                      &alpha,
+                      &beta,
+                      scale_dev.get(),
+                      shift_dev.get(),
+                      estMean_dev.get(),
+                      estVariance_dev.get(),
+                      epsilon);
+        auto activOp = std::make_shared<miopen::ActivFwdFusionOpDescriptor>(activ_desc.GetMode());
+        EXPECT_EQ(fusePlanDesc.AddOp(activOp), miopenStatusSuccess);
+        activOp->SetArgs(params, &alpha, &beta, activ_alpha, activ_beta, activ_gamma);
     }
 
     void TearDown() override
     {
         if(test_skipped)
             return;
-        test::FusionPlan::ComputeRefBN(bn_infer_data);
-        activationHostInfer(activ_mode,
-                            bn_infer_data.activ_gamma,
-                            bn_infer_data.activ_beta,
-                            bn_infer_data.activ_alpha,
-                            bn_infer_data.ref_out.data,
-                            bn_infer_data.ref_out.data);
+        ref_out = tensor<T>{bn_config.GetInput()};
+        if(bn_mode == miopenBNPerActivation)
+        {
+            batchNormPerActivHostInference(
+                input, ref_out, scale, shift, epsilon, estMean, estVariance);
+        }
+        else
+        {
+            batchNormSpatialHostInference(
+                input, ref_out, scale, shift, epsilon, estMean, estVariance);
+        }
+        activationHostInfer(
+            activ_mode, activ_gamma, activ_beta, activ_alpha, ref_out.data, ref_out.data);
         auto&& handle = get_handle();
-        bn_infer_data.output.data =
-            handle.Read<T>(bn_infer_data.out_dev, bn_infer_data.output.data.size());
-        test::FusionPlan::BnCmpare(bn_infer_data.output, bn_infer_data.ref_out);
+        output.data   = handle.Read<T>(out_dev, output.data.size());
+        EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
+        EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
+        EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
+            << "Non finite number found in the GPU data";
+        EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
+        const double tolerance = 80;
+        double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
+        auto error             = miopen::rms_range(ref_out, output);
+        EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
+            << "Non finite number found in the CPU data";
+        EXPECT_TRUE(error < threshold)
+            << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
     }
     BNTestCase bn_config;
-
+    miopen::TensorDescriptor bn_desc;
+    miopen::ActivationDescriptor activ_desc;
+    miopenBatchNormMode_t bn_mode;
+    tensor<T> input;
+    tensor<T> output;
+    tensor<T> ref_out;
+    tensor<T> scale;
+    tensor<T> shift;
+    tensor<T> estMean;
+    tensor<T> estVariance;
+    miopen::Allocator::ManageDataPtr in_dev;
+    miopen::Allocator::ManageDataPtr out_dev;
+    miopen::Allocator::ManageDataPtr scale_dev;
+    miopen::Allocator::ManageDataPtr shift_dev;
+    miopen::Allocator::ManageDataPtr estMean_dev;
+    miopen::Allocator::ManageDataPtr estVariance_dev;
     bool test_skipped = false;
     miopenActivationMode_t activ_mode;
     miopen::FusionPlanDescriptor fusePlanDesc;
     miopen::OperatorArgs params;
-
-    BNInferSolverTest<T, BNTestCase> bn_infer_data;
-
-    miopenTensorLayout_t tensor_layout;
+    const float alpha       = static_cast<float>(1.0f);
+    const float beta        = static_cast<float>(0);
+    const float activ_alpha = static_cast<double>(0.5f);
+    const float activ_beta  = static_cast<double>(0.5f);
+    const float activ_gamma = static_cast<double>(0.5f);
+    double epsilon          = 1.0e-5;
 };
diff --git a/test/gtest/na_infer.cpp b/test/gtest/na_infer.cpp
index 2cc0f5f118..f0d3407b2b 100644
--- a/test/gtest/na_infer.cpp
+++ b/test/gtest/na_infer.cpp
@@ -28,7 +28,6 @@
 #include <miopen/fusion/fusion_invoke_params.hpp>
 
 #include "na.hpp"
-#include "bn.hpp"
 
 struct BNActivInferFloat : BNActivInferTest<float>
 {
@@ -38,14 +37,6 @@ struct BNActivInferHalf : BNActivInferTest<half_float::half>
 {
 };
 
-struct BNInferFloat : BNInferTest<float>
-{
-};
-
-struct BNInferHalf : BNInferTest<half_float::half>
-{
-};
-
 template <typename Solver, typename TestCase>
 void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
                const miopen::fusion::FusionInvokeParams& plan_params,
@@ -69,76 +60,22 @@ void RunSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
     (invoker)(handle, plan_params);
     handle.Finish();
 }
-
-template <typename Solver, typename TestCase>
-void RunTunableSolver(miopen::FusionPlanDescriptor& fusePlanDesc,
-                      const std::unique_ptr<miopen::fusion::FusionInvokeParams>& plan_params,
-                      const TestCase& config,
-                      bool& test_skipped)
-{
-    auto& handle = get_handle();
-    Solver solv{};
-    const auto fusion_problem = miopen::FusionDescription{&fusePlanDesc};
-    auto fusion_ctx           = miopen::FusionContext{handle};
-    if(!solv.IsApplicable(fusion_ctx, fusion_problem))
-    {
-        test_skipped = true;
-        GTEST_SKIP() << solv.SolverDbId() << " Not Applicable" << config;
-    }
-    ASSERT_TRUE(solv.IsApplicable(fusion_ctx, fusion_problem));
-    auto sol = solv.GetSolution(
-        fusion_ctx, fusion_problem, solv.GetDefaultPerformanceConfig(fusion_ctx, fusion_problem));
-    ASSERT_TRUE(sol.Succeeded());
-    ASSERT_TRUE(sol.invoker_factory);
-    const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
-    (invoker)(handle, *(plan_params.get()));
-    handle.Finish();
-}
-
 TEST_P(BNActivInferFloat, BnFwdInferActivationFused)
 {
-    const auto plan_params = miopen::fusion::FusionInvokeParams(params,
-                                                                bn_infer_data.input.desc,
-                                                                bn_infer_data.in_dev.get(),
-                                                                bn_infer_data.output.desc,
-                                                                bn_infer_data.out_dev.get(),
-                                                                false);
+    const auto plan_params = miopen::fusion::FusionInvokeParams(
+        params, input.desc, in_dev.get(), output.desc, out_dev.get(), false);
     RunSolver<miopen::solver::fusion::BnFwdInferActivationFused>(
         fusePlanDesc, plan_params, bn_config, test_skipped);
 }
 
-TEST_P(BNInferFloat, CKBnFwdInference)
-{
-    const auto plan_params =
-        std::make_unique<miopen::fusion::FusionInvokeParams>(params,
-                                                             bn_infer_data.input.desc,
-                                                             bn_infer_data.in_dev.get(),
-                                                             bn_infer_data.output.desc,
-                                                             bn_infer_data.out_dev.get(),
-                                                             false);
-    RunTunableSolver<miopen::solver::fusion::CKBnFwdInference>(
-        fusePlanDesc, plan_params, bn_config, test_skipped);
-}
-
-INSTANTIATE_TEST_SUITE_P(BNInferFloatSuite,
-                         BNInferFloat,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
-
 INSTANTIATE_TEST_SUITE_P(BNActivInferFloatSuite,
                          BNActivInferFloat,
                          testing::Combine(testing::Values(miopenActivationRELU),
-                                          testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNCHW)));
-
+                                          testing::ValuesIn(Network1())));
 TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused)
 {
-    const auto plan_params = miopen::fusion::FusionInvokeParams(params,
-                                                                bn_infer_data.input.desc,
-                                                                bn_infer_data.in_dev.get(),
-                                                                bn_infer_data.output.desc,
-                                                                bn_infer_data.out_dev.get(),
-                                                                false);
+    const auto plan_params = miopen::fusion::FusionInvokeParams(
+        params, input.desc, in_dev.get(), output.desc, out_dev.get(), false);
     RunSolver<miopen::solver::fusion::BnFwdInferActivationFused>(
         fusePlanDesc, plan_params, bn_config, test_skipped);
 }
@@ -146,5 +83,4 @@ TEST_P(BNActivInferHalf, DISABLED_BnFwdInferActivationFused)
 INSTANTIATE_TEST_SUITE_P(BNActivInferHalfSuite,
                          BNActivInferHalf,
                          testing::Combine(testing::Values(miopenActivationRELU),
-                                          testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+                                          testing::ValuesIn(Network1())));
diff --git a/test/gtest/test_fusion_plan_base.hpp b/test/gtest/test_operations.hpp
similarity index 88%
rename from test/gtest/test_fusion_plan_base.hpp
rename to test/gtest/test_operations.hpp
index e4b46ca1f6..398aaf406a 100644
--- a/test/gtest/test_fusion_plan_base.hpp
+++ b/test/gtest/test_operations.hpp
@@ -25,7 +25,37 @@
  *******************************************************************************/
 #pragma once
 
-#include "bn_test_base.hpp"
+namespace test{
+template <typename DLModule>
+void ComputeCPUBNInference(DLModule& dl_module)
+{
+    batchNormSpatialHostInference(dl_module.input,
+                                      dl_module.ref_out,
+                                      dl_module.scale,
+                                      dl_module.shift,
+                                      dl_module.epsilon,
+                                      dl_module.estMean,
+                                      dl_module.estVariance);
+}
+
+template <typename T>
+void BnCmpare(const tensor<T>& output, const tensor<T>& ref_out)
+{
+    EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
+    EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
+    EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
+        << "Non finite number found in the GPU data";
+    EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
+    const double tolerance = 80;
+    double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
+    auto error             = miopen::rms_range(ref_out, output);
+    EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
+        << "Non finite number found in the CPU data";
+    EXPECT_TRUE(error < threshold)
+        << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
+}
+}
+
 namespace test {
 namespace FusionPlan {
 template <typename DLModule>
@@ -76,48 +106,5 @@ bool Skip(miopen::Handle& handle)
 
     return skip_test;
 }
-
-template <typename DLModule>
-void ComputeRefBN(DLModule& dl_module)
-{
-    if(dl_module.bn_mode == miopenBNPerActivation)
-    {
-        batchNormPerActivHostInference(dl_module.input,
-                                       dl_module.ref_out,
-                                       dl_module.scale,
-                                       dl_module.shift,
-                                       dl_module.epsilon,
-                                       dl_module.estMean,
-                                       dl_module.estVariance);
-    }
-    else
-    {
-        batchNormSpatialHostInference(dl_module.input,
-                                      dl_module.ref_out,
-                                      dl_module.scale,
-                                      dl_module.shift,
-                                      dl_module.epsilon,
-                                      dl_module.estMean,
-                                      dl_module.estVariance);
-    }
-}
-
-template <typename T>
-void BnCmpare(const tensor<T>& output, const tensor<T>& ref_out)
-{
-    EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
-    EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
-    EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
-        << "Non finite number found in the GPU data";
-    EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
-    const double tolerance = 80;
-    double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
-    auto error             = miopen::rms_range(ref_out, output);
-    EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
-        << "Non finite number found in the CPU data";
-    EXPECT_TRUE(error < threshold)
-        << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
-}
-
 } // namespace FusionPlan
 } // namespace test

From 4002995786f35784a6c57410246122c5f9e5781a Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Sun, 3 Sep 2023 01:08:30 +0000
Subject: [PATCH 07/28]  bg/LWPMIOPEN-194 : add all data types supported by CK

---
 src/solver/batchnorm/forward_inference_ck.cpp | 63 ++++++++++++-------
 test/fusionHost.hpp                           | 16 ++---
 test/gtest/bn.hpp                             | 22 +++++--
 test/gtest/bn_infer.cpp                       | 35 ++++++++++-
 test/gtest/bn_test_data.hpp                   | 57 +++++++++--------
 test/gtest/test_operations.hpp                |  4 +-
 6 files changed, 131 insertions(+), 66 deletions(-)

diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index 1493f46d51..13ee6e2fd5 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -47,12 +47,9 @@ using Normalize   = ck::tensor_operation::element_wise::NormalizeInInfer;
 constexpr index_t Rank                  = 4;
 constexpr index_t NumBatchNormReduceDim = 3;
 
-template <typename DataType>
-using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
-    ck::Tuple<DataType, DataType, DataType, DataType, DataType>,
-    ck::Tuple<DataType>,
-    Normalize,
-    Rank>;
+using F16  = ck::half_t;
+using F32  = float;
+using F64  = double;
 
 struct CKArgsBNormFwd
 {
@@ -81,17 +78,26 @@ struct CKArgsBNormFwd
 
     std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{3};
 
-    double epsilon = 0.0001;
     std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
 };
 
-template <typename DataType>
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
 int CheckCKApplicability(
     const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args       = CKArgsBNormFwd{problem};
+    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
+        ck::Tuple<YDataType>,
+        Normalize,
+        Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp<DataType>>::GetInstances();
+            DeviceOp>::GetInstances();
     assert(!bn_fwd_ptrs.empty());
     int count = 0;
     for(const auto& it : bn_fwd_ptrs)
@@ -114,17 +120,28 @@ int CheckCKApplicability(
     return -1;
 }
 
-template <typename DataType>
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
 static void RunCKSolution(const Handle& handle,
                    const AnyInvokeParams& primitive_parameters,
                    const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
 
+    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+        ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
+        ck::Tuple<YDataType>,
+        Normalize,
+        Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp<DataType>>::GetInstances();
+            DeviceOp>::GetInstances();
 
-    int kernel_index = CheckCKApplicability<DataType>(problem);
+    int kernel_index = CheckCKApplicability<XDataType, YDataType, AccDataType, 
+                                        ScaleDataType, BiasDataType, MeanVarDataType>(problem);
     assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size());
     auto& bn_ptr           = bn_fwd_ptrs.at(kernel_index);
     const auto& params = primitive_parameters.CastTo<miopen::batchnorm::InfInvokeParams>();
@@ -174,13 +191,13 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
 
     switch(bn_problem.GetXDesc().GetType())
     {
-    case miopenHalf: return (CheckCKApplicability<ck::half_t>(bn_problem) != -1);
-    case miopenInt8:
-    case miopenFloat: return (CheckCKApplicability<float>(bn_problem) != -1);
+    case miopenHalf: return (CheckCKApplicability<F16, F16, F32, F16, F16, F32>(bn_problem) != -1);
+    case miopenFloat: return (CheckCKApplicability<F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
+    case miopenDouble: return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
+    case miopenBFloat16:
     case miopenInt32:
+    case miopenInt8:
     case miopenInt8x4:
-    case miopenBFloat16:
-    case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
     }
     return false;
@@ -205,16 +222,18 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
             switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
             {
             case miopenHalf:
-                RunCKSolution<ck::half_t>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F16, F16, F32, F16, F16, F32>(handle, primitive_parameters, bn_problem);
                 break;
-            case miopenInt8:
             case miopenFloat:
-                RunCKSolution<float>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F32, F32, F32, F32, F32, F32>(handle, primitive_parameters, bn_problem);
                 break;
+            case miopenDouble:
+                RunCKSolution<F64, F64, F64, F64, F64, F64>(handle, primitive_parameters, bn_problem);
+                break;
+            case miopenBFloat16:
+            case miopenInt8:
             case miopenInt32:
             case miopenInt8x4:
-            case miopenBFloat16:
-            case miopenDouble:
             default: MIOPEN_THROW("Unsupported datatype");
             }
         };
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index 5d2b08f1bb..be27234c08 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -135,22 +135,22 @@ void convHostForward(const tensor<T>& input,
     }
 }
 
-template <class T, class U>
+template <class T, class U, class V = U>
 void batchNormSpatialHostInference(const tensor<T>& input,
                                    tensor<T>& output,
                                    const tensor<U>& scale,
                                    const tensor<U>& bias,
                                    double epsilon,
-                                   const tensor<U>& estimatedMean,
-                                   const tensor<U>& estimatedVariance)
+                                   const tensor<V>& estimatedMean,
+                                   const tensor<V>& estimatedVariance)
 {
 
     int n_batches, channels, height, width;
     std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths());
     par_for(channels, 1, [&](int cidx) { // via channel
-        double mean      = estimatedMean(0, cidx, 0, 0);
-        double variance  = estimatedVariance(0, cidx, 0, 0);
-        double invertVar = 1.0 / sqrt(variance + epsilon);
+        V mean      = estimatedMean(0, cidx, 0, 0);
+        V variance  = estimatedVariance(0, cidx, 0, 0);
+        V invertVar = 1.0 / sqrt(variance + epsilon);
         // process the batch per channel
         for(int row = 0; row < height; row++)
         { // via rows
@@ -158,8 +158,8 @@ void batchNormSpatialHostInference(const tensor<T>& input,
             { // via columns
                 for(int bidx = 0; bidx < n_batches; bidx++)
                 { // via mini_batch
-                    double elemStd = static_cast<double>(input(bidx, cidx, row, column)) - mean;
-                    double inhat   = elemStd * invertVar;
+                    V elemStd = static_cast<V>(input(bidx, cidx, row, column)) - mean;
+                    V inhat   = elemStd * invertVar;
                     output(bidx, cidx, row, column) =
                         static_cast<T>(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0));
                     // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0));
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index fd9fcc8f85..86adf49f7b 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -31,7 +31,8 @@
 #include "bn_test_data.hpp"
 #include "test_operations.hpp"
 
-template <typename T>
+template <typename XDataType, typename YDataType, typename ScaleDataType, 
+          typename BiasDataType, typename MeanVarDataType>
 struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
 {
 protected:
@@ -57,6 +58,8 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         bn_infer_test_data.estMean_dev.get(),
         bn_infer_test_data.estVariance_dev.get(),
         bn_infer_test_data.epsilon);
+
+        std::fill(bn_infer_test_data.output.begin(), bn_infer_test_data.output.end(), std::numeric_limits<YDataType>::quiet_NaN());
     }
 
     void TearDown() override
@@ -65,12 +68,23 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
             return;
         auto&& handle = get_handle();
         bn_infer_test_data.output.data =
-            handle.Read<T>(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
+            handle.Read<YDataType>(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
         test::ComputeCPUBNInference(bn_infer_test_data);
-        test::BnCmpare(bn_infer_test_data.output, bn_infer_test_data.ref_out);
+        
+        if constexpr(std::is_same_v<YDataType, double>)
+        {
+            // tolerance for CK solver
+            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
+        }
+        else{
+            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out);
+        }
+
     }
+
     BNTestCase bn_config;
     bool test_skipped = false;
-    BNInferTestData<T, BNTestCase> bn_infer_test_data;
+    BNInferTestData<XDataType, YDataType, ScaleDataType, 
+          BiasDataType, MeanVarDataType, BNTestCase> bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 6df3d547a5..8785724884 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -26,17 +26,48 @@
 
 #include "bn.hpp"
 
-struct BNInferTestFloat : BNInferTest<float>
+struct BNInferTestHalf : BNInferTest<half_float::half, half_float::half, half_float::half, 
+    half_float::half, float>
 {
 };
 
-TEST_P(BNInferTestFloat, BnFwdInferCK)
+struct BNInferTestFloat : BNInferTest<float, float, float, 
+    float, float>
+{
+};
+
+struct BNInferTestDouble : BNInferTest<double, double, double, 
+    double, double>
+{
+};
+
+
+TEST_P(BNInferTestHalf, BnFwdInferCKHalf)
+{
+    
+}
+
+TEST_P(BNInferTestFloat, BnFwdInferCKFloat)
+{
+    
+}
+
+TEST_P(BNInferTestDouble, BnFwdInferCKDouble)
 {
     
 }
 
+INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
+                         BNInferTestHalf,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
 
 INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
                          BNInferTestFloat,
                          testing::Combine(testing::ValuesIn(Network1()),
                                           testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNInferTestDoubleNHWCSuite,
+                         BNInferTestDouble,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 689cb1d058..9ddb30bfeb 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -60,6 +60,7 @@ std::vector<BNTestCase> Network1()
 {
     // pyt_mlperf_resnet50v1.5
     return {
+        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
         {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
@@ -90,7 +91,7 @@ std::vector<BNTestCase> Network1()
         {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
 }
 
-template <typename T, typename TConfig>
+template <typename XDataType, typename YDataType, typename TConfig>
 struct BNTestData
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
@@ -105,9 +106,9 @@ struct BNTestData
     }
     const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; }
 
-    tensor<T> input;
-    tensor<T> output;
-    tensor<T> ref_out;
+    tensor<XDataType> input;
+    tensor<YDataType> output;
+    tensor<YDataType> ref_out;
     miopen::Allocator::ManageDataPtr in_dev;
     miopen::Allocator::ManageDataPtr out_dev;
 
@@ -122,8 +123,8 @@ struct BNTestData
 private:
     void CreateTensors()
     {
-        input   = tensor<T>{miopen_type<T>{}, tensor_layout, bn_config.GetInput()};
-        output  = tensor<T>{miopen_type<T>{}, tensor_layout, bn_config.GetInput()};
+        input   = tensor<XDataType>{miopen_type<XDataType>{}, tensor_layout, bn_config.GetInput()};
+        output  = tensor<YDataType>{miopen_type<YDataType>{}, tensor_layout, bn_config.GetInput()};
         ref_out = output;
     }
 
@@ -133,7 +134,7 @@ struct BNTestData
         std::mt19937 gen{rd()};
         std::uniform_int_distribution<> d{0, 100};
         auto gen_value = [&](auto...) {
-            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+            return 1e-2 * static_cast<XDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
         };
         input.generate(gen_value);
     }
@@ -148,21 +149,22 @@ struct BNTestData
     }
 };
 
-template <typename T, typename TConfig>
-struct BNInferTestData : public BNTestData<T, TConfig>
+template <typename XDataType, typename YDataType, typename ScaleDataType, 
+          typename BiasDataType, typename MeanVarDataType, typename TConfig>
+struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
     {
-        BNTestData<T, TConfig>::SetUpImpl(config, t_layout);
+        BNTestData<XDataType, YDataType, TConfig>::SetUpImpl(config, t_layout);
         CreateTensors();
         InitTensorsWithRandValue();
         WriteToGPU();
     }
 
-    tensor<T> scale;
-    tensor<T> shift;
-    tensor<T> estMean;
-    tensor<T> estVariance;
+    tensor<ScaleDataType> scale;
+    tensor<BiasDataType> shift;
+    tensor<MeanVarDataType> estMean;
+    tensor<MeanVarDataType> estVariance;
     miopen::Allocator::ManageDataPtr scale_dev;
     miopen::Allocator::ManageDataPtr shift_dev; // bias
     miopen::Allocator::ManageDataPtr estMean_dev;
@@ -179,19 +181,19 @@ struct BNInferTestData : public BNTestData<T, TConfig>
     {
         auto derivedBnDesc = miopen::TensorDescriptor{};
         miopen::DeriveBNTensorDescriptor(derivedBnDesc,
-                                         BNTestData<T, TConfig>::input.desc,
-                                         BNTestData<T, TConfig>::bn_mode);
-        scale       = tensor<T>{miopen_type<T>{},
-                          BNTestData<T, TConfig>::tensor_layout,
+                                         BNTestData<XDataType, YDataType, TConfig>::input.desc,
+                                         BNTestData<XDataType, YDataType, TConfig>::bn_mode);
+        scale       = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
                           derivedBnDesc.GetLengths()};
-        shift       = tensor<T>{miopen_type<T>{},
-                          BNTestData<T, TConfig>::tensor_layout,
+        shift       = tensor<BiasDataType>{miopen_type<BiasDataType>{},
+                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
                           derivedBnDesc.GetLengths()};
-        estMean     = tensor<T>{miopen_type<T>{},
-                            BNTestData<T, TConfig>::tensor_layout,
+        estMean     = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                            BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
                             derivedBnDesc.GetLengths()};
-        estVariance = tensor<T>{miopen_type<T>{},
-                                BNTestData<T, TConfig>::tensor_layout,
+        estVariance = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
                                 derivedBnDesc.GetLengths()};
     }
 
@@ -201,12 +203,13 @@ struct BNInferTestData : public BNTestData<T, TConfig>
         std::mt19937 gen{rd()};
         std::uniform_int_distribution<> d{0, 100};
         auto gen_value = [&](auto...) {
-            return 1e-2 * static_cast<T>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+            return 1e-2 * static_cast<ScaleDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
         };
         scale.generate(gen_value);
         shift.generate(gen_value);
-        estMean.generate(gen_value);
-        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<T>(d(gen)) + 1); };
+        
+        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
+        estMean.generate(gen_var);
         estVariance.generate(gen_var);
     }
     void WriteToGPU()
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 398aaf406a..3ac2d40d5a 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -39,15 +39,13 @@ void ComputeCPUBNInference(DLModule& dl_module)
 }
 
 template <typename T>
-void BnCmpare(const tensor<T>& output, const tensor<T>& ref_out)
+void CompareTensor(const tensor<T>& output, const tensor<T>& ref_out, const T threshold = std::numeric_limits<T>::epsilon())
 {
     EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
     EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
     EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
         << "Non finite number found in the GPU data";
     EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
-    const double tolerance = 80;
-    double threshold       = std::numeric_limits<T>::epsilon() * tolerance;
     auto error             = miopen::rms_range(ref_out, output);
     EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
         << "Non finite number found in the CPU data";

From bd7f9f6e2690e438882296d71a77178deedbeb94 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 4 Sep 2023 17:28:19 +0000
Subject: [PATCH 08/28] bg/LWPMIOPEN-194 : minor fixes

---
 test/gtest/bn.hpp           |  2 +-
 test/gtest/bn_infer.cpp     | 16 ----------------
 test/gtest/bn_test_data.hpp |  2 +-
 3 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 86adf49f7b..e5ff25517b 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -73,7 +73,7 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         
         if constexpr(std::is_same_v<YDataType, double>)
         {
-            // tolerance for CK solver
+            // tolerance for CK solver tolerance for
             test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
         }
         else{
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 8785724884..8d49d3c99e 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -36,12 +36,6 @@ struct BNInferTestFloat : BNInferTest<float, float, float,
 {
 };
 
-struct BNInferTestDouble : BNInferTest<double, double, double, 
-    double, double>
-{
-};
-
-
 TEST_P(BNInferTestHalf, BnFwdInferCKHalf)
 {
     
@@ -52,11 +46,6 @@ TEST_P(BNInferTestFloat, BnFwdInferCKFloat)
     
 }
 
-TEST_P(BNInferTestDouble, BnFwdInferCKDouble)
-{
-    
-}
-
 INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
                          BNInferTestHalf,
                          testing::Combine(testing::ValuesIn(Network1()),
@@ -66,8 +55,3 @@ INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
                          BNInferTestFloat,
                          testing::Combine(testing::ValuesIn(Network1()),
                                           testing::Values(miopenTensorNHWC)));
-
-INSTANTIATE_TEST_SUITE_P(BNInferTestDoubleNHWCSuite,
-                         BNInferTestDouble,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 9ddb30bfeb..4692db5443 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -207,9 +207,9 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         };
         scale.generate(gen_value);
         shift.generate(gen_value);
+        estMean.generate(gen_value);
         
         auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
-        estMean.generate(gen_var);
         estVariance.generate(gen_var);
     }
     void WriteToGPU()

From 291d7522c87559b0896ee27c626f9520f0dac627 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Tue, 5 Sep 2023 13:58:43 +0000
Subject: [PATCH 09/28] bg/LWPMIOPEN-194: fix clang format

---
 src/include/miopen/batchnorm/solvers.hpp      |  1 -
 src/ocl/batchnormocl.cpp                      |  2 +-
 src/solver/batchnorm/forward_inference_ck.cpp | 80 ++++++++++---------
 test/gtest/bn.hpp                             | 59 +++++++-------
 test/gtest/bn_infer.cpp                       | 17 ++--
 test/gtest/bn_test_data.hpp                   | 39 +++++----
 test/gtest/test_operations.hpp                | 22 ++---
 7 files changed, 114 insertions(+), 106 deletions(-)

diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 6cf494ce37..4a2275294f 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -145,7 +145,6 @@ struct BnCKFwdInference final : BatchnormSolver
 
 // ----------- end BnCKFwdInference ---------------
 
-
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 1025509da4..6c8a079a2a 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -223,7 +223,7 @@ void BatchNormForwardInference(Handle& handle,
 
         const auto algo    = AlgorithmName{"miopenBatchNormalizationForwardInference"};
         const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdInference,
-        solver::batchnorm::BnCKFwdInference>{};
+                                                     solver::batchnorm::BnCKFwdInference>{};
 
         solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
     }
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index e914b2bbf3..88968c2572 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -25,7 +25,6 @@
  *
  *******************************************************************************/
 
-
 #include <miopen/batchnorm/solvers.hpp>
 #include <miopen/batchnorm/invoke_params.hpp>
 #include <miopen/batch_norm.hpp>
@@ -47,9 +46,9 @@ using Normalize   = ck::tensor_operation::element_wise::NormalizeInInfer;
 constexpr index_t Rank                  = 4;
 constexpr index_t NumBatchNormReduceDim = 3;
 
-using F16  = ck::half_t;
-using F32  = float;
-using F64  = double;
+using F16 = ck::half_t;
+using F32 = float;
+using F64 = double;
 
 struct CKArgsBNormFwd
 {
@@ -87,17 +86,16 @@ template <typename XDataType,
           typename ScaleDataType,
           typename BiasDataType,
           typename MeanVarDataType>
-int CheckCKApplicability(
-    const miopen::batchnorm::ProblemDescription& problem)
+int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
-    const auto& args       = CKArgsBNormFwd{problem};
-    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+    const auto& args = CKArgsBNormFwd{problem};
+    using DeviceOp   = ck::tensor_operation::device::DeviceElementwise<
         ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
         ck::Tuple<YDataType>,
         Normalize,
         Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
+        DeviceOp>::GetInstances();
     assert(!bn_fwd_ptrs.empty());
     int count = 0;
     for(const auto& it : bn_fwd_ptrs)
@@ -112,7 +110,8 @@ int CheckCKApplicability(
                                                     {nullptr, nullptr, nullptr, nullptr, nullptr},
                                                     {nullptr},
                                                     Normalize{0.0});
-        if(it->IsSupportedArgument(argument_ptr.get())){
+        if(it->IsSupportedArgument(argument_ptr.get()))
+        {
             return count;
         }
         count++;
@@ -127,8 +126,8 @@ template <typename XDataType,
           typename BiasDataType,
           typename MeanVarDataType>
 static void RunCKSolution(const Handle& handle,
-                   const AnyInvokeParams& primitive_parameters,
-                   const miopen::batchnorm::ProblemDescription& problem)
+                          const AnyInvokeParams& primitive_parameters,
+                          const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
 
@@ -138,28 +137,29 @@ static void RunCKSolution(const Handle& handle,
         Normalize,
         Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
-
-    int kernel_index = CheckCKApplicability<XDataType, YDataType, AccDataType, 
-                                        ScaleDataType, BiasDataType, MeanVarDataType>(problem);
+        DeviceOp>::GetInstances();
+
+    int kernel_index = CheckCKApplicability<XDataType,
+                                            YDataType,
+                                            AccDataType,
+                                            ScaleDataType,
+                                            BiasDataType,
+                                            MeanVarDataType>(problem);
     assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size());
-    auto& bn_ptr           = bn_fwd_ptrs.at(kernel_index);
+    auto& bn_ptr       = bn_fwd_ptrs.at(kernel_index);
     const auto& params = primitive_parameters.CastTo<miopen::batchnorm::InfInvokeParams>();
 
-    auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
-                                                    {args.xyStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides},
-                                                    {args.xyStrides},
-                                                    {params.x,
-                                                     params.estimatedMean,
-                                                     params.estimatedVariance,
-                                                     params.bnScale,
-                                                     params.bnBias},
-                                                    {params.y},
-                                                    Normalize{params.epsilon});
+    auto argument_ptr = bn_ptr->MakeArgumentPointer(
+        args.xyLengths,
+        {args.xyStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides},
+        {args.xyStrides},
+        {params.x, params.estimatedMean, params.estimatedVariance, params.bnScale, params.bnBias},
+        {params.y},
+        Normalize{params.epsilon});
 
     auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
     const auto enable_profiling = handle.IsProfilingEnabled();
@@ -193,7 +193,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
     {
     case miopenHalf: return (CheckCKApplicability<F16, F16, F32, F16, F16, F32>(bn_problem) != -1);
     case miopenFloat: return (CheckCKApplicability<F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
-    case miopenDouble: return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
+    case miopenDouble:
+        return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
     case miopenBFloat16:
     case miopenInt32:
     case miopenInt8:
@@ -204,8 +205,9 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
 #endif
 }
 
-ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
-                                           const miopen::batchnorm::ProblemDescription& bn_problem) const
+ConvSolution
+BnCKFwdInference::GetSolution(const ExecutionContext& context,
+                              const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
     std::ignore = context;
@@ -218,17 +220,19 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-            
             switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
             {
             case miopenHalf:
-                RunCKSolution<F16, F16, F32, F16, F16, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F16, F16, F32, F16, F16, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenFloat:
-                RunCKSolution<F32, F32, F32, F32, F32, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F32, F32, F32, F32, F32, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenDouble:
-                RunCKSolution<F64, F64, F64, F64, F64, F64>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F64, F64, F64, F64, F64, F64>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenBFloat16:
             case miopenInt8:
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index e5ff25517b..637c007e5d 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -31,8 +31,11 @@
 #include "bn_test_data.hpp"
 #include "test_operations.hpp"
 
-template <typename XDataType, typename YDataType, typename ScaleDataType, 
-          typename BiasDataType, typename MeanVarDataType>
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
 struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
 {
 protected:
@@ -43,48 +46,50 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         bn_infer_test_data.SetUpImpl(bn_config, tensor_layout);
 
         auto&& handle = get_handle();
-        miopenBatchNormalizationForwardInference(
-        &handle,
-        bn_config.mode,
-        &bn_infer_test_data.alpha,
-        &bn_infer_test_data.beta,
-        &bn_infer_test_data.input.desc,
-        bn_infer_test_data.in_dev.get(),
-        &bn_infer_test_data.output.desc,
-        bn_infer_test_data.out_dev.get(),
-        &bn_infer_test_data.scale.desc,
-        bn_infer_test_data.scale_dev.get(),
-        bn_infer_test_data.shift_dev.get(),
-        bn_infer_test_data.estMean_dev.get(),
-        bn_infer_test_data.estVariance_dev.get(),
-        bn_infer_test_data.epsilon);
+        miopenBatchNormalizationForwardInference(&handle,
+                                                 bn_config.mode,
+                                                 &bn_infer_test_data.alpha,
+                                                 &bn_infer_test_data.beta,
+                                                 &bn_infer_test_data.input.desc,
+                                                 bn_infer_test_data.in_dev.get(),
+                                                 &bn_infer_test_data.output.desc,
+                                                 bn_infer_test_data.out_dev.get(),
+                                                 &bn_infer_test_data.scale.desc,
+                                                 bn_infer_test_data.scale_dev.get(),
+                                                 bn_infer_test_data.shift_dev.get(),
+                                                 bn_infer_test_data.estMean_dev.get(),
+                                                 bn_infer_test_data.estVariance_dev.get(),
+                                                 bn_infer_test_data.epsilon);
 
-        std::fill(bn_infer_test_data.output.begin(), bn_infer_test_data.output.end(), std::numeric_limits<YDataType>::quiet_NaN());
+        std::fill(bn_infer_test_data.output.begin(),
+                  bn_infer_test_data.output.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
     }
 
     void TearDown() override
     {
         if(test_skipped)
             return;
-        auto&& handle = get_handle();
-        bn_infer_test_data.output.data =
-            handle.Read<YDataType>(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
+        auto&& handle                  = get_handle();
+        bn_infer_test_data.output.data = handle.Read<YDataType>(
+            bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
         test::ComputeCPUBNInference(bn_infer_test_data);
-        
+
         if constexpr(std::is_same_v<YDataType, double>)
         {
             // tolerance for CK solver tolerance for
-            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
+            test::CompareTensor<YDataType>(
+                bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
         }
-        else{
+        else
+        {
             test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out);
         }
-
     }
 
     BNTestCase bn_config;
     bool test_skipped = false;
-    BNInferTestData<XDataType, YDataType, ScaleDataType, 
-          BiasDataType, MeanVarDataType, BNTestCase> bn_infer_test_data;
+    BNInferTestData<XDataType, YDataType, ScaleDataType, BiasDataType, MeanVarDataType, BNTestCase>
+        bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 8d49d3c99e..e3ff28e776 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -26,25 +26,18 @@
 
 #include "bn.hpp"
 
-struct BNInferTestHalf : BNInferTest<half_float::half, half_float::half, half_float::half, 
-    half_float::half, float>
+struct BNInferTestHalf
+    : BNInferTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
 {
 };
 
-struct BNInferTestFloat : BNInferTest<float, float, float, 
-    float, float>
+struct BNInferTestFloat : BNInferTest<float, float, float, float, float>
 {
 };
 
-TEST_P(BNInferTestHalf, BnFwdInferCKHalf)
-{
-    
-}
+TEST_P(BNInferTestHalf, BnFwdInferCKHalf) {}
 
-TEST_P(BNInferTestFloat, BnFwdInferCKFloat)
-{
-    
-}
+TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {}
 
 INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
                          BNInferTestHalf,
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 4692db5443..a7aff78da3 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -149,8 +149,12 @@ struct BNTestData
     }
 };
 
-template <typename XDataType, typename YDataType, typename ScaleDataType, 
-          typename BiasDataType, typename MeanVarDataType, typename TConfig>
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename TConfig>
 struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
@@ -170,8 +174,8 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
     miopen::Allocator::ManageDataPtr estMean_dev;
     miopen::Allocator::ManageDataPtr estVariance_dev;
     double epsilon          = 1.0e-5;
-    float alpha       = static_cast<float>(1.0f);
-    float beta        = static_cast<float>(0);
+    float alpha             = static_cast<float>(1.0f);
+    float beta              = static_cast<float>(0);
     const float activ_alpha = static_cast<double>(0.5f);
     const float activ_beta  = static_cast<double>(0.5f);
     const float activ_gamma = static_cast<double>(0.5f);
@@ -183,18 +187,19 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         miopen::DeriveBNTensorDescriptor(derivedBnDesc,
                                          BNTestData<XDataType, YDataType, TConfig>::input.desc,
                                          BNTestData<XDataType, YDataType, TConfig>::bn_mode);
-        scale       = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
-                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                          derivedBnDesc.GetLengths()};
-        shift       = tensor<BiasDataType>{miopen_type<BiasDataType>{},
-                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                          derivedBnDesc.GetLengths()};
-        estMean     = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                            BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                            derivedBnDesc.GetLengths()};
-        estVariance = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                                BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                                derivedBnDesc.GetLengths()};
+        scale   = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                                      BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                      derivedBnDesc.GetLengths()};
+        shift   = tensor<BiasDataType>{miopen_type<BiasDataType>{},
+                                     BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                     derivedBnDesc.GetLengths()};
+        estMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                          derivedBnDesc.GetLengths()};
+        estVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
     }
 
     void InitTensorsWithRandValue()
@@ -208,7 +213,7 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         scale.generate(gen_value);
         shift.generate(gen_value);
         estMean.generate(gen_value);
-        
+
         auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
         estVariance.generate(gen_var);
     }
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 3ac2d40d5a..d4e524e472 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -25,34 +25,36 @@
  *******************************************************************************/
 #pragma once
 
-namespace test{
+namespace test {
 template <typename DLModule>
 void ComputeCPUBNInference(DLModule& dl_module)
 {
     batchNormSpatialHostInference(dl_module.input,
-                                      dl_module.ref_out,
-                                      dl_module.scale,
-                                      dl_module.shift,
-                                      dl_module.epsilon,
-                                      dl_module.estMean,
-                                      dl_module.estVariance);
+                                  dl_module.ref_out,
+                                  dl_module.scale,
+                                  dl_module.shift,
+                                  dl_module.epsilon,
+                                  dl_module.estMean,
+                                  dl_module.estVariance);
 }
 
 template <typename T>
-void CompareTensor(const tensor<T>& output, const tensor<T>& ref_out, const T threshold = std::numeric_limits<T>::epsilon())
+void CompareTensor(const tensor<T>& output,
+                   const tensor<T>& ref_out,
+                   const T threshold = std::numeric_limits<T>::epsilon())
 {
     EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
     EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
     EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
         << "Non finite number found in the GPU data";
     EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
-    auto error             = miopen::rms_range(ref_out, output);
+    auto error = miopen::rms_range(ref_out, output);
     EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
         << "Non finite number found in the CPU data";
     EXPECT_TRUE(error < threshold)
         << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
 }
-}
+} // namespace test
 
 namespace test {
 namespace FusionPlan {

From f09c16ea19475b0dca85a1506fe648dce988f852 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Thu, 7 Sep 2023 00:14:07 +0000
Subject: [PATCH 10/28] bg/LWPMIOPEN-193_bn_back : first commit test working

---
 src/CMakeLists.txt                            |   1 +
 src/include/miopen/batchnorm/solvers.hpp      |  12 +
 src/ocl/batchnormocl.cpp                      |   4 +-
 src/solver/batchnorm/backward_ck.cpp          | 277 ++++++++++++++++++
 src/solver/batchnorm/forward_inference_ck.cpp |   1 +
 test/gtest/bn.hpp                             |  79 ++++-
 test/gtest/bn_bwd.cpp                         |  65 ++++
 test/gtest/bn_test_data.hpp                   | 175 +++++++++--
 test/gtest/test_operations.hpp                |  14 +
 9 files changed, 594 insertions(+), 34 deletions(-)
 create mode 100644 src/solver/batchnorm/backward_ck.cpp
 create mode 100644 test/gtest/bn_bwd.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index dc86c4136c..677a23b4fb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -146,6 +146,7 @@ set( MIOpen_Source
     solver/activ/bwd_1.cpp
     solver/activ/fwd_0.cpp
     solver/activ/fwd_1.cpp
+    solver/batchnorm/backward_ck.cpp
     solver/batchnorm/backward_per_activation.cpp
     solver/batchnorm/backward_per_activation_fused.cpp
     solver/batchnorm/backward_spatial_multiple.cpp
diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 6cf494ce37..3cc42e36cf 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -145,6 +145,18 @@ struct BnCKFwdInference final : BatchnormSolver
 
 // ----------- end BnCKFwdInference ---------------
 
+// ----------- start BnCKBwdBackward ---------------
+struct BnCKBwdBackward final : BatchnormSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<BnCKBwdBackward>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::batchnorm::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& problem) const override;
+};
+
+// ----------- end BnCKBwdBackward ---------------
 
 } // namespace batchnorm
 
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 1025509da4..10d2bb8372 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -338,7 +338,6 @@ void BatchNormBackward(Handle& handle,
         tmp.dx                = dx;
         tmp.bnScale           = bnScale;
         tmp.resultBnScaleDiff = resultBnScaleDiff;
-        tmp.resultBnScaleDiff = resultBnScaleDiff;
         tmp.resultBnBiasDiff  = resultBnBiasDiff;
         tmp.epsilon           = epsilon;
         tmp.savedMean         = savedMean;
@@ -346,7 +345,8 @@ void BatchNormBackward(Handle& handle,
         return tmp;
     }();
 
-    const auto solvers = solver::SolverContainer<solver::batchnorm::BnBwdTrainingSpatialSingle,
+    const auto solvers = solver::SolverContainer<solver::batchnorm::BnCKBwdBackward, 
+                                                 solver::batchnorm::BnBwdTrainingSpatialSingle,
                                                  solver::batchnorm::BnBwdTrainingSpatialMultiple,
                                                  solver::batchnorm::BnBwdTrainingPerActivation>{};
 
diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
new file mode 100644
index 0000000000..8300a3de62
--- /dev/null
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -0,0 +1,277 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/batchnorm/solvers.hpp>
+#include <miopen/batchnorm/invoke_params.hpp>
+#include <miopen/batch_norm.hpp>
+#include <miopen/bfloat16.hpp>
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+#include <miopen/solver/ck_utility_common.hpp>
+#include <ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp>
+#endif
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK)
+
+namespace miopen {
+namespace solver {
+namespace batchnorm {
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using index_t     = int32_t;
+
+constexpr index_t Rank                  = 4;
+constexpr index_t NumBatchNormReduceDim = 3;
+
+using F16  = ck::half_t;
+using F32  = float;
+using F64  = double;
+using BF16 = bfloat16;
+
+struct CKArgsBNormFwd
+{
+    CKArgsBNormFwd(const miopen::batchnorm::ProblemDescription& problem)
+    {
+        std::copy(problem.GetXDesc().GetLengths().begin(),
+                  problem.GetXDesc().GetLengths().end(),
+                  xyLengths.begin());
+
+        std::copy(problem.GetXDesc().GetStrides().begin(),
+                  problem.GetXDesc().GetStrides().end(),
+                  xyStrides.begin());
+        arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel
+        arrScaleBiasMeanVarStrides[0] = 1;
+        
+        // prep for CK
+        std::sort(xyStrides.begin(), xyStrides.end(),  std::greater<>());
+        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+    }
+
+    std::array<ck::index_t, Rank> xyLengths; // inOutLengths
+    std::array<ck::index_t, Rank> xyStrides; // inOutStrides
+    std::vector<int> invariantDims;
+
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
+
+    double epsilon = 0.0001;
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+};
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType>
+int CheckCKApplicability(
+    const miopen::batchnorm::ProblemDescription& problem)
+{
+    const auto& args       = CKArgsBNormFwd{problem};
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+                                                                      DxDataType,
+                                                                      DyDataType,
+                                                                      AccDataType,
+                                                                      ScaleDataType,
+                                                                      DscaleDbiasDataType,
+                                                                      MeanVarDataType,
+                                                                      PassThrough,
+                                                                      Rank,
+                                                                      NumBatchNormReduceDim>;
+    const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+    assert(!bn_bwd_ptrs.empty());
+    int count = 0;
+    for(const auto& it : bn_bwd_ptrs)
+    {
+        auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
+                                                    args.xyStrides,
+                                                    args.xyStrides,
+                                                    args.xyStrides,
+                                                    args.reduceDims,
+                                                    args.arrScaleBiasMeanVarLengths,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    nullptr,
+                                                    nullptr,
+                                                    nullptr,
+                                                    nullptr,
+                                                    nullptr,
+                                                    args.epsilon,
+                                                    PassThrough{},
+                                                    nullptr,
+                                                    nullptr,
+                                                    nullptr);
+        if(it->IsSupportedArgument(argument_ptr.get())){
+            return count;
+        }
+        count++;
+    }
+    return -1;
+}
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType>
+static void RunCKSolution(const Handle& handle,
+                   const AnyInvokeParams& primitive_parameters,
+                   const miopen::batchnorm::ProblemDescription& problem)
+{
+    const auto& args = CKArgsBNormFwd{problem};
+
+    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+                                                                      DxDataType,
+                                                                      DyDataType,
+                                                                      AccDataType,
+                                                                      ScaleDataType,
+                                                                      DscaleDbiasDataType,
+                                                                      MeanVarDataType,
+                                                                      PassThrough,
+                                                                      Rank,
+                                                                      NumBatchNormReduceDim>;
+    const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    int kernel_index = CheckCKApplicability<XDataType,
+                                            DxDataType,
+                                            DyDataType,
+                                            AccDataType,
+                                            ScaleDataType,
+                                            DscaleDbiasDataType,
+                                            MeanVarDataType>(problem);
+    assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size());
+    auto& bn_ptr           = bn_bwd_ptrs.at(kernel_index);
+    const auto& params = primitive_parameters.CastTo<miopen::batchnorm::BwdInvokeParams>();
+    auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
+                                                    args.xyStrides,
+                                                    args.xyStrides,
+                                                    args.xyStrides,
+                                                    args.reduceDims,
+                                                    args.arrScaleBiasMeanVarLengths,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    args.arrScaleBiasMeanVarStrides,
+                                                    params.x,
+                                                    params.dy,
+                                                    params.bnScale,
+                                                    params.savedMean,
+                                                    params.savedInvVariance,
+                                                    args.epsilon,
+                                                    PassThrough{},
+                                                    params.dx,
+                                                    params.resultBnScaleDiff,
+                                                    params.resultBnBiasDiff);
+    auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
+    const auto enable_profiling = handle.IsProfilingEnabled();
+
+    float elapsed_time =
+        invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling});
+    if(enable_profiling)
+    {
+        handle.ResetKernelTime();
+        handle.AccumKernelTime(elapsed_time);
+    }
+}
+#endif
+
+bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
+                                    const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = ctx;
+    std::ignore = fdesc_problem;
+    return false;
+#else
+    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_BACK{}))
+        return false;
+    if(!bn_problem.IsLayoutNHWC())
+        return false;
+    if(!ck_utility::is_ck_supported_hardware(ctx.GetStream()))
+        return false;
+
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenFloat: return (CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
+    case miopenDouble: return (CheckCKApplicability<F64, F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
+    case miopenHalf: return (CheckCKApplicability<F16, F32, F32, F32, F16, F32, F32>(bn_problem) != -1);
+    case miopenBFloat16: return (CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem) != -1);
+    case miopenInt32:
+    case miopenInt8:
+    case miopenInt8x4:
+    default: MIOPEN_THROW("Unsupported datatype");
+    }
+    return false;
+#endif
+}
+
+ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context,
+                                           const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = context;
+    std::ignore = bn_problem;
+    return {};
+#else
+    std::ignore = context;
+
+    ConvSolution result;
+    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
+        std::ignore = kernels;
+        return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
+
+            switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
+            {
+            case miopenFloat:
+                RunCKSolution<F32, F32, F32, F32, F32, F32, F32>(handle, primitive_parameters, bn_problem);
+                break;
+            case miopenDouble:
+                RunCKSolution<F64, F64, F64, F64, F64, F64, F64>(handle, primitive_parameters, bn_problem);
+                break;
+            case miopenHalf:
+                RunCKSolution<F16, F32, F32, F32, F16, F32, F32>(handle, primitive_parameters, bn_problem);
+                break;
+            case miopenBFloat16:
+                RunCKSolution<BF16, F32, F32, F32, BF16, F32, F32>(handle, primitive_parameters, bn_problem);
+                break;
+            case miopenInt8:
+            case miopenInt32:
+            case miopenInt8x4:
+            default: MIOPEN_THROW("Unsupported datatype");
+            }
+        };
+    };
+    return result;
+#endif
+}
+
+} // namespace batchnorm
+} // namespace solver
+} // namespace miopen
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index e914b2bbf3..70f3854caf 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -50,6 +50,7 @@ constexpr index_t NumBatchNormReduceDim = 3;
 using F16  = ck::half_t;
 using F32  = float;
 using F64  = double;
+using BF16 = ck::bhalf_t;
 
 struct CKArgsBNormFwd
 {
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index e5ff25517b..907241a3e5 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -26,8 +26,8 @@
 #pragma once
 
 #include <miopen/miopen.h>
+#include <miopen/bfloat16.hpp>
 #include <gtest/gtest.h>
-
 #include "bn_test_data.hpp"
 #include "test_operations.hpp"
 
@@ -88,3 +88,80 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
           BiasDataType, MeanVarDataType, BNTestCase> bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
+
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType>
+struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
+{
+protected:
+    void SetUp() override
+    {
+        test_skipped                       = false;
+        std::tie(bn_config, tensor_layout) = GetParam();
+        bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout);
+
+        auto&& handle = get_handle();
+        miopenBatchNormalizationBackward(
+        &handle,
+        bn_config.mode,
+        &bn_bwd_test_data.alphaDataDiff,
+        &bn_bwd_test_data.betaDataDiff,
+        &bn_bwd_test_data.alphaParamDiff,
+        &bn_bwd_test_data.betaParamDiff,
+        &bn_bwd_test_data.input.desc,       // const xDesc
+        bn_bwd_test_data.in_dev.get(),      // const x
+        &bn_bwd_test_data.dy.desc,          // const dyDesc
+        bn_bwd_test_data.dy_dev.get(),      // const dy
+        
+        &bn_bwd_test_data.output.desc,          // const dxDesc
+        bn_bwd_test_data.out_dev.get(),          // dx (not -const)
+
+        &bn_bwd_test_data.bnScale.desc,     // const bnScale
+        bn_bwd_test_data.bnScale_dev.get(), // const bnScale
+        
+        bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const)
+        bn_bwd_test_data.dBias_dev.get(),  // resultBnBiasDiff (not const)
+    
+        bn_bwd_test_data.epsilon,
+    
+        bn_bwd_test_data.savedMean_dev.get(),   // const savedMean
+        bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance
+
+        std::fill(bn_bwd_test_data.output.begin(), bn_bwd_test_data.output.end(), std::numeric_limits<DxDataType>::quiet_NaN());
+    }
+
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+        auto&& handle = get_handle();
+        bn_bwd_test_data.output.data =
+            handle.Read<DyDataType>(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size());
+        bn_bwd_test_data.dScale.data =
+            handle.Read<DxDataType>(bn_bwd_test_data.dScale_dev, bn_bwd_test_data.dScale.data.size());
+        bn_bwd_test_data.dBias.data =
+            handle.Read<DxDataType>(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size());
+        test::ComputeCPUBNBwd(bn_bwd_test_data);
+        // using tolerance = 1e-4 since this the tolerance CK uses
+        test::CompareTensor<DxDataType>(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4);
+        test::CompareTensor<DxDataType>(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4);
+        test::CompareTensor<DxDataType>(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 1e-4);
+    }
+
+    BNTestCase bn_config;
+    bool test_skipped = false;
+    BNBwdTestData<XDataType,
+                  DxDataType,
+                  DyDataType,
+                  AccDataType,
+                  ScaleDataType,
+                  DscaleDbiasDataType,
+                  MeanVarDataType, BNTestCase> bn_bwd_test_data;
+    miopenTensorLayout_t tensor_layout;
+};
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
new file mode 100644
index 0000000000..e004b16734
--- /dev/null
+++ b/test/gtest/bn_bwd.cpp
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "bn.hpp"
+
+
+// struct BNBwdTestBFloat16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
+// {
+// };
+
+struct BNBwdTestFloat : BNBwdTest<float, float, float, float, float, float, float>
+{
+};
+
+// struct BNBwdTestDouble : BNBwdTest<double, double, double, double, double, double, double>
+// {
+// };
+
+TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat)
+{
+    
+}
+
+// TEST_P(BNBwdTestBFloat16, BnFwdBwdCKBFloat16)
+// {
+    
+// }
+
+INSTANTIATE_TEST_SUITE_P(BNBwdTestFloatNHWCSuite,
+                         BNBwdTestFloat,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+// INSTANTIATE_TEST_SUITE_P(BNBwdTestBFloat16NHWCSuite,
+//                          BNBwdTestBFloat16,
+//                          testing::Combine(testing::ValuesIn(Network1()),
+//                                           testing::Values(miopenTensorNHWC)));
+
+// INSTANTIATE_TEST_SUITE_P(BNBwdTestDoubleNHWCSuite,
+//                          BNBwdTestDouble,
+//                          testing::Combine(testing::ValuesIn(Network1()),
+//                                           testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 4692db5443..3e9cdf6a3d 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -60,35 +60,34 @@ std::vector<BNTestCase> Network1()
 {
     // pyt_mlperf_resnet50v1.5
     return {
-        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
+         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
+        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
 }
 
 template <typename XDataType, typename YDataType, typename TConfig>
@@ -125,7 +124,7 @@ struct BNTestData
     {
         input   = tensor<XDataType>{miopen_type<XDataType>{}, tensor_layout, bn_config.GetInput()};
         output  = tensor<YDataType>{miopen_type<YDataType>{}, tensor_layout, bn_config.GetInput()};
-        ref_out = output;
+        ref_out = tensor<YDataType>{miopen_type<YDataType>{}, tensor_layout, bn_config.GetInput()};
     }
 
     void InitTensorsWithRandValue()
@@ -136,7 +135,7 @@ struct BNTestData
         auto gen_value = [&](auto...) {
             return 1e-2 * static_cast<XDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
         };
-        input.generate(gen_value);
+        input.generate(gen_value);    
     }
 
     void SetDirection() { direction = bn_config.Direction; }
@@ -221,3 +220,117 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         estVariance_dev = handle.Write(estVariance.data);
     }
 };
+
+
+
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename TConfig>
+struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
+{
+    void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
+    {
+        BNTestData<XDataType, DyDataType, TConfig>::SetUpImpl(config, t_layout);
+        CreateTensors();
+        InitTensorsWithRandValue();
+        WriteToGPU();
+    }
+
+    tensor<ScaleDataType> bnScale;
+
+    tensor<MeanVarDataType> savedMean;        
+    tensor<MeanVarDataType> savedInvVar; 
+
+    
+    tensor<DxDataType> dy;               
+    tensor<DscaleDbiasDataType> dScale;  
+    tensor<DscaleDbiasDataType> dBias; 
+    tensor<DscaleDbiasDataType> dScale_ref;  
+    tensor<DscaleDbiasDataType> dBias_ref; 
+
+    miopen::Allocator::ManageDataPtr bnScale_dev;
+    miopen::Allocator::ManageDataPtr savedMean_dev;
+    miopen::Allocator::ManageDataPtr savedInvVar_dev;
+
+    miopen::Allocator::ManageDataPtr dy_dev;
+    miopen::Allocator::ManageDataPtr dScale_dev;
+    miopen::Allocator::ManageDataPtr dBias_dev;
+    miopen::Allocator::ManageDataPtr dScale_ref_dev;
+    miopen::Allocator::ManageDataPtr dBias_ref_dev;
+    double epsilon          = std::numeric_limits<float>::epsilon();
+
+    float alphaDataDiff = static_cast<float>(1), betaDataDiff = static_cast<float>(0);
+    float alphaParamDiff = static_cast<float>(1), betaParamDiff = static_cast<float>(0);
+
+private:
+    void CreateTensors()
+    {
+        dy   = tensor<XDataType>{miopen_type<XDataType>{}, 
+                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout, 
+                                BNTestData<XDataType, DyDataType, TConfig>::bn_config.GetInput()};
+
+        auto derivedBnDesc = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
+                                         BNTestData<XDataType, DyDataType, TConfig>::input.desc,
+                                         BNTestData<XDataType, DyDataType, TConfig>::bn_mode);
+        bnScale       = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                          BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                          derivedBnDesc.GetLengths()};
+        savedMean     = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                            BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                            derivedBnDesc.GetLengths()};
+        savedInvVar    = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                derivedBnDesc.GetLengths()};
+        dScale          = tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
+                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                derivedBnDesc.GetLengths()};
+        dBias           = tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
+                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                derivedBnDesc.GetLengths()};
+        dScale_ref = dScale;
+        dBias_ref  = dBias;
+    }
+
+    void InitTensorsWithRandValue()
+    {
+        std::random_device rd{};
+        std::mt19937 gen{rd()};
+        std::uniform_int_distribution<> d{0, 100};
+        auto gen_value = [&](auto...) {
+            return 1e-2 * static_cast<ScaleDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+        };
+        dy.generate(gen_value);
+        bnScale.generate(gen_value);
+        savedMean.generate(gen_value);
+
+        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
+        savedInvVar.generate(gen_var);
+        
+        std::fill(dScale.begin(), dScale.end(), 0.);
+        std::fill(dBias.begin(), dBias.end(), 0.);
+        
+        std::fill(dScale_ref.begin(), dScale_ref.end(), 0.);
+        std::fill(dBias_ref.begin(), dBias_ref.end(), 0.);
+    }
+    void WriteToGPU()
+    {
+        auto&& handle   = get_handle();
+
+        bnScale_dev          = handle.Write(bnScale.data);
+        savedMean_dev        = handle.Write(savedMean.data);
+        savedInvVar_dev = handle.Write(savedInvVar.data);
+        dy_dev          = handle.Write(dy.data);
+        
+        dScale_dev   = handle.Write(dScale.data);
+        dBias_dev    = handle.Write(dBias.data);
+
+        dScale_ref_dev   = handle.Write(dScale.data);
+        dBias_ref_dev    = handle.Write(dBias.data);
+    }
+};
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 3ac2d40d5a..c0a71d8c02 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -38,6 +38,20 @@ void ComputeCPUBNInference(DLModule& dl_module)
                                       dl_module.estVariance);
 }
 
+template <typename DLModule>
+void ComputeCPUBNBwd(DLModule& dl_module)
+{
+    batchNormSpatialHostBwdTrain(dl_module.input,
+                                 dl_module.dy,
+                                 dl_module.ref_out,
+                                 dl_module.bnScale,
+                                 dl_module.dScale_ref,
+                                 dl_module.dBias_ref,
+                                 dl_module.savedMean,
+                                 dl_module.savedInvVar);
+}
+
+
 template <typename T>
 void CompareTensor(const tensor<T>& output, const tensor<T>& ref_out, const T threshold = std::numeric_limits<T>::epsilon())
 {

From e7667c3a49e6eda0593fa32ff0acb9935d58a77d Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Fri, 8 Sep 2023 14:50:26 +0000
Subject: [PATCH 11/28] bg/LWPMIOPEN-194 : add test for all types

---
 src/include/miopen/batchnorm/solvers.hpp      |  3 ---
 src/solver/batchnorm/forward_inference_ck.cpp | 13 +++++++----
 test/fusionHost.hpp                           | 10 ++++----
 test/gtest/bn.hpp                             | 13 ++---------
 test/gtest/bn_infer.cpp                       | 23 +++++++++++++++++++
 test/gtest/test_operations.hpp                |  2 +-
 6 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 4a2275294f..c7d050abeb 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -132,7 +132,6 @@ struct BnFwdInference final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
-// ----------- start BnCKFwdInference ---------------
 struct BnCKFwdInference final : BatchnormSolver
 {
     const std::string& SolverDbId() const override { return GetSolverDbId<BnCKFwdInference>(); }
@@ -143,8 +142,6 @@ struct BnCKFwdInference final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
-// ----------- end BnCKFwdInference ---------------
-
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index 88968c2572..ac6b542825 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -46,9 +46,10 @@ using Normalize   = ck::tensor_operation::element_wise::NormalizeInInfer;
 constexpr index_t Rank                  = 4;
 constexpr index_t NumBatchNormReduceDim = 3;
 
-using F16 = ck::half_t;
-using F32 = float;
-using F64 = double;
+using F16  = ck::half_t;
+using F32  = float;
+using F64  = double;
+using BF16 = ushort;
 
 struct CKArgsBNormFwd
 {
@@ -196,6 +197,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
     case miopenDouble:
         return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
     case miopenBFloat16:
+        return (CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8:
     case miopenInt8x4:
@@ -220,7 +222,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context,
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-            switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
+            switch(bn_problem.GetXDesc().GetType())
             {
             case miopenHalf:
                 RunCKSolution<F16, F16, F32, F16, F16, F32>(
@@ -235,6 +237,9 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context,
                     handle, primitive_parameters, bn_problem);
                 break;
             case miopenBFloat16:
+                RunCKSolution<BF16, BF16, F32, BF16, BF16, F32>(
+                    handle, primitive_parameters, bn_problem);
+                break;
             case miopenInt8:
             case miopenInt32:
             case miopenInt8x4:
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index be27234c08..cffefea0e2 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -148,9 +148,9 @@ void batchNormSpatialHostInference(const tensor<T>& input,
     int n_batches, channels, height, width;
     std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths());
     par_for(channels, 1, [&](int cidx) { // via channel
-        V mean      = estimatedMean(0, cidx, 0, 0);
-        V variance  = estimatedVariance(0, cidx, 0, 0);
-        V invertVar = 1.0 / sqrt(variance + epsilon);
+        V mean           = estimatedMean(0, cidx, 0, 0);
+        V variance       = estimatedVariance(0, cidx, 0, 0);
+        double invertVar = 1.0 / sqrt(variance + epsilon);
         // process the batch per channel
         for(int row = 0; row < height; row++)
         { // via rows
@@ -158,8 +158,8 @@ void batchNormSpatialHostInference(const tensor<T>& input,
             { // via columns
                 for(int bidx = 0; bidx < n_batches; bidx++)
                 { // via mini_batch
-                    V elemStd = static_cast<V>(input(bidx, cidx, row, column)) - mean;
-                    V inhat   = elemStd * invertVar;
+                    double elemStd = static_cast<double>(input(bidx, cidx, row, column)) - mean;
+                    double inhat   = elemStd * invertVar;
                     output(bidx, cidx, row, column) =
                         static_cast<T>(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0));
                     // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0));
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 637c007e5d..0b763da411 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -74,17 +74,8 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         bn_infer_test_data.output.data = handle.Read<YDataType>(
             bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
         test::ComputeCPUBNInference(bn_infer_test_data);
-
-        if constexpr(std::is_same_v<YDataType, double>)
-        {
-            // tolerance for CK solver tolerance for
-            test::CompareTensor<YDataType>(
-                bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
-        }
-        else
-        {
-            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out);
-        }
+        // 4e-3 is tolerance used by CK kernel.
+        test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out, 4e-3);
     }
 
     BNTestCase bn_config;
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index e3ff28e776..6598ef7169 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -35,10 +35,23 @@ struct BNInferTestFloat : BNInferTest<float, float, float, float, float>
 {
 };
 
+struct BNInferTestDouble : BNInferTest<double, double, double, double, double>
+{
+};
+
+struct BNInferTestBFloat16 : BNInferTest<bfloat16, bfloat16, bfloat16, bfloat16, double>
+{
+};
+
 TEST_P(BNInferTestHalf, BnFwdInferCKHalf) {}
 
 TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {}
 
+// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig
+// only supports half and float
+TEST_P(BNInferTestDouble, DISABLED_BnFwdInferCKDouble) {}
+TEST_P(BNInferTestBFloat16, DISABLED_BnFwdInferCKBFloat16) {}
+
 INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
                          BNInferTestHalf,
                          testing::Combine(testing::ValuesIn(Network1()),
@@ -48,3 +61,13 @@ INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
                          BNInferTestFloat,
                          testing::Combine(testing::ValuesIn(Network1()),
                                           testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
+                         BNInferTestDouble,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
+                         BNInferTestBFloat16,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index d4e524e472..d1528fe2bb 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -41,7 +41,7 @@ void ComputeCPUBNInference(DLModule& dl_module)
 template <typename T>
 void CompareTensor(const tensor<T>& output,
                    const tensor<T>& ref_out,
-                   const T threshold = std::numeric_limits<T>::epsilon())
+                   const double threshold = std::numeric_limits<T>::epsilon())
 {
     EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
     EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";

From ce4beef136b72a70956cbf299e6fc6905c107ef7 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Fri, 8 Sep 2023 18:03:13 +0000
Subject: [PATCH 12/28] bg/LWPMIOPEN-193_bn_back : add test for all types

---
 src/batch_norm_api.cpp                        |   7 -
 src/ocl/batchnormocl.cpp                      |   6 +-
 src/solver/batchnorm/backward_ck.cpp          |  62 ++++---
 src/solver/batchnorm/forward_inference_ck.cpp |  74 ++++----
 test/gtest/bn.hpp                             | 137 +++++++-------
 test/gtest/bn_bwd.cpp                         |  23 ++-
 test/gtest/bn_infer.cpp                       |  32 ++--
 test/gtest/bn_test_data.hpp                   | 174 +++++++++---------
 test/gtest/test_operations.hpp                |  23 +--
 9 files changed, 284 insertions(+), 254 deletions(-)

diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp
index 22cf757627..6ca7366f9a 100644
--- a/src/batch_norm_api.cpp
+++ b/src/batch_norm_api.cpp
@@ -271,13 +271,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
                                  const void* savedMean,
                                  const void* savedInvVariance)
 {
-    // bfloat16 not supported for batchnorm operation
-    if(miopen::deref(xDesc).GetType() == miopenBFloat16 ||
-       miopen::deref(dyDesc).GetType() == miopenBFloat16 ||
-       miopen::deref(dxDesc).GetType() == miopenBFloat16)
-    {
-        return miopenStatusNotImplemented;
-    }
 
     MIOPEN_LOG_FUNCTION(handle,
                         bn_mode,
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 10d2bb8372..7a20038c55 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -223,7 +223,7 @@ void BatchNormForwardInference(Handle& handle,
 
         const auto algo    = AlgorithmName{"miopenBatchNormalizationForwardInference"};
         const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdInference,
-        solver::batchnorm::BnCKFwdInference>{};
+                                                     solver::batchnorm::BnCKFwdInference>{};
 
         solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
     }
@@ -300,7 +300,7 @@ void BatchNormBackward(Handle& handle,
     {
         MIOPEN_THROW(miopenStatusBadParm);
     }
-    if(dxDesc.GetType() != dyDesc.GetType() || dyDesc.GetType() != xDesc.GetType())
+    if(dxDesc.GetType() != dyDesc.GetType())
     {
         MIOPEN_THROW(miopenStatusBadParm);
     }
@@ -345,7 +345,7 @@ void BatchNormBackward(Handle& handle,
         return tmp;
     }();
 
-    const auto solvers = solver::SolverContainer<solver::batchnorm::BnCKBwdBackward, 
+    const auto solvers = solver::SolverContainer<solver::batchnorm::BnCKBwdBackward,
                                                  solver::batchnorm::BnBwdTrainingSpatialSingle,
                                                  solver::batchnorm::BnBwdTrainingSpatialMultiple,
                                                  solver::batchnorm::BnBwdTrainingPerActivation>{};
diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index 8300a3de62..492cb2a619 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -48,7 +48,7 @@ constexpr index_t NumBatchNormReduceDim = 3;
 using F16  = ck::half_t;
 using F32  = float;
 using F64  = double;
-using BF16 = bfloat16;
+using BF16 = ushort;
 
 struct CKArgsBNormFwd
 {
@@ -63,9 +63,9 @@ struct CKArgsBNormFwd
                   xyStrides.begin());
         arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel
         arrScaleBiasMeanVarStrides[0] = 1;
-        
+
         // prep for CK
-        std::sort(xyStrides.begin(), xyStrides.end(),  std::greater<>());
+        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
         std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
     }
 
@@ -87,11 +87,10 @@ template <typename XDataType,
           typename ScaleDataType,
           typename DscaleDbiasDataType,
           typename MeanVarDataType>
-int CheckCKApplicability(
-    const miopen::batchnorm::ProblemDescription& problem)
+int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args       = CKArgsBNormFwd{problem};
-    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+    using DeviceOp         = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
                                                                       DxDataType,
                                                                       DyDataType,
                                                                       AccDataType,
@@ -102,7 +101,7 @@ int CheckCKApplicability(
                                                                       Rank,
                                                                       NumBatchNormReduceDim>;
     const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
+        DeviceOp>::GetInstances();
     assert(!bn_bwd_ptrs.empty());
     int count = 0;
     for(const auto& it : bn_bwd_ptrs)
@@ -126,7 +125,8 @@ int CheckCKApplicability(
                                                     nullptr,
                                                     nullptr,
                                                     nullptr);
-        if(it->IsSupportedArgument(argument_ptr.get())){
+        if(it->IsSupportedArgument(argument_ptr.get()))
+        {
             return count;
         }
         count++;
@@ -142,12 +142,12 @@ template <typename XDataType,
           typename DscaleDbiasDataType,
           typename MeanVarDataType>
 static void RunCKSolution(const Handle& handle,
-                   const AnyInvokeParams& primitive_parameters,
-                   const miopen::batchnorm::ProblemDescription& problem)
+                          const AnyInvokeParams& primitive_parameters,
+                          const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
 
-    using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+    using DeviceOp         = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
                                                                       DxDataType,
                                                                       DyDataType,
                                                                       AccDataType,
@@ -158,7 +158,7 @@ static void RunCKSolution(const Handle& handle,
                                                                       Rank,
                                                                       NumBatchNormReduceDim>;
     const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
+        DeviceOp>::GetInstances();
 
     int kernel_index = CheckCKApplicability<XDataType,
                                             DxDataType,
@@ -168,9 +168,9 @@ static void RunCKSolution(const Handle& handle,
                                             DscaleDbiasDataType,
                                             MeanVarDataType>(problem);
     assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size());
-    auto& bn_ptr           = bn_bwd_ptrs.at(kernel_index);
-    const auto& params = primitive_parameters.CastTo<miopen::batchnorm::BwdInvokeParams>();
-    auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
+    auto& bn_ptr                = bn_bwd_ptrs.at(kernel_index);
+    const auto& params          = primitive_parameters.CastTo<miopen::batchnorm::BwdInvokeParams>();
+    auto argument_ptr           = bn_ptr->MakeArgumentPointer(args.xyLengths,
                                                     args.xyStrides,
                                                     args.xyStrides,
                                                     args.xyStrides,
@@ -203,7 +203,7 @@ static void RunCKSolution(const Handle& handle,
 #endif
 
 bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
-                                    const miopen::batchnorm::ProblemDescription& bn_problem) const
+                                   const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
     std::ignore = ctx;
@@ -219,10 +219,14 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
 
     switch(bn_problem.GetXDesc().GetType())
     {
-    case miopenFloat: return (CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
-    case miopenDouble: return (CheckCKApplicability<F64, F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
-    case miopenHalf: return (CheckCKApplicability<F16, F32, F32, F32, F16, F32, F32>(bn_problem) != -1);
-    case miopenBFloat16: return (CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem) != -1);
+    case miopenFloat:
+        return (CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
+    case miopenDouble:
+        return (CheckCKApplicability<F64, F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
+    case miopenHalf:
+        return (CheckCKApplicability<F16, F32, F32, F32, F16, F32, F32>(bn_problem) != -1);
+    case miopenBFloat16:
+        return (CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8:
     case miopenInt8x4:
@@ -232,8 +236,9 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
 #endif
 }
 
-ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context,
-                                           const miopen::batchnorm::ProblemDescription& bn_problem) const
+ConvSolution
+BnCKBwdBackward::GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
     std::ignore = context;
@@ -246,20 +251,23 @@ ConvSolution BnCKBwdBackward::GetSolution(const ExecutionContext& context,
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-
             switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
             {
             case miopenFloat:
-                RunCKSolution<F32, F32, F32, F32, F32, F32, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F32, F32, F32, F32, F32, F32, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenDouble:
-                RunCKSolution<F64, F64, F64, F64, F64, F64, F64>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F64, F64, F64, F64, F64, F64, F64>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenHalf:
-                RunCKSolution<F16, F32, F32, F32, F16, F32, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F16, F32, F32, F32, F16, F32, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenBFloat16:
-                RunCKSolution<BF16, F32, F32, F32, BF16, F32, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<BF16, F32, F32, F32, BF16, F32, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenInt8:
             case miopenInt32:
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index 70f3854caf..ba6a3682c9 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -25,7 +25,6 @@
  *
  *******************************************************************************/
 
-
 #include <miopen/batchnorm/solvers.hpp>
 #include <miopen/batchnorm/invoke_params.hpp>
 #include <miopen/batch_norm.hpp>
@@ -88,17 +87,16 @@ template <typename XDataType,
           typename ScaleDataType,
           typename BiasDataType,
           typename MeanVarDataType>
-int CheckCKApplicability(
-    const miopen::batchnorm::ProblemDescription& problem)
+int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
-    const auto& args       = CKArgsBNormFwd{problem};
-    using DeviceOp = ck::tensor_operation::device::DeviceElementwise<
+    const auto& args = CKArgsBNormFwd{problem};
+    using DeviceOp   = ck::tensor_operation::device::DeviceElementwise<
         ck::Tuple<XDataType, MeanVarDataType, MeanVarDataType, ScaleDataType, BiasDataType>,
         ck::Tuple<YDataType>,
         Normalize,
         Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
+        DeviceOp>::GetInstances();
     assert(!bn_fwd_ptrs.empty());
     int count = 0;
     for(const auto& it : bn_fwd_ptrs)
@@ -113,7 +111,8 @@ int CheckCKApplicability(
                                                     {nullptr, nullptr, nullptr, nullptr, nullptr},
                                                     {nullptr},
                                                     Normalize{0.0});
-        if(it->IsSupportedArgument(argument_ptr.get())){
+        if(it->IsSupportedArgument(argument_ptr.get()))
+        {
             return count;
         }
         count++;
@@ -128,8 +127,8 @@ template <typename XDataType,
           typename BiasDataType,
           typename MeanVarDataType>
 static void RunCKSolution(const Handle& handle,
-                   const AnyInvokeParams& primitive_parameters,
-                   const miopen::batchnorm::ProblemDescription& problem)
+                          const AnyInvokeParams& primitive_parameters,
+                          const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
 
@@ -139,28 +138,29 @@ static void RunCKSolution(const Handle& handle,
         Normalize,
         Rank>;
     const auto bn_fwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-            DeviceOp>::GetInstances();
-
-    int kernel_index = CheckCKApplicability<XDataType, YDataType, AccDataType, 
-                                        ScaleDataType, BiasDataType, MeanVarDataType>(problem);
+        DeviceOp>::GetInstances();
+
+    int kernel_index = CheckCKApplicability<XDataType,
+                                            YDataType,
+                                            AccDataType,
+                                            ScaleDataType,
+                                            BiasDataType,
+                                            MeanVarDataType>(problem);
     assert(kernel_index >= 0 && kernel_index < bn_fwd_ptrs.size());
-    auto& bn_ptr           = bn_fwd_ptrs.at(kernel_index);
+    auto& bn_ptr       = bn_fwd_ptrs.at(kernel_index);
     const auto& params = primitive_parameters.CastTo<miopen::batchnorm::InfInvokeParams>();
 
-    auto argument_ptr = bn_ptr->MakeArgumentPointer(args.xyLengths,
-                                                    {args.xyStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides,
-                                                     args.aligned_scaleBiasMeanVarStrides},
-                                                    {args.xyStrides},
-                                                    {params.x,
-                                                     params.estimatedMean,
-                                                     params.estimatedVariance,
-                                                     params.bnScale,
-                                                     params.bnBias},
-                                                    {params.y},
-                                                    Normalize{params.epsilon});
+    auto argument_ptr = bn_ptr->MakeArgumentPointer(
+        args.xyLengths,
+        {args.xyStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides,
+         args.aligned_scaleBiasMeanVarStrides},
+        {args.xyStrides},
+        {params.x, params.estimatedMean, params.estimatedVariance, params.bnScale, params.bnBias},
+        {params.y},
+        Normalize{params.epsilon});
 
     auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
     const auto enable_profiling = handle.IsProfilingEnabled();
@@ -194,7 +194,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
     {
     case miopenHalf: return (CheckCKApplicability<F16, F16, F32, F16, F16, F32>(bn_problem) != -1);
     case miopenFloat: return (CheckCKApplicability<F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
-    case miopenDouble: return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
+    case miopenDouble:
+        return (CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
     case miopenBFloat16:
     case miopenInt32:
     case miopenInt8:
@@ -205,8 +206,9 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
 #endif
 }
 
-ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
-                                           const miopen::batchnorm::ProblemDescription& bn_problem) const
+ConvSolution
+BnCKFwdInference::GetSolution(const ExecutionContext& context,
+                              const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
     std::ignore = context;
@@ -219,17 +221,19 @@ ConvSolution BnCKFwdInference::GetSolution(const ExecutionContext& context,
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-            
             switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
             {
             case miopenHalf:
-                RunCKSolution<F16, F16, F32, F16, F16, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F16, F16, F32, F16, F16, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenFloat:
-                RunCKSolution<F32, F32, F32, F32, F32, F32>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F32, F32, F32, F32, F32, F32>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenDouble:
-                RunCKSolution<F64, F64, F64, F64, F64, F64>(handle, primitive_parameters, bn_problem);
+                RunCKSolution<F64, F64, F64, F64, F64, F64>(
+                    handle, primitive_parameters, bn_problem);
                 break;
             case miopenBFloat16:
             case miopenInt8:
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 907241a3e5..77d5f62d95 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -31,8 +31,11 @@
 #include "bn_test_data.hpp"
 #include "test_operations.hpp"
 
-template <typename XDataType, typename YDataType, typename ScaleDataType, 
-          typename BiasDataType, typename MeanVarDataType>
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
 struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
 {
 protected:
@@ -43,53 +46,54 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         bn_infer_test_data.SetUpImpl(bn_config, tensor_layout);
 
         auto&& handle = get_handle();
-        miopenBatchNormalizationForwardInference(
-        &handle,
-        bn_config.mode,
-        &bn_infer_test_data.alpha,
-        &bn_infer_test_data.beta,
-        &bn_infer_test_data.input.desc,
-        bn_infer_test_data.in_dev.get(),
-        &bn_infer_test_data.output.desc,
-        bn_infer_test_data.out_dev.get(),
-        &bn_infer_test_data.scale.desc,
-        bn_infer_test_data.scale_dev.get(),
-        bn_infer_test_data.shift_dev.get(),
-        bn_infer_test_data.estMean_dev.get(),
-        bn_infer_test_data.estVariance_dev.get(),
-        bn_infer_test_data.epsilon);
-
-        std::fill(bn_infer_test_data.output.begin(), bn_infer_test_data.output.end(), std::numeric_limits<YDataType>::quiet_NaN());
+        miopenBatchNormalizationForwardInference(&handle,
+                                                 bn_config.mode,
+                                                 &bn_infer_test_data.alpha,
+                                                 &bn_infer_test_data.beta,
+                                                 &bn_infer_test_data.input.desc,
+                                                 bn_infer_test_data.in_dev.get(),
+                                                 &bn_infer_test_data.output.desc,
+                                                 bn_infer_test_data.out_dev.get(),
+                                                 &bn_infer_test_data.scale.desc,
+                                                 bn_infer_test_data.scale_dev.get(),
+                                                 bn_infer_test_data.shift_dev.get(),
+                                                 bn_infer_test_data.estMean_dev.get(),
+                                                 bn_infer_test_data.estVariance_dev.get(),
+                                                 bn_infer_test_data.epsilon);
+
+        std::fill(bn_infer_test_data.output.begin(),
+                  bn_infer_test_data.output.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
     }
 
     void TearDown() override
     {
         if(test_skipped)
             return;
-        auto&& handle = get_handle();
-        bn_infer_test_data.output.data =
-            handle.Read<YDataType>(bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
-        test::ComputeCPUBNInference(bn_infer_test_data);
-        
-        if constexpr(std::is_same_v<YDataType, double>)
-        {
-            // tolerance for CK solver tolerance for
-            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out, 1e-8);
-        }
-        else{
-            test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out);
-        }
+        auto&& handle                  = get_handle();
+        bn_infer_test_data.output.data = handle.Read<YDataType>(
+            bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size());
+        // test::ComputeCPUBNInference(bn_infer_test_data);
 
+        // if constexpr(std::is_same_v<YDataType, double>)
+        // {
+        //     // tolerance for CK solver tolerance for
+        //     test::CompareTensor<YDataType>(bn_infer_test_data.output, bn_infer_test_data.ref_out,
+        //     1e-8);
+        // }
+        // else{
+        //     test::CompareTensor<YDataType>(bn_infer_test_data.output,
+        //     bn_infer_test_data.ref_out);
+        // }
     }
 
     BNTestCase bn_config;
     bool test_skipped = false;
-    BNInferTestData<XDataType, YDataType, ScaleDataType, 
-          BiasDataType, MeanVarDataType, BNTestCase> bn_infer_test_data;
+    BNInferTestData<XDataType, YDataType, ScaleDataType, BiasDataType, MeanVarDataType, BNTestCase>
+        bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
 
-
 template <typename XDataType,
           typename DxDataType,
           typename DyDataType,
@@ -108,32 +112,34 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
 
         auto&& handle = get_handle();
         miopenBatchNormalizationBackward(
-        &handle,
-        bn_config.mode,
-        &bn_bwd_test_data.alphaDataDiff,
-        &bn_bwd_test_data.betaDataDiff,
-        &bn_bwd_test_data.alphaParamDiff,
-        &bn_bwd_test_data.betaParamDiff,
-        &bn_bwd_test_data.input.desc,       // const xDesc
-        bn_bwd_test_data.in_dev.get(),      // const x
-        &bn_bwd_test_data.dy.desc,          // const dyDesc
-        bn_bwd_test_data.dy_dev.get(),      // const dy
-        
-        &bn_bwd_test_data.output.desc,          // const dxDesc
-        bn_bwd_test_data.out_dev.get(),          // dx (not -const)
-
-        &bn_bwd_test_data.bnScale.desc,     // const bnScale
-        bn_bwd_test_data.bnScale_dev.get(), // const bnScale
-        
-        bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const)
-        bn_bwd_test_data.dBias_dev.get(),  // resultBnBiasDiff (not const)
-    
-        bn_bwd_test_data.epsilon,
-    
-        bn_bwd_test_data.savedMean_dev.get(),   // const savedMean
-        bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance
-
-        std::fill(bn_bwd_test_data.output.begin(), bn_bwd_test_data.output.end(), std::numeric_limits<DxDataType>::quiet_NaN());
+            &handle,
+            bn_config.mode,
+            &bn_bwd_test_data.alphaDataDiff,
+            &bn_bwd_test_data.betaDataDiff,
+            &bn_bwd_test_data.alphaParamDiff,
+            &bn_bwd_test_data.betaParamDiff,
+            &bn_bwd_test_data.input.desc,  // const xDesc
+            bn_bwd_test_data.in_dev.get(), // const x
+            &bn_bwd_test_data.dy.desc,     // const dyDesc
+            bn_bwd_test_data.dy_dev.get(), // const dy
+
+            &bn_bwd_test_data.output.desc,  // const dxDesc
+            bn_bwd_test_data.out_dev.get(), // dx (not -const)
+
+            &bn_bwd_test_data.bnScale.desc,     // const bnScale
+            bn_bwd_test_data.bnScale_dev.get(), // const bnScale
+
+            bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const)
+            bn_bwd_test_data.dBias_dev.get(),  // resultBnBiasDiff (not const)
+
+            bn_bwd_test_data.epsilon,
+
+            bn_bwd_test_data.savedMean_dev.get(),    // const savedMean
+            bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance
+
+        std::fill(bn_bwd_test_data.output.begin(),
+                  bn_bwd_test_data.output.end(),
+                  std::numeric_limits<DxDataType>::quiet_NaN());
     }
 
     void TearDown() override
@@ -143,10 +149,11 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
         auto&& handle = get_handle();
         bn_bwd_test_data.output.data =
             handle.Read<DyDataType>(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size());
-        bn_bwd_test_data.dScale.data =
-            handle.Read<DxDataType>(bn_bwd_test_data.dScale_dev, bn_bwd_test_data.dScale.data.size());
+        bn_bwd_test_data.dScale.data = handle.Read<DxDataType>(bn_bwd_test_data.dScale_dev,
+                                                               bn_bwd_test_data.dScale.data.size());
         bn_bwd_test_data.dBias.data =
             handle.Read<DxDataType>(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size());
+
         test::ComputeCPUBNBwd(bn_bwd_test_data);
         // using tolerance = 1e-4 since this the tolerance CK uses
         test::CompareTensor<DxDataType>(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4);
@@ -162,6 +169,8 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
                   AccDataType,
                   ScaleDataType,
                   DscaleDbiasDataType,
-                  MeanVarDataType, BNTestCase> bn_bwd_test_data;
+                  MeanVarDataType,
+                  BNTestCase>
+        bn_bwd_test_data;
     miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
index e004b16734..244be887c0 100644
--- a/test/gtest/bn_bwd.cpp
+++ b/test/gtest/bn_bwd.cpp
@@ -26,6 +26,10 @@
 
 #include "bn.hpp"
 
+struct BNBwdTestTestHalf
+    : BNBwdTest<half_float::half, float, float, float, half_float::half, float, float>
+{
+};
 
 // struct BNBwdTestBFloat16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
 // {
@@ -39,16 +43,25 @@ struct BNBwdTestFloat : BNBwdTest<float, float, float, float, float, float, floa
 // {
 // };
 
-TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat)
-{
-    
-}
+// TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat)
+// {
+
+// }
 
 // TEST_P(BNBwdTestBFloat16, BnFwdBwdCKBFloat16)
 // {
-    
+
 // }
 
+TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat) {}
+
+TEST_P(BNBwdTestTestHalf, BnFwdBwdCKHalf) {}
+
+INSTANTIATE_TEST_SUITE_P(BNBwdTestTestHalfNHWCSuite,
+                         BNBwdTestTestHalf,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
 INSTANTIATE_TEST_SUITE_P(BNBwdTestFloatNHWCSuite,
                          BNBwdTestFloat,
                          testing::Combine(testing::ValuesIn(Network1()),
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 8d49d3c99e..62b20821c4 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -26,30 +26,26 @@
 
 #include "bn.hpp"
 
-struct BNInferTestHalf : BNInferTest<half_float::half, half_float::half, half_float::half, 
-    half_float::half, float>
-{
-};
+// struct BNInferTestHalf : BNInferTest<half_float::half, half_float::half, half_float::half,
+//     half_float::half, float>
+// {
+// };
 
-struct BNInferTestFloat : BNInferTest<float, float, float, 
-    float, float>
+struct BNInferTestFloat : BNInferTest<float, float, float, float, float>
 {
 };
 
-TEST_P(BNInferTestHalf, BnFwdInferCKHalf)
-{
-    
-}
+// TEST_P(BNInferTestHalf, BnFwdInferCKHalf)
+// {
 
-TEST_P(BNInferTestFloat, BnFwdInferCKFloat)
-{
-    
-}
+// }
 
-INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
-                         BNInferTestHalf,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {}
+
+// INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
+//                          BNInferTestHalf,
+//                          testing::Combine(testing::ValuesIn(Network1()),
+//                                           testing::Values(miopenTensorNHWC)));
 
 INSTANTIATE_TEST_SUITE_P(BNInferTestFloatNHWCSuite,
                          BNInferTestFloat,
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 3e9cdf6a3d..394d5c251a 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -60,34 +60,34 @@ std::vector<BNTestCase> Network1()
 {
     // pyt_mlperf_resnet50v1.5
     return {
-         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
-        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
-        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
-        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
-        // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
+        {1, 1, 4, 4, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
+        {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
 }
 
 template <typename XDataType, typename YDataType, typename TConfig>
@@ -135,7 +135,7 @@ struct BNTestData
         auto gen_value = [&](auto...) {
             return 1e-2 * static_cast<XDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
         };
-        input.generate(gen_value);    
+        input.generate(gen_value);
     }
 
     void SetDirection() { direction = bn_config.Direction; }
@@ -148,8 +148,12 @@ struct BNTestData
     }
 };
 
-template <typename XDataType, typename YDataType, typename ScaleDataType, 
-          typename BiasDataType, typename MeanVarDataType, typename TConfig>
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename TConfig>
 struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
@@ -169,8 +173,8 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
     miopen::Allocator::ManageDataPtr estMean_dev;
     miopen::Allocator::ManageDataPtr estVariance_dev;
     double epsilon          = 1.0e-5;
-    float alpha       = static_cast<float>(1.0f);
-    float beta        = static_cast<float>(0);
+    float alpha             = static_cast<float>(1.0f);
+    float beta              = static_cast<float>(0);
     const float activ_alpha = static_cast<double>(0.5f);
     const float activ_beta  = static_cast<double>(0.5f);
     const float activ_gamma = static_cast<double>(0.5f);
@@ -182,18 +186,19 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         miopen::DeriveBNTensorDescriptor(derivedBnDesc,
                                          BNTestData<XDataType, YDataType, TConfig>::input.desc,
                                          BNTestData<XDataType, YDataType, TConfig>::bn_mode);
-        scale       = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
-                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                          derivedBnDesc.GetLengths()};
-        shift       = tensor<BiasDataType>{miopen_type<BiasDataType>{},
-                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                          derivedBnDesc.GetLengths()};
-        estMean     = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                            BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                            derivedBnDesc.GetLengths()};
-        estVariance = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                                BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
-                                derivedBnDesc.GetLengths()};
+        scale   = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                                      BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                      derivedBnDesc.GetLengths()};
+        shift   = tensor<BiasDataType>{miopen_type<BiasDataType>{},
+                                     BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                     derivedBnDesc.GetLengths()};
+        estMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                          derivedBnDesc.GetLengths()};
+        estVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
     }
 
     void InitTensorsWithRandValue()
@@ -207,7 +212,7 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
         scale.generate(gen_value);
         shift.generate(gen_value);
         estMean.generate(gen_value);
-        
+
         auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
         estVariance.generate(gen_var);
     }
@@ -221,8 +226,6 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
     }
 };
 
-
-
 template <typename XDataType,
           typename DxDataType,
           typename DyDataType,
@@ -243,15 +246,14 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 
     tensor<ScaleDataType> bnScale;
 
-    tensor<MeanVarDataType> savedMean;        
-    tensor<MeanVarDataType> savedInvVar; 
+    tensor<MeanVarDataType> savedMean;
+    tensor<MeanVarDataType> savedInvVar;
 
-    
-    tensor<DxDataType> dy;               
-    tensor<DscaleDbiasDataType> dScale;  
-    tensor<DscaleDbiasDataType> dBias; 
-    tensor<DscaleDbiasDataType> dScale_ref;  
-    tensor<DscaleDbiasDataType> dBias_ref; 
+    tensor<DyDataType> dy;
+    tensor<DscaleDbiasDataType> dScale;
+    tensor<DscaleDbiasDataType> dBias;
+    tensor<DscaleDbiasDataType> dScale_ref;
+    tensor<DscaleDbiasDataType> dBias_ref;
 
     miopen::Allocator::ManageDataPtr bnScale_dev;
     miopen::Allocator::ManageDataPtr savedMean_dev;
@@ -262,7 +264,7 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
     miopen::Allocator::ManageDataPtr dBias_dev;
     miopen::Allocator::ManageDataPtr dScale_ref_dev;
     miopen::Allocator::ManageDataPtr dBias_ref_dev;
-    double epsilon          = std::numeric_limits<float>::epsilon();
+    double epsilon = std::numeric_limits<float>::epsilon();
 
     float alphaDataDiff = static_cast<float>(1), betaDataDiff = static_cast<float>(0);
     float alphaParamDiff = static_cast<float>(1), betaParamDiff = static_cast<float>(0);
@@ -270,29 +272,33 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 private:
     void CreateTensors()
     {
-        dy   = tensor<XDataType>{miopen_type<XDataType>{}, 
-                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout, 
+        dy = tensor<DyDataType>{miopen_type<DyDataType>{},
+                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
                                 BNTestData<XDataType, DyDataType, TConfig>::bn_config.GetInput()};
 
         auto derivedBnDesc = miopen::TensorDescriptor{};
         miopen::DeriveBNTensorDescriptor(derivedBnDesc,
                                          BNTestData<XDataType, DyDataType, TConfig>::input.desc,
                                          BNTestData<XDataType, DyDataType, TConfig>::bn_mode);
-        bnScale       = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
-                          BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
-                          derivedBnDesc.GetLengths()};
-        savedMean     = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                            BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
-                            derivedBnDesc.GetLengths()};
-        savedInvVar    = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
-                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
-                                derivedBnDesc.GetLengths()};
-        dScale          = tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
-                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
-                                derivedBnDesc.GetLengths()};
-        dBias           = tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
-                                BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
-                                derivedBnDesc.GetLengths()};
+        bnScale = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                                        BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                        derivedBnDesc.GetLengths()};
+        savedMean =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+        savedInvVar =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+        dScale =
+            tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
+                                        BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                        derivedBnDesc.GetLengths()};
+        dBias =
+            tensor<DscaleDbiasDataType>{miopen_type<DscaleDbiasDataType>{},
+                                        BNTestData<XDataType, DyDataType, TConfig>::tensor_layout,
+                                        derivedBnDesc.GetLengths()};
         dScale_ref = dScale;
         dBias_ref  = dBias;
     }
@@ -311,26 +317,26 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 
         auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
         savedInvVar.generate(gen_var);
-        
+
         std::fill(dScale.begin(), dScale.end(), 0.);
         std::fill(dBias.begin(), dBias.end(), 0.);
-        
+
         std::fill(dScale_ref.begin(), dScale_ref.end(), 0.);
         std::fill(dBias_ref.begin(), dBias_ref.end(), 0.);
     }
     void WriteToGPU()
     {
-        auto&& handle   = get_handle();
+        auto&& handle = get_handle();
 
-        bnScale_dev          = handle.Write(bnScale.data);
-        savedMean_dev        = handle.Write(savedMean.data);
+        bnScale_dev     = handle.Write(bnScale.data);
+        savedMean_dev   = handle.Write(savedMean.data);
         savedInvVar_dev = handle.Write(savedInvVar.data);
         dy_dev          = handle.Write(dy.data);
-        
-        dScale_dev   = handle.Write(dScale.data);
-        dBias_dev    = handle.Write(dBias.data);
 
-        dScale_ref_dev   = handle.Write(dScale.data);
-        dBias_ref_dev    = handle.Write(dBias.data);
+        dScale_dev = handle.Write(dScale.data);
+        dBias_dev  = handle.Write(dBias.data);
+
+        dScale_ref_dev = handle.Write(dScale.data);
+        dBias_ref_dev  = handle.Write(dBias.data);
     }
 };
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index c0a71d8c02..08b119444b 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -25,17 +25,17 @@
  *******************************************************************************/
 #pragma once
 
-namespace test{
+namespace test {
 template <typename DLModule>
 void ComputeCPUBNInference(DLModule& dl_module)
 {
     batchNormSpatialHostInference(dl_module.input,
-                                      dl_module.ref_out,
-                                      dl_module.scale,
-                                      dl_module.shift,
-                                      dl_module.epsilon,
-                                      dl_module.estMean,
-                                      dl_module.estVariance);
+                                  dl_module.ref_out,
+                                  dl_module.scale,
+                                  dl_module.shift,
+                                  dl_module.epsilon,
+                                  dl_module.estMean,
+                                  dl_module.estVariance);
 }
 
 template <typename DLModule>
@@ -51,22 +51,23 @@ void ComputeCPUBNBwd(DLModule& dl_module)
                                  dl_module.savedInvVar);
 }
 
-
 template <typename T>
-void CompareTensor(const tensor<T>& output, const tensor<T>& ref_out, const T threshold = std::numeric_limits<T>::epsilon())
+void CompareTensor(const tensor<T>& output,
+                   const tensor<T>& ref_out,
+                   const T threshold = std::numeric_limits<T>::epsilon())
 {
     EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros";
     EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros";
     EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0)
         << "Non finite number found in the GPU data";
     EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
-    auto error             = miopen::rms_range(ref_out, output);
+    auto error = miopen::rms_range(ref_out, output);
     EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
         << "Non finite number found in the CPU data";
     EXPECT_TRUE(error < threshold)
         << "Error beyond tolerance Error:" << error << ",  Threshold: " << threshold;
 }
-}
+} // namespace test
 
 namespace test {
 namespace FusionPlan {

From 714b6367c9f63a6143422dd219f8f4af0c91f174 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Fri, 8 Sep 2023 21:13:09 +0000
Subject: [PATCH 13/28] bg/LWPMIOPEN-193_bn_back : fix bn backward host
 template

---
 test/fusionHost.hpp         | 12 ++++++------
 test/gtest/bn_test_data.hpp |  5 +----
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index be27234c08..4b4bb0eeee 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -280,14 +280,14 @@ void batchNormSpatialHostFwdTrain(const tensor<T>& input,
 }
 
 template <class T, class U>
-void batchNormSpatialHostBwdTrain(const tensor<T>& x_input,
+void batchNormSpatialHostBwdTrain(const tensor<U>& x_input,
                                   const tensor<T>& dy_input,
                                   tensor<T>& dx_out,
-                                  const tensor<U>& scale,
-                                  tensor<U>& dscale,
-                                  tensor<U>& dbias,
-                                  const tensor<U>& savedMean,
-                                  const tensor<U>& savedInvVar)
+                                  const tensor<U>& scale, 
+                                  tensor<T>& dscale,
+                                  tensor<T>& dbias,
+                                  const tensor<T>& savedMean,
+                                  const tensor<T>& savedInvVar)
 {
 
     int height, width, n_batch, channels;
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 394d5c251a..01a46228f7 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -60,7 +60,6 @@ std::vector<BNTestCase> Network1()
 {
     // pyt_mlperf_resnet50v1.5
     return {
-        {1, 1, 4, 4, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
@@ -238,7 +237,7 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 {
     void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
     {
-        BNTestData<XDataType, DyDataType, TConfig>::SetUpImpl(config, t_layout);
+        BNTestData<XDataType, DxDataType, TConfig>::SetUpImpl(config, t_layout);
         CreateTensors();
         InitTensorsWithRandValue();
         WriteToGPU();
@@ -336,7 +335,5 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
         dScale_dev = handle.Write(dScale.data);
         dBias_dev  = handle.Write(dBias.data);
 
-        dScale_ref_dev = handle.Write(dScale.data);
-        dBias_ref_dev  = handle.Write(dBias.data);
     }
 };

From e0792eedc41405eca2904f8ae87a5f4a8cf3a2e2 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 02:11:18 +0000
Subject: [PATCH 14/28] bg/LWPMIOPEN-193_bn_back : clang format

---
 test/fusionHost.hpp         | 2 +-
 test/gtest/bn.hpp           | 2 +-
 test/gtest/bn_bwd.cpp       | 8 +++-----
 test/gtest/bn_infer.cpp     | 1 -
 test/gtest/bn_test_data.hpp | 1 -
 5 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index 35032890a5..eed9173541 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -283,7 +283,7 @@ template <class T, class U>
 void batchNormSpatialHostBwdTrain(const tensor<U>& x_input,
                                   const tensor<T>& dy_input,
                                   tensor<T>& dx_out,
-                                  const tensor<U>& scale, 
+                                  const tensor<U>& scale,
                                   tensor<T>& dscale,
                                   tensor<T>& dbias,
                                   const tensor<T>& savedMean,
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 87a37ec044..57b7cec6e6 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -80,7 +80,7 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
 
     BNTestCase bn_config;
     bool test_skipped = false;
-   BNInferTestData<XDataType, YDataType, ScaleDataType, BiasDataType, MeanVarDataType, BNTestCase>
+    BNInferTestData<XDataType, YDataType, ScaleDataType, BiasDataType, MeanVarDataType, BNTestCase>
         bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
index 9a9ab2ad9d..76a96c883d 100644
--- a/test/gtest/bn_bwd.cpp
+++ b/test/gtest/bn_bwd.cpp
@@ -45,14 +45,12 @@ struct BNBwdTestDouble : BNBwdTest<double, double, double, double, double, doubl
 
 TEST_P(BNBwdTestTestHalf, BnFwdBwdCKHalf) {}
 
-TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat){}
+TEST_P(BNBwdTestFloat, BnFwdBwdCKFloat) {}
 
 // Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig
 // only supports half and float
-TEST_P(BNBwdTestBFloat16, DISABLED_BnFwdBwdCKBFloat16){}
-TEST_P(BNBwdTestDouble, DISABLED_BnFwdBwdCKDouble){}
-
-
+TEST_P(BNBwdTestBFloat16, DISABLED_BnFwdBwdCKBFloat16) {}
+TEST_P(BNBwdTestDouble, DISABLED_BnFwdBwdCKDouble) {}
 
 INSTANTIATE_TEST_SUITE_P(BNBwdTestTestHalfNHWCSuite,
                          BNBwdTestTestHalf,
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index 48dc6fe572..6598ef7169 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -52,7 +52,6 @@ TEST_P(BNInferTestFloat, BnFwdInferCKFloat) {}
 TEST_P(BNInferTestDouble, DISABLED_BnFwdInferCKDouble) {}
 TEST_P(BNInferTestBFloat16, DISABLED_BnFwdInferCKBFloat16) {}
 
-
 INSTANTIATE_TEST_SUITE_P(BNInferTestHalfNHWCSuite,
                          BNInferTestHalf,
                          testing::Combine(testing::ValuesIn(Network1()),
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 01a46228f7..2b8f91eb08 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -334,6 +334,5 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 
         dScale_dev = handle.Write(dScale.data);
         dBias_dev  = handle.Write(dBias.data);
-
     }
 };

From 335c2127f32a88017aabedeff372c2448b231d8b Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 02:43:00 +0000
Subject: [PATCH 15/28] bg/LWPMIOPEN-193_bn_back : add solver to registry

---
 src/solver.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver.cpp b/src/solver.cpp
index 9d4f28ef5e..77835d7668 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -565,7 +565,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemm3DGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
     RegisterWithSolver(registry, ++id, ConvWinoFuryRxS<2, 3>{}, miopenConvolutionAlgoWinograd);
-
+    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 

From 8a4177ba408faab9198bb95bb26011ac72ca70da Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 14:36:42 +0000
Subject: [PATCH 16/28] bg/LWPMIOPEN-193_bn_back : clean ups

---
 src/solver/batchnorm/backward_ck.cpp |  4 +--
 test/gtest/bn.hpp                    | 49 ++++++++++++++--------------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index 3e9f597150..d6c1dc95db 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -87,7 +87,7 @@ template <typename XDataType,
           typename ScaleDataType,
           typename DscaleDbiasDataType,
           typename MeanVarDataType>
-int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
+static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args       = CKArgsBNormBwd{problem};
     using DeviceOp         = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
@@ -251,7 +251,7 @@ BnCKBwdBackward::GetSolution(const ExecutionContext& context,
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         std::ignore = kernels;
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-            switch(bn_problem.GetXDesc().GetType()) // add api GetInDataType in bn_problem
+            switch(bn_problem.GetXDesc().GetType())
             {
             case miopenFloat:
                 RunCKSolution<F32, F32, F32, F32, F32, F32, F32>(
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 57b7cec6e6..f5c2baac31 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -102,31 +102,30 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
         bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout);
 
         auto&& handle = get_handle();
-        miopenBatchNormalizationBackward(
-            &handle,
-            bn_config.mode,
-            &bn_bwd_test_data.alphaDataDiff,
-            &bn_bwd_test_data.betaDataDiff,
-            &bn_bwd_test_data.alphaParamDiff,
-            &bn_bwd_test_data.betaParamDiff,
-            &bn_bwd_test_data.input.desc,  // const xDesc
-            bn_bwd_test_data.in_dev.get(), // const x
-            &bn_bwd_test_data.dy.desc,     // const dyDesc
-            bn_bwd_test_data.dy_dev.get(), // const dy
-
-            &bn_bwd_test_data.output.desc,  // const dxDesc
-            bn_bwd_test_data.out_dev.get(), // dx (not -const)
-
-            &bn_bwd_test_data.bnScale.desc,     // const bnScale
-            bn_bwd_test_data.bnScale_dev.get(), // const bnScale
-
-            bn_bwd_test_data.dScale_dev.get(), // resultBnScaleDiff (not const)
-            bn_bwd_test_data.dBias_dev.get(),  // resultBnBiasDiff (not const)
-
-            bn_bwd_test_data.epsilon,
-
-            bn_bwd_test_data.savedMean_dev.get(),    // const savedMean
-            bn_bwd_test_data.savedInvVar_dev.get()); // const savedInvVariance
+        miopenBatchNormalizationBackward(&handle,
+                                         bn_config.mode,
+                                         &bn_bwd_test_data.alphaDataDiff,
+                                         &bn_bwd_test_data.betaDataDiff,
+                                         &bn_bwd_test_data.alphaParamDiff,
+                                         &bn_bwd_test_data.betaParamDiff,
+                                         &bn_bwd_test_data.input.desc,
+                                         bn_bwd_test_data.in_dev.get(),
+                                         &bn_bwd_test_data.dy.desc,
+                                         bn_bwd_test_data.dy_dev.get(),
+
+                                         &bn_bwd_test_data.output.desc,
+                                         bn_bwd_test_data.out_dev.get(),
+
+                                         &bn_bwd_test_data.bnScale.desc,
+                                         bn_bwd_test_data.bnScale_dev.get(),
+
+                                         bn_bwd_test_data.dScale_dev.get(),
+                                         bn_bwd_test_data.dBias_dev.get(),
+
+                                         bn_bwd_test_data.epsilon,
+
+                                         bn_bwd_test_data.savedMean_dev.get(),
+                                         bn_bwd_test_data.savedInvVar_dev.get());
 
         std::fill(bn_bwd_test_data.output.begin(),
                   bn_bwd_test_data.output.end(),

From 8172f51770ead30c61d3e7eea76275319e127705 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 14:48:56 +0000
Subject: [PATCH 17/28] bg/LWPMIOPEN-194 : add static to CheckCKApplicability
 function

---
 src/solver.cpp                                | 2 +-
 src/solver/batchnorm/forward_inference_ck.cpp | 2 +-
 test/gtest/bn_test_data.hpp                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/solver.cpp b/src/solver.cpp
index 9d4f28ef5e..a4bdf5f0f1 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -561,10 +561,10 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Pooling, pooling::PoolingForwardNaive{}.SolverDbId());
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemmGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
-    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId());
     RegisterWithSolver(
         registry, ++id, ConvHipImplicitGemm3DGroupFwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
     RegisterWithSolver(registry, ++id, ConvWinoFuryRxS<2, 3>{}, miopenConvolutionAlgoWinograd);
+    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId());
 
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index ac6b542825..c074aa6a0d 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -87,7 +87,7 @@ template <typename XDataType,
           typename ScaleDataType,
           typename BiasDataType,
           typename MeanVarDataType>
-int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
+static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
     const auto& args = CKArgsBNormFwd{problem};
     using DeviceOp   = ck::tensor_operation::device::DeviceElementwise<
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index a7aff78da3..36d7813fd2 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -170,7 +170,7 @@ struct BNInferTestData : public BNTestData<XDataType, YDataType, TConfig>
     tensor<MeanVarDataType> estMean;
     tensor<MeanVarDataType> estVariance;
     miopen::Allocator::ManageDataPtr scale_dev;
-    miopen::Allocator::ManageDataPtr shift_dev; // bias
+    miopen::Allocator::ManageDataPtr shift_dev;
     miopen::Allocator::ManageDataPtr estMean_dev;
     miopen::Allocator::ManageDataPtr estVariance_dev;
     double epsilon          = 1.0e-5;

From 3ce76ffbca3f5e1a6dca14157e4e21ec7faac8fc Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 17:02:49 +0000
Subject: [PATCH 18/28] bg/LWPMIOPEN-194: fix analyze error

---
 src/solver/batchnorm/forward_inference_ck.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index c074aa6a0d..b72b4c9ac2 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -175,11 +175,11 @@ static void RunCKSolution(const Handle& handle,
 }
 #endif
 
-bool BnCKFwdInference::IsApplicable(const ExecutionContext& ctx,
+bool BnCKFwdInference::IsApplicable(const ExecutionContext& context
                                     const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = ctx;
+    std::ignore = context;
     std::ignore = fdesc_problem;
     return false;
 #else

From cdc4cb719c9a8d2803111015d8cffea000d3b32a Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 11 Sep 2023 17:13:02 +0000
Subject: [PATCH 19/28] bg/LWPMIOPEN-194: fix compile error

---
 src/solver/batchnorm/forward_inference_ck.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index b72b4c9ac2..5de1c43c3c 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -175,7 +175,7 @@ static void RunCKSolution(const Handle& handle,
 }
 #endif
 
-bool BnCKFwdInference::IsApplicable(const ExecutionContext& context
+bool BnCKFwdInference::IsApplicable(const ExecutionContext& context,
                                     const miopen::batchnorm::ProblemDescription& bn_problem) const
 {
 #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
@@ -187,7 +187,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context
         return false;
     if(!bn_problem.IsLayoutNHWC())
         return false;
-    if(!ck_utility::is_ck_supported_hardware(ctx.GetStream()))
+    if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
         return false;
 
     switch(bn_problem.GetXDesc().GetType())

From 83bd77f057efa09f036647506492ae87cec0d4e9 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 18 Sep 2023 13:50:33 +0000
Subject: [PATCH 20/28] bg/LWPMIOPEN-193_bn_back: fix review comments

---
 .../miopen/solver/implicitgemm_ck_util.hpp    |  65 +++-
 src/solver/batchnorm/backward_ck.cpp          | 280 ++++++++----------
 test/fusionHost.hpp                           |  20 +-
 3 files changed, 186 insertions(+), 179 deletions(-)

diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index 8656bdbabc..ce64ee5109 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -41,8 +41,10 @@ typename ConvPtrsType::iterator FindConvPtrByID(ConvPtrsType& conv_ptrs,
     });
 }
 
-template <typename DeviceOpType, typename CKArgsType>
-std::vector<std::string> FillValidKernelsIDs(const ProblemDescription& problem)
+template <typename DeviceOpType,
+          typename CKArgsType,
+          typename ProblemDescriptionType = ProblemDescription>
+std::vector<std::string> FillValidKernelsIDs(const ProblemDescriptionType& problem)
 {
     const auto args      = CKArgsType{problem};
     const auto conv_ptrs = DeviceOpType::GetInstances();
@@ -59,8 +61,10 @@ std::vector<std::string> FillValidKernelsIDs(const ProblemDescription& problem)
     return valid_kernels;
 }
 
-template <typename DeviceOpType, typename CKArgsType>
-bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& kernel_id)
+template <typename DeviceOpType,
+          typename CKArgsType,
+          typename ProblemDescriptionType = ProblemDescription>
+bool IsCKArgsSupported(const ProblemDescriptionType& problem, const std::string& kernel_id)
 {
     auto conv_ptrs = DeviceOpType::GetInstances();
     auto ptr_iter  = FindConvPtrByID(conv_ptrs, kernel_id);
@@ -68,20 +72,25 @@ bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& ker
     return (ptr_iter != conv_ptrs.end()) && CKArgsType{problem}.IsSupportedBy(*ptr_iter);
 }
 
-template <typename DeviceOpType, typename CKArgsType>
-bool IsCKApplicable(const ProblemDescription& problem)
+template <typename DeviceOpType,
+          typename CKArgsType,
+          typename ProblemDescriptionType = ProblemDescription>
+bool IsCKApplicable(const ProblemDescriptionType& problem)
 {
     const auto args = CKArgsType{problem};
-    if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; }))
-        return false;
+    // if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; }))
+    //     return false;
 
     const auto ptrs = DeviceOpType::GetInstances();
     return std::any_of(
         ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); });
 }
 
-template <typename DeviceOpType, typename CKArgsType, typename CastType>
-ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::string& kernel_id)
+template <typename DeviceOpType,
+          typename CKArgsType,
+          typename CastType,
+          typename ProblemDescriptionType = ProblemDescription>
+ConvSolution InitInvokerFactory(const ProblemDescriptionType& problem, const std::string& kernel_id)
 {
     auto conv_ptrs = DeviceOpType::GetInstances();
     auto ptr_iter  = FindConvPtrByID(conv_ptrs, kernel_id);
@@ -112,5 +121,41 @@ ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::st
     return result;
 }
 
+template <typename DeviceOpType,
+          typename CKArgsType,
+          typename CastType,
+          typename ProblemDescriptionType = ProblemDescription>
+ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem,
+                                   const std::string& kernel_id)
+{
+    auto conv_ptrs = DeviceOpType::GetInstances();
+    auto ptr_iter  = FindConvPtrByID(conv_ptrs, kernel_id);
+
+    if(ptr_iter == conv_ptrs.end())
+        return ConvSolution(miopenStatusInvalidValue);
+
+    ConvSolution result;
+    result.invoker_factory =
+        [ck_args     = CKArgsType{problem},
+         sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](const std::vector<Kernel>&) mutable {
+            return [ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr)](
+                       const Handle& handle, const AnyInvokeParams& primitive_parameters) {
+                const auto& data_ctx = primitive_parameters.CastTo<CastType>();
+                auto argument_ptr    = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx);
+                auto invoker_ptr     = sh_conv_ptr->MakeInvokerPointer();
+
+                const auto enable_profiling = handle.IsProfilingEnabled();
+                float elapsed_time =
+                    invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling});
+                if(enable_profiling)
+                {
+                    handle.ResetKernelTime();
+                    handle.AccumKernelTime(elapsed_time);
+                }
+            };
+        };
+    return result;
+}
+
 } // namespace solver
 } // namespace miopen
diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index d6c1dc95db..a0e3968a63 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -31,6 +31,7 @@
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
 #include <miopen/solver/ck_utility_common.hpp>
 #include <ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp>
+#include <miopen/solver/implicitgemm_ck_util.hpp>
 #endif
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK)
 
@@ -50,27 +51,81 @@ using F32  = float;
 using F64  = double;
 using BF16 = ushort;
 
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType>
+using DeviceOpBNBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
+                                                     DxDataType,
+                                                     DyDataType,
+                                                     AccDataType,
+                                                     ScaleDataType,
+                                                     DscaleDbiasDataType,
+                                                     MeanVarDataType,
+                                                     PassThrough,
+                                                     Rank,
+                                                     NumBatchNormReduceDim>>;
+
 struct CKArgsBNormBwd
 {
     CKArgsBNormBwd(const miopen::batchnorm::ProblemDescription& problem)
     {
         std::copy(problem.GetXDesc().GetLengths().begin(),
                   problem.GetXDesc().GetLengths().end(),
-                  xyLengths.begin());
+                  lens.begin());
 
         std::copy(problem.GetXDesc().GetStrides().begin(),
                   problem.GetXDesc().GetStrides().end(),
-                  xyStrides.begin());
-        arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel
+                  strides.begin());
+        arrScaleBiasMeanVarLengths[0] = lens[1]; // get channel
         arrScaleBiasMeanVarStrides[0] = 1;
 
         // prep for CK
-        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
-        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+        std::sort(strides.begin(), strides.end(), std::greater<>());
+        std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end());
+    }
+
+    CKArgsBNormBwd(const CKArgsBNormBwd&) = default;
+    CKArgsBNormBwd(CKArgsBNormBwd&&)      = default;
+    CKArgsBNormBwd& operator=(const CKArgsBNormBwd&) = default;
+
+    template <typename InvokerPtr, typename InvokerParams>
+    auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const
+    {
+        return invoker_ptr->MakeArgumentPointer(lens,
+                                                strides,
+                                                strides,
+                                                strides,
+                                                reduceDims,
+                                                arrScaleBiasMeanVarLengths,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                data_ctx.x,
+                                                data_ctx.dy,
+                                                data_ctx.bnScale,
+                                                data_ctx.savedMean,
+                                                data_ctx.savedInvVariance,
+                                                epsilon,
+                                                PassThrough{},
+                                                data_ctx.dx,
+                                                data_ctx.resultBnScaleDiff,
+                                                data_ctx.resultBnBiasDiff);
+    }
+
+    template <typename ConvPtr>
+    bool IsSupportedBy(const ConvPtr& invoker_ptr) const
+    {
+        auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::BwdInvokeParams{});
+        return invoker_ptr->IsSupportedArgument(arg_ptr.get());
     }
 
-    std::array<ck::index_t, Rank> xyLengths; // inOutLengths
-    std::array<ck::index_t, Rank> xyStrides; // inOutStrides
+    std::array<ck::index_t, Rank> lens;    // inOutLengths
+    std::array<ck::index_t, Rank> strides; // inOutStrides
     std::vector<int> invariantDims;
 
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
@@ -87,119 +142,18 @@ template <typename XDataType,
           typename ScaleDataType,
           typename DscaleDbiasDataType,
           typename MeanVarDataType>
-static int CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
-{
-    const auto& args       = CKArgsBNormBwd{problem};
-    using DeviceOp         = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
-                                                                      DxDataType,
-                                                                      DyDataType,
-                                                                      AccDataType,
-                                                                      ScaleDataType,
-                                                                      DscaleDbiasDataType,
-                                                                      MeanVarDataType,
-                                                                      PassThrough,
-                                                                      Rank,
-                                                                      NumBatchNormReduceDim>;
-    const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-    assert(!bn_bwd_ptrs.empty());
-    int count = 0;
-    for(const auto& it : bn_bwd_ptrs)
-    {
-        auto argument_ptr = it->MakeArgumentPointer(args.xyLengths,
-                                                    args.xyStrides,
-                                                    args.xyStrides,
-                                                    args.xyStrides,
-                                                    args.reduceDims,
-                                                    args.arrScaleBiasMeanVarLengths,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr,
-                                                    args.epsilon,
-                                                    PassThrough{},
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr);
-        if(it->IsSupportedArgument(argument_ptr.get()))
-        {
-            return count;
-        }
-        count++;
-    }
-    return -1;
-}
-
-template <typename XDataType,
-          typename DxDataType,
-          typename DyDataType,
-          typename AccDataType,
-          typename ScaleDataType,
-          typename DscaleDbiasDataType,
-          typename MeanVarDataType>
-static void RunCKSolution(const Handle& handle,
-                          const AnyInvokeParams& primitive_parameters,
-                          const miopen::batchnorm::ProblemDescription& problem)
+static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
 {
-    const auto& args = CKArgsBNormBwd{problem};
-
-    using DeviceOp         = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
-                                                                      DxDataType,
-                                                                      DyDataType,
-                                                                      AccDataType,
-                                                                      ScaleDataType,
-                                                                      DscaleDbiasDataType,
-                                                                      MeanVarDataType,
-                                                                      PassThrough,
-                                                                      Rank,
-                                                                      NumBatchNormReduceDim>;
-    const auto bn_bwd_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
-        DeviceOp>::GetInstances();
-
-    int kernel_index = CheckCKApplicability<XDataType,
+    return IsCKApplicable<DeviceOpBNBwdPtrs<XDataType,
                                             DxDataType,
                                             DyDataType,
                                             AccDataType,
                                             ScaleDataType,
                                             DscaleDbiasDataType,
-                                            MeanVarDataType>(problem);
-    assert(kernel_index >= 0 && kernel_index < bn_bwd_ptrs.size());
-    auto& bn_ptr                = bn_bwd_ptrs.at(kernel_index);
-    const auto& params          = primitive_parameters.CastTo<miopen::batchnorm::BwdInvokeParams>();
-    auto argument_ptr           = bn_ptr->MakeArgumentPointer(args.xyLengths,
-                                                    args.xyStrides,
-                                                    args.xyStrides,
-                                                    args.xyStrides,
-                                                    args.reduceDims,
-                                                    args.arrScaleBiasMeanVarLengths,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    args.arrScaleBiasMeanVarStrides,
-                                                    params.x,
-                                                    params.dy,
-                                                    params.bnScale,
-                                                    params.savedMean,
-                                                    params.savedInvVariance,
-                                                    args.epsilon,
-                                                    PassThrough{},
-                                                    params.dx,
-                                                    params.resultBnScaleDiff,
-                                                    params.resultBnBiasDiff);
-    auto invoker_ptr            = bn_ptr->MakeInvokerPointer();
-    const auto enable_profiling = handle.IsProfilingEnabled();
-
-    float elapsed_time =
-        invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling});
-    if(enable_profiling)
-    {
-        handle.ResetKernelTime();
-        handle.AccumKernelTime(elapsed_time);
-    }
+                                            MeanVarDataType>,
+                          CKArgsBNormBwd>(problem);
 }
+
 #endif
 
 bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
@@ -216,17 +170,16 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
         return false;
     if(!ck_utility::is_ck_supported_hardware(ctx.GetStream()))
         return false;
+    if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType())
+        return false;
 
     switch(bn_problem.GetXDesc().GetType())
     {
-    case miopenFloat:
-        return (CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem) != -1);
-    case miopenDouble:
-        return (CheckCKApplicability<F64, F64, F64, F64, F64, F64, F64>(bn_problem) != -1);
-    case miopenHalf:
-        return (CheckCKApplicability<F16, F32, F32, F32, F16, F32, F32>(bn_problem) != -1);
+    case miopenFloat: return CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return CheckCKApplicability<F64, F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenHalf: return CheckCKApplicability<F16, F32, F32, F32, F16, F32, F32>(bn_problem);
     case miopenBFloat16:
-        return (CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem) != -1);
+        return CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
     case miopenInt32:
     case miopenInt8:
     case miopenInt8x4:
@@ -236,48 +189,57 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
 #endif
 }
 
-ConvSolution
-BnCKBwdBackward::GetSolution(const ExecutionContext& context,
-                             const miopen::batchnorm::ProblemDescription& bn_problem) const
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType>
+ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem)
 {
-#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
-    std::ignore = context;
-    std::ignore = bn_problem;
-    return {};
-#else
-    std::ignore = context;
+    const auto& valid_kernel_ids = FillValidKernelsIDs<DeviceOpBNBwdPtrs<XDataType,
+                                                                         DxDataType,
+                                                                         DyDataType,
+                                                                         AccDataType,
+                                                                         ScaleDataType,
+                                                                         DscaleDbiasDataType,
+                                                                         MeanVarDataType>,
+                                                       CKArgsBNormBwd>(bn_problem);
+    assert(!valid_kernel_ids.empty());
+    const auto& kernel_id = valid_kernel_ids[0];
+    return InitAnyInvokerFactory<DeviceOpBNBwdPtrs<XDataType,
+                                                   DxDataType,
+                                                   DyDataType,
+                                                   AccDataType,
+                                                   ScaleDataType,
+                                                   DscaleDbiasDataType,
+                                                   MeanVarDataType>,
+                                 CKArgsBNormBwd,
+                                 miopen::batchnorm::BwdInvokeParams>(bn_problem, kernel_id);
+}
+
+ConvSolution BnCKBwdBackward::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+    switch(bn_problem.GetXDesc().GetType())
+    {
 
-    ConvSolution result;
-    result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
-        std::ignore = kernels;
-        return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
-            switch(bn_problem.GetXDesc().GetType())
-            {
-            case miopenFloat:
-                RunCKSolution<F32, F32, F32, F32, F32, F32, F32>(
-                    handle, primitive_parameters, bn_problem);
-                break;
-            case miopenDouble:
-                RunCKSolution<F64, F64, F64, F64, F64, F64, F64>(
-                    handle, primitive_parameters, bn_problem);
-                break;
-            case miopenHalf:
-                RunCKSolution<F16, F32, F32, F32, F16, F32, F32>(
-                    handle, primitive_parameters, bn_problem);
-                break;
-            case miopenBFloat16:
-                RunCKSolution<BF16, F32, F32, F32, BF16, F32, F32>(
-                    handle, primitive_parameters, bn_problem);
-                break;
-            case miopenInt8:
-            case miopenInt32:
-            case miopenInt8x4:
-            default: MIOPEN_THROW("Unsupported datatype");
-            }
-        };
-    };
-    return result;
+    case miopenFloat: return MakeAnyInvokerFactory<F32, F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return MakeAnyInvokerFactory<F64, F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenHalf: return MakeAnyInvokerFactory<F16, F32, F32, F32, F16, F32, F32>(bn_problem);
+    case miopenBFloat16:
+        return MakeAnyInvokerFactory<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
+    case miopenInt8:
+    case miopenInt32:
+    case miopenInt8x4:
+    default:
+        MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type");
+    }
 #endif
+    return {};
 }
 
 } // namespace batchnorm
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index eed9173541..219b18682c 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -279,15 +279,15 @@ void batchNormSpatialHostFwdTrain(const tensor<T>& input,
     });
 }
 
-template <class T, class U>
-void batchNormSpatialHostBwdTrain(const tensor<U>& x_input,
-                                  const tensor<T>& dy_input,
-                                  tensor<T>& dx_out,
-                                  const tensor<U>& scale,
-                                  tensor<T>& dscale,
-                                  tensor<T>& dbias,
-                                  const tensor<T>& savedMean,
-                                  const tensor<T>& savedInvVar)
+template <class DataType, class XAndScaleDataType>
+void batchNormSpatialHostBwdTrain(const tensor<XAndScaleDataType>& x_input,
+                                  const tensor<DataType>& dy_input,
+                                  tensor<DataType>& dx_out,
+                                  const tensor<XAndScaleDataType>& scale,
+                                  tensor<DataType>& dscale,
+                                  tensor<DataType>& dbias,
+                                  const tensor<DataType>& savedMean,
+                                  const tensor<DataType>& savedInvVar)
 {
 
     int height, width, n_batch, channels;
@@ -335,7 +335,7 @@ void batchNormSpatialHostBwdTrain(const tensor<U>& x_input,
                     double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0);
                     double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0);
                     double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw;
-                    dx_out(bidx, cidx, row, column) = static_cast<T>(tmp3 * (tmp2 + tmp1));
+                    dx_out(bidx, cidx, row, column) = static_cast<DataType>(tmp3 * (tmp2 + tmp1));
                 } // end for(n_batchs)
             }     // for (column)
         }         // for (row)

From d6b19eca4314677a23123a4c80cf2952e6b8bf0f Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 18 Sep 2023 17:01:16 +0000
Subject: [PATCH 21/28] bg/LWPMIOPEN-193_bn_back : fix tidy error

---
 src/include/miopen/solver/implicitgemm_ck_util.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp
index ce64ee5109..318d970170 100644
--- a/src/include/miopen/solver/implicitgemm_ck_util.hpp
+++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp
@@ -132,7 +132,7 @@ ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem,
     auto ptr_iter  = FindConvPtrByID(conv_ptrs, kernel_id);
 
     if(ptr_iter == conv_ptrs.end())
-        return ConvSolution(miopenStatusInvalidValue);
+        return {miopenStatusInvalidValue};
 
     ConvSolution result;
     result.invoker_factory =

From 87fb611e643df601dc747e7b54eea62636c96c8c Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Thu, 21 Sep 2023 13:48:32 +0000
Subject: [PATCH 22/28] bg/LWPMIOPEN-193_bn_back : make solver epsilon and test
 driver epsilon same

---
 src/solver/batchnorm/backward_ck.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index a0e3968a63..53f647f483 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -131,7 +131,7 @@ struct CKArgsBNormBwd
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
 
-    double epsilon = 0.0001;
+    double epsilon = 1e-5;
     std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
 };
 

From b77d93b0ebfb02e20a5b7a98a13acc3298a44bb1 Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Mon, 25 Sep 2023 14:11:30 +0000
Subject: [PATCH 23/28] bg/LWPMIOPEN-193_bn_back : add CK's cpu ref for batch
 norm backward test

---
 test/fusionHost.hpp            | 105 +++++++++++++++++++++++++++++++++
 test/gtest/bn.hpp              |   9 ++-
 test/gtest/test_operations.hpp |  25 +++++---
 3 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index 219b18682c..b446835b4b 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -42,6 +42,14 @@
 #include "verify.hpp"
 #include <miopen/fusion_plan.hpp>
 
+// add ck guard
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+
+#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp"
+
+
+
+
 template <class T>
 void convHostForward(const tensor<T>& input,
                      tensor<T>& output,
@@ -342,6 +350,103 @@ void batchNormSpatialHostBwdTrain(const tensor<XAndScaleDataType>& x_input,
     });           // for (channel)
 }
 
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType,
+          typename DLModule>
+void batchNormSpatialHostBwdTrainCK(DLModule& dl_module)
+{
+    // batchNormSpatialHostBwdTrain(dl_module.input,
+    //                              dl_module.x_input,
+    //                              dl_module.ref_out,
+    //                              dl_module.bnScale,
+    //                              dl_module.resBnScaleDiff,
+    //                              dl_module.resBnBiasDiff,
+    //                              dl_module.savedMean,
+    //                              dl_module.savedInvVariance);
+    
+    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
+
+    constexpr ck::index_t Rank         = 4;
+    constexpr ck::index_t NumReduceDim = 3;
+
+    using ReferenceBatchNormBwdInstance =
+            ck::tensor_operation::host::ReferenceBatchNormBwd<XDataType,
+                  DxDataType,
+                  DyDataType,
+                  AccDataType,
+                  ScaleDataType,
+                  DscaleDbiasDataType,
+                  MeanVarDataType,
+                                                              PassThroughOp,
+                                                              Rank,
+                                                              NumReduceDim>;
+
+        auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};
+        std::array<int, NumReduceDim> arrReduceDims{0, 1, 2};
+        
+        std::array<ck::index_t, Rank - NumReduceDim> arrScaleBiasMeanVarLengths;
+        std::array<ck::index_t, Rank - NumReduceDim> arrScaleBiasMeanVarStrides;
+
+        arrScaleBiasMeanVarLengths[0] = dl_module.input.desc.GetLengths()[1]; // get channel
+        arrScaleBiasMeanVarStrides[0] = 1;
+
+        auto derivedBnDesc = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
+                                         dl_module.input.desc,
+                                         dl_module.bn_mode);
+        
+        std::array<ck::index_t, Rank> xyLengths; // inOutLengths
+        std::array<ck::index_t, Rank> xyStrides;
+
+        std::copy(dl_module.input.desc.GetLengths().begin(),
+                  dl_module.input.desc.GetLengths().end(),
+                  xyLengths.begin());
+
+        std::copy(dl_module.input.desc.GetStrides().begin(),
+                  dl_module.input.desc.GetStrides().end(),
+                  xyStrides.begin());
+
+        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
+        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+        
+        auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
+            xyLengths,
+            xyStrides,
+            xyStrides,
+            xyStrides,
+            arrReduceDims,
+            arrScaleBiasMeanVarLengths,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            arrScaleBiasMeanVarStrides,
+            dl_module.input.data.data(),
+            dl_module.dy.data.data(),
+            dl_module.bnScale.data.data(),
+            dl_module.savedMean.data.data(),
+            dl_module.savedInvVar.data.data(),
+            dl_module.epsilon,
+            PassThroughOp{},
+            dl_module.ref_out.data.data(),
+            dl_module.dScale_ref.data.data(),
+            dl_module.dBias_ref.data.data());
+
+        if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
+        {
+            std::cerr << "The runtime parameters not supported by the reference instance, exiting!"
+                      << std::endl;
+            exit(1);
+        };
+
+        auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();
+
+        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
+}
+
 template <class T, class U>
 void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode,
                                        double gamma,
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index fb1bf69171..2f0f67af16 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -139,7 +139,14 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
         bn_bwd_test_data.dBias.data =
             handle.Read<DxDataType>(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size());
 
-        test::ComputeCPUBNBwd(bn_bwd_test_data);
+        test::ComputeCPUBNBwd<XDataType,
+                  DxDataType,
+                  DyDataType,
+                  AccDataType,
+                  ScaleDataType,
+                  DscaleDbiasDataType,
+                  MeanVarDataType>(bn_bwd_test_data);
+        
         // using tolerance = 1e-4 since this the tolerance CK uses
         test::CompareTensor<DxDataType>(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4);
         test::CompareTensor<DxDataType>(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4);
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 66c306651b..ad33833da2 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -38,17 +38,24 @@ void ComputeCPUBNInference(DLModule& dl_module)
                                   dl_module.estVariance);
 }
 
-template <typename DLModule>
+template <typename XDataType,
+          typename DxDataType,
+          typename DyDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename DscaleDbiasDataType,
+          typename MeanVarDataType, 
+          typename DLModule>
 void ComputeCPUBNBwd(DLModule& dl_module)
 {
-    batchNormSpatialHostBwdTrain(dl_module.input,
-                                 dl_module.dy,
-                                 dl_module.ref_out,
-                                 dl_module.bnScale,
-                                 dl_module.dScale_ref,
-                                 dl_module.dBias_ref,
-                                 dl_module.savedMean,
-                                 dl_module.savedInvVar);
+    batchNormSpatialHostBwdTrainCK<XDataType,
+                  DxDataType,
+                  DyDataType,
+                  AccDataType,
+                  ScaleDataType,
+                  DscaleDbiasDataType,
+                  MeanVarDataType,
+                  DLModule>(dl_module);
 }
 
 template <typename T>

From 8929dbb03da624737fd80a30857412889ef36ffe Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Wed, 27 Sep 2023 14:32:32 +0000
Subject: [PATCH 24/28] bg/LWPMIOPEN-193_bn_back: moved
 bn_spatial_nhwc_test.cpp to gtest

---
 test/bn_spatial_nhwc_test.cpp | 749 ----------------------------------
 test/fusionHost.hpp           |  10 +-
 2 files changed, 1 insertion(+), 758 deletions(-)
 delete mode 100644 test/bn_spatial_nhwc_test.cpp

diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp
deleted file mode 100644
index abca57e7ce..0000000000
--- a/test/bn_spatial_nhwc_test.cpp
+++ /dev/null
@@ -1,749 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-
-#include "driver.hpp"
-#include "get_handle.hpp"
-#include "tensor_holder.hpp"
-#include "test.hpp"
-#include "verify.hpp"
-#include "random.hpp"
-#include <array>
-#include <cmath>
-#include <ctime>
-#include <iomanip>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <miopen/batch_norm.hpp>
-#include <miopen/miopen.h>
-#include <miopen/tensor.hpp>
-#include <miopen/tensor_layout.hpp>
-#include <utility>
-#include <cfloat>
-
-#define MIO_BN_TEST_EXPAVGFACTOR 0.1
-#define MIO_BN_TEST_EPSILON 1e-5
-#define MIO_BN_USE_MIX_PREC 1
-#if MIO_BN_USE_MIX_PREC == 1
-#define PREC_TYPE float
-#else
-#define PREC_TYPE T
-#endif
-
-template <class T, class U>
-struct verify_forward_train_bn_spatial
-{
-    const tensor<T> input;
-    const tensor<U> scale;
-    const tensor<U> shift;
-
-    std::tuple<tensor<T>, tensor<U>, tensor<U>, tensor<U>, tensor<U>> cpu() const
-    {
-        double epsilon      = MIO_BN_TEST_EPSILON;
-        double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR;
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths());
-
-        std::size_t rs_n_batch, rs_channels, rs_height, rs_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(rs_n_batch, rs_height, rs_width, rs_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        tensor<U> runMean;
-        tensor<U> runVar;
-        if(input.desc.GetType() == miopenFloat)
-        {
-            runMean = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels}.generate(
-                tensor_elem_gen_integer{17});
-            runVar = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels}.generate(
-                tensor_elem_gen_integer{17});
-        }
-        else
-        {
-            prng::reset_seed();
-            runMean = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-            runVar  = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-
-            const U Data_scale = static_cast<U>(0.001);
-            for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++)
-            {
-                runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100);
-                runVar[i]  = prng::gen_descreet_unsigned(Data_scale, 100);
-            }
-        }
-        auto saveMean   = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-        auto saveInvVar = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-        auto out        = input;
-        std::fill(out.begin(), out.end(), 0);
-
-        const auto nhw = double(height * width * n_batch);
-        par_for(channels, 1, [&](int cidx) {
-            double elemStd        = 0.;
-            double variance_accum = 0.;
-            double mean_accum     = 0.;
-            double invVar         = 0.;
-            double newRunMean     = 0.;
-            double adjust         = 0.;
-
-            std::vector<double> variance_accum_arr(height, 0.0);
-            std::vector<double> mean_accum_arr(height, 0.0);
-            std::vector<double> dshift_accum_arr(height, 0.0);
-            std::vector<double> dscale_accum_arr(height, 0.0);
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        mean_accum_arr[row] += input(bidx, cidx, row, column);
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-                mean_accum += mean_accum_arr[i];
-
-            mean_accum /= nhw;
-
-            elemStd        = 0.;
-            variance_accum = 0.;
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        out(bidx, cidx, row, column) = elemStd =
-                            input(bidx, cidx, row, column) - mean_accum;
-                        variance_accum_arr[row] += elemStd * elemStd;
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-                variance_accum += variance_accum_arr[i];
-
-            variance_accum /= nhw;
-            invVar = 1.0 / sqrt(variance_accum + epsilon);
-
-            for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-            {
-                for(std::size_t row = 0; row < height; row++)
-                {
-                    for(std::size_t column = 0; column < width; column++)
-                    {
-                        out(bidx, cidx, row, column) =
-                            scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) +
-                            shift(0, 0, 0, cidx);
-                    }
-                }
-            }
-
-            saveMean(0, 0, 0, cidx)   = mean_accum;
-            saveInvVar(0, 0, 0, cidx) = invVar;
-
-            newRunMean             = runMean(0, 0, 0, cidx) * (1 - expAvgFactor);
-            runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean;
-            adjust                 = (n_batch * height * width == 1) ? variance_accum
-                                                                     : (nhw / (nhw - 1)) * variance_accum;
-            runVar(0, 0, 0, cidx) =
-                (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust;
-        });
-
-        return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar);
-    }
-
-    std::tuple<tensor<T>, tensor<U>, tensor<U>, tensor<U>, tensor<U>> gpu() const
-    {
-        auto&& handle = get_handle();
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths());
-
-        auto out = input;
-        std::fill(out.begin(), out.end(), 0);
-
-        std::size_t rs_n_batch, rs_channels, rs_height, rs_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(rs_n_batch, rs_height, rs_width, rs_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        tensor<U> runMean;
-        tensor<U> runVar;
-        if(input.desc.GetType() == miopenFloat)
-        {
-            runMean = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels}.generate(
-                tensor_elem_gen_integer{17});
-            runVar = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels}.generate(
-                tensor_elem_gen_integer{17});
-        }
-        else
-        {
-            prng::reset_seed();
-            runMean = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-            runVar  = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-
-            const U Data_scale = static_cast<U>(0.001);
-            for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++)
-            {
-                runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100);
-                runVar[i]  = prng::gen_descreet_unsigned(Data_scale, 100);
-            }
-        }
-
-        auto saveMean   = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-        auto saveInvVar = tensor<U>{rs_n_batch, rs_height, rs_width, rs_channels};
-
-        auto in_dev    = handle.Write(input.data);
-        auto scale_dev = handle.Write(scale.data);
-        auto shift_dev = handle.Write(shift.data);
-
-        auto runMean_dev    = handle.Write(runMean.data);
-        auto runVar_dev     = handle.Write(runVar.data);
-        auto saveMean_dev   = handle.Create<U>(channels);
-        auto saveInvVar_dev = handle.Create<U>(channels);
-        auto out_dev        = handle.Create<T>(n_batch * channels * height * width);
-
-        double epsilon      = MIO_BN_TEST_EPSILON;
-        double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR;
-
-        float alpha = 1.0;
-        float beta  = 0.0;
-
-        miopen::BatchNormForwardTraining(handle,
-                                         miopenBNSpatial,
-                                         &alpha,
-                                         &beta,
-                                         input.desc,
-                                         in_dev.get(),
-                                         out.desc,
-                                         out_dev.get(),
-                                         scale.desc,
-                                         scale_dev.get(),
-                                         shift_dev.get(),
-                                         expAvgFactor,
-                                         runMean_dev.get(),
-                                         runVar_dev.get(),
-                                         epsilon,
-                                         saveMean_dev.get(),
-                                         saveInvVar_dev.get());
-
-        saveMean.data   = handle.Read<U>(saveMean_dev, saveMean.data.size());
-        saveInvVar.data = handle.Read<U>(saveInvVar_dev, saveInvVar.data.size());
-        runMean.data    = handle.Read<U>(runMean_dev, runMean.data.size());
-        runVar.data     = handle.Read<U>(runVar_dev, runVar.data.size());
-        out.data        = handle.Read<T>(out_dev, out.data.size());
-
-        return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar);
-    }
-
-    void fail(int badtensor) const
-    {
-        std::cout << "Forward Train Spatial Batch Normalization: " << std::endl;
-        std::cout << "Input tensor: " << input.desc.ToString() << std::endl;
-
-        switch(badtensor)
-        {
-        case(0): std::cout << "Output tensor output failed verification." << std::endl; break;
-        case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break;
-        case(2):
-            std::cout << "Running Variance output tensor failed verification." << std::endl;
-            break;
-        case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break;
-        case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break;
-        default: break;
-        }
-    }
-};
-
-template <class T, class U>
-struct verify_backward_bn_spatial_recalc
-{
-    const tensor<T> x_input;
-    const tensor<T> dy_input;
-    const tensor<U> scale;
-
-    std::tuple<tensor<T>, tensor<U>, tensor<U>> cpu() const
-    {
-        double epsilon = MIO_BN_TEST_EPSILON;
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths());
-
-        std::size_t ss_n_batch, ss_channels, ss_height, ss_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(x_input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(ss_n_batch, ss_height, ss_width, ss_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        auto dx_out = dy_input;
-        std::fill(dx_out.begin(), dx_out.end(), 0);
-
-        auto dscale = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dscale.begin(), dscale.end(), 0);
-
-        auto dshift = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dshift.begin(), dshift.end(), 0);
-
-        const auto nhw = double(height * width * n_batch);
-
-        par_for(channels, 1, [&](int cidx) {
-            double elemStd = 0.;
-            unsigned int xhat_index;
-            double mean     = 0.;
-            double invVar   = 0.;
-            double dyelem   = 0.;
-            double variance = 0.;
-
-            std::vector<double> xhat(height * width * n_batch, 0.0);
-            std::vector<double> variance_accum_arr(height, 0.0);
-            std::vector<double> mean_accum_arr(height, 0.0);
-            std::vector<double> dshift_accum_arr(height, 0.0);
-            std::vector<double> dscale_accum_arr(height, 0.0);
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        mean_accum_arr[row] += x_input(bidx, cidx, row, column);
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-                mean += mean_accum_arr[i];
-
-            mean /= nhw;
-
-            elemStd  = 0.;
-            variance = 0.;
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        elemStd = x_input(bidx, cidx, row, column) - mean;
-                        variance_accum_arr[row] += elemStd * elemStd;
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-                variance += variance_accum_arr[i];
-
-            variance /= nhw;
-            invVar = 1. / double(sqrt(variance + epsilon));
-
-            dscale(0, cidx, 0, 0) = 0.;
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        xhat_index       = height * width * bidx + (width * row + column);
-                        elemStd          = x_input(bidx, cidx, row, column) - mean;
-                        xhat[xhat_index] = elemStd * invVar;
-                        dyelem           = dy_input(bidx, cidx, row, column);
-                        dshift_accum_arr[row] += dyelem;
-                        dscale_accum_arr[row] += xhat[xhat_index] * dyelem;
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-            {
-                dshift(0, cidx, 0, 0) += dshift_accum_arr[i];
-                dscale(0, cidx, 0, 0) += dscale_accum_arr[i];
-            }
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        xhat_index = height * width * bidx + (width * row + column);
-
-                        double tmp1 =
-                            nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0);
-                        double tmp2                     = -xhat[xhat_index] * dscale(0, cidx, 0, 0);
-                        double tmp3                     = (scale(0, 0, 0, cidx) * invVar) / nhw;
-                        dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1);
-                    }
-                }
-            }
-        });
-
-        return std::make_tuple(dx_out, dscale, dshift);
-    }
-
-    std::tuple<tensor<T>, tensor<U>, tensor<U>> gpu() const
-    {
-        auto&& handle = get_handle();
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths());
-
-        auto dx_out = dy_input;
-        std::fill(dx_out.begin(), dx_out.end(), 0);
-
-        std::size_t ss_n_batch, ss_channels, ss_height, ss_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(x_input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(ss_n_batch, ss_height, ss_width, ss_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        auto dscale = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dscale.begin(), dscale.end(), 0);
-
-        auto dshift = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dshift.begin(), dshift.end(), 0);
-
-        float alpha = 1.0;
-        float beta  = 0.0;
-
-        auto xin_dev    = handle.Write(x_input.data);
-        auto dyin_dev   = handle.Write(dy_input.data);
-        auto scale_dev  = handle.Write(scale.data);
-        auto dscale_dev = handle.Write(dscale.data);
-        auto dshift_dev = handle.Write(dshift.data);
-        auto dx_out_dev = handle.Write(dx_out.data);
-
-        double epsilon = MIO_BN_TEST_EPSILON;
-
-        miopen::BatchNormBackward(handle,
-                                  miopenBNSpatial,
-                                  &alpha,
-                                  &beta,
-                                  &alpha,
-                                  &beta,
-                                  x_input.desc,
-                                  xin_dev.get(),
-                                  dy_input.desc,
-                                  dyin_dev.get(),
-                                  dx_out.desc,
-                                  dx_out_dev.get(),
-                                  scale.desc,
-                                  scale_dev.get(),
-                                  dscale_dev.get(),
-                                  dshift_dev.get(),
-                                  epsilon,
-                                  nullptr,
-                                  nullptr);
-
-        dx_out.data = handle.Read<T>(dx_out_dev, dx_out.data.size());
-        dscale.data = handle.Read<U>(dscale_dev, dscale.data.size());
-        dshift.data = handle.Read<U>(dshift_dev, dshift.data.size());
-
-        return std::make_tuple(dx_out, dscale, dshift);
-    }
-
-    void fail(int badtensor) const
-    {
-        std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl;
-        std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl;
-        std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl;
-        switch(badtensor)
-        {
-        case(0):
-            std::cout << "Delta X output tensor output failed verification." << std::endl;
-            break;
-        case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break;
-        case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break;
-        default: break;
-        }
-    }
-};
-
-template <class T, class U>
-struct verify_backward_bn_spatial_use_saved
-{
-    const tensor<T> x_input;
-    const tensor<T> dy_input;
-    const tensor<U> scale;
-    const tensor<U> savedMean;
-    const tensor<U> savedInvVar;
-    std::tuple<tensor<T>, tensor<U>, tensor<U>> cpu() const
-    {
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths());
-
-        auto dx_out = dy_input;
-        std::fill(dx_out.begin(), dx_out.end(), 0);
-
-        std::size_t ss_n_batch, ss_channels, ss_height, ss_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(x_input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(ss_n_batch, ss_height, ss_width, ss_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        auto dscale = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dscale.begin(), dscale.end(), 0);
-
-        auto dshift = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dshift.begin(), dshift.end(), 0);
-
-        const auto nhw = double(height * width * n_batch);
-
-        par_for(channels, 1, [&](int cidx) {
-            double elemStd = 0.;
-            unsigned int xhat_index;
-            double mean   = savedMean(0, 0, 0, cidx);
-            double invVar = savedInvVar(0, 0, 0, cidx);
-            double dyelem = 0.;
-
-            std::vector<double> xhat(n_batch * height * width, 0.0);
-            std::vector<double> dshift_accum_arr(height, 0.0);
-            std::vector<double> dscale_accum_arr(height, 0.0);
-            dscale(0, cidx, 0, 0) = 0.;
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        xhat_index       = height * width * bidx + (width * row + column);
-                        elemStd          = x_input(bidx, cidx, row, column) - mean;
-                        xhat[xhat_index] = elemStd * invVar;
-                        dyelem           = dy_input(bidx, cidx, row, column);
-                        dshift_accum_arr[row] += dyelem;
-                        dscale_accum_arr[row] += xhat[xhat_index] * dyelem;
-                    }
-                }
-            }
-            for(std::size_t i = 0; i < height; i++)
-            {
-                dshift(0, cidx, 0, 0) += dshift_accum_arr[i];
-                dscale(0, cidx, 0, 0) += dscale_accum_arr[i];
-            }
-
-            for(std::size_t row = 0; row < height; row++)
-            {
-                for(std::size_t column = 0; column < width; column++)
-                {
-                    for(std::size_t bidx = 0; bidx < n_batch; bidx++)
-                    {
-                        xhat_index = height * width * bidx + (width * row + column);
-
-                        double tmp1 =
-                            nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0);
-                        double tmp2                     = -xhat[xhat_index] * dscale(0, cidx, 0, 0);
-                        double tmp3                     = (scale(0, 0, 0, cidx) * invVar) / nhw;
-                        dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1);
-                    }
-                }
-            }
-        });
-
-        return std::make_tuple(dx_out, dscale, dshift);
-    }
-
-    std::tuple<tensor<T>, tensor<U>, tensor<U>> gpu() const
-    {
-        auto&& handle = get_handle();
-
-        std::size_t n_batch, channels, height, width;
-        std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths());
-
-        auto dx_out = dy_input;
-        std::fill(dx_out.begin(), dx_out.end(), 0);
-
-        std::size_t ss_n_batch, ss_channels, ss_height, ss_width;
-        auto derivedBnDesc =
-            miopen::TensorDescriptor(x_input.desc.GetType(),
-                                     std::vector<std::size_t>{1, 1, 1, channels},
-                                     std::vector<std::size_t>{channels, channels, channels, 1});
-        std::tie(ss_n_batch, ss_height, ss_width, ss_channels) =
-            miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        auto dscale = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dscale.begin(), dscale.end(), 0);
-
-        auto dshift = tensor<U>{ss_n_batch, ss_channels, ss_height, ss_width};
-        std::fill(dshift.begin(), dshift.end(), 0);
-
-        float alpha = 1.0;
-        float beta  = 0.0;
-
-        auto xin_dev         = handle.Write(x_input.data);
-        auto dyin_dev        = handle.Write(dy_input.data);
-        auto scale_dev       = handle.Write(scale.data);
-        auto dscale_dev      = handle.Write(dscale.data);
-        auto dshift_dev      = handle.Write(dshift.data);
-        auto dx_out_dev      = handle.Write(dx_out.data);
-        auto savedMean_dev   = handle.Write(savedMean.data);
-        auto savedInvVar_dev = handle.Write(savedInvVar.data);
-
-        double epsilon = MIO_BN_TEST_EPSILON;
-
-        miopen::BatchNormBackward(handle,
-                                  miopenBNSpatial,
-                                  &alpha,
-                                  &beta,
-                                  &alpha,
-                                  &beta,
-                                  x_input.desc,
-                                  xin_dev.get(),
-                                  dy_input.desc,
-                                  dyin_dev.get(),
-                                  dx_out.desc,
-                                  dx_out_dev.get(),
-                                  scale.desc,
-                                  scale_dev.get(),
-                                  dscale_dev.get(),
-                                  dshift_dev.get(),
-                                  epsilon,
-                                  savedMean_dev.get(),
-                                  savedInvVar_dev.get());
-
-        dx_out.data = handle.Read<T>(dx_out_dev, dx_out.data.size());
-        dscale.data = handle.Read<U>(dscale_dev, dscale.data.size());
-        dshift.data = handle.Read<U>(dshift_dev, dshift.data.size());
-
-        return std::make_tuple(dx_out, dscale, dshift);
-    }
-
-    void fail(int badtensor) const
-    {
-        std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: "
-                  << std::endl;
-        std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl;
-        std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl;
-        switch(badtensor)
-        {
-        case(0):
-            std::cout << "Delta X output tensor output failed verification." << std::endl;
-            break;
-        case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break;
-        case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break;
-        default: break;
-        }
-    }
-};
-
-template <class T>
-struct batch_norm_spatial_nhwc_driver : test_driver
-{
-    tensor<T> input;
-    tensor<PREC_TYPE> scale;
-    tensor<PREC_TYPE> shift;
-    batch_norm_spatial_nhwc_driver()
-    {
-        this->batch_factor = 4;
-        add(input,
-            "input",
-            get_bn_spatial_input_tensor(
-                tensor_elem_gen_integer{miopen_type<T>{} == miopenHalf ? 5 : 17}));
-    }
-
-    void run()
-    {
-        std::size_t n, c, h, w;
-        std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths());
-
-        std::size_t ssn, ssc, ssh, ssw;
-        auto derivedBnDesc           = miopen::TensorDescriptor(input.desc.GetType(),
-                                                      std::vector<std::size_t>{1, 1, 1, c},
-                                                      std::vector<std::size_t>{c, c, c, 1});
-        std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths());
-
-        std::vector<std::size_t> new_len = input.desc.GetLengths();
-        std::vector<std::size_t> new_str;
-        miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str);
-        input.desc = miopen::TensorDescriptor(miopen_type<T>{}, new_len, new_str);
-
-        if(input.desc.GetType() == miopenFloat)
-        {
-            scale = tensor<PREC_TYPE>{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17});
-            shift = tensor<PREC_TYPE>{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17});
-        }
-        else
-        {
-            scale = tensor<PREC_TYPE>{ssn, ssh, ssw, ssc};
-            shift = tensor<PREC_TYPE>{ssn, ssh, ssw, ssc};
-
-            const PREC_TYPE Data_scale = static_cast<PREC_TYPE>(1e-4);
-            for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++)
-            {
-                scale[i] = prng::gen_descreet_uniform_sign(Data_scale, 100);
-                shift[i] = prng::gen_descreet_uniform_sign(Data_scale, 100);
-            }
-            for(std::size_t i = 0; i < input.desc.GetElementSize(); i++)
-            {
-                input[i] = prng::gen_descreet_uniform_sign(static_cast<T>(1e-5), 100);
-            }
-        }
-
-        auto outpair = verify(verify_forward_train_bn_spatial<T, PREC_TYPE>{input, scale, shift});
-
-        auto dy_input = std::get<0>(outpair.second);
-        for(std::size_t bidx = 0; bidx < n; bidx++)
-        {
-            for(std::size_t cidx = 0; cidx < c; cidx++)
-            {
-                for(std::size_t row = 0; row < h; row++)
-                {
-                    for(std::size_t column = 0; column < w; column++)
-                    {
-                        dy_input(bidx, cidx, row, column) *= 0.1;
-                    }
-                }
-            }
-        }
-        this->tolerance = 80 * input.desc.GetElementSize();
-        verify(verify_backward_bn_spatial_recalc<T, PREC_TYPE>{input, dy_input, scale});
-
-        auto savedMean   = std::get<3>(outpair.second);
-        auto savedInvVar = std::get<4>(outpair.second);
-        verify(verify_backward_bn_spatial_use_saved<T, PREC_TYPE>{
-            input, dy_input, scale, savedMean, savedInvVar});
-    }
-};
-
-int main(int argc, const char* argv[])
-{
-    test_drive<batch_norm_spatial_nhwc_driver>(argc, argv);
-    return 0;
-}
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index b446835b4b..95d3645c72 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -360,15 +360,7 @@ template <typename XDataType,
           typename DLModule>
 void batchNormSpatialHostBwdTrainCK(DLModule& dl_module)
 {
-    // batchNormSpatialHostBwdTrain(dl_module.input,
-    //                              dl_module.x_input,
-    //                              dl_module.ref_out,
-    //                              dl_module.bnScale,
-    //                              dl_module.resBnScaleDiff,
-    //                              dl_module.resBnBiasDiff,
-    //                              dl_module.savedMean,
-    //                              dl_module.savedInvVariance);
-    
+
     using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
 
     constexpr ck::index_t Rank         = 4;

From f348749eaed24fb404cfc82fdd92b1edaecfa7ca Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Wed, 27 Sep 2023 14:56:49 +0000
Subject: [PATCH 25/28] bg/LWPMIOPEN-193_bn_back : add new rand

---
 test/gtest/bn_test_data.hpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index bc77447434..b4950d6af0 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -24,8 +24,7 @@
  *
  *******************************************************************************/
 #pragma once
-
-#include <random>
+#include "random.hpp"
 
 #include <miopen/miopen.h>
 #include <miopen/solver_id.hpp>
@@ -60,7 +59,7 @@ std::vector<BNTestCase> Network1()
 {
     // pyt_mlperf_resnet50v1.5
     return {
-        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0},
+        {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0},
         {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0},
         {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0},
         {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
@@ -307,17 +306,16 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
 
     void InitTensorsWithRandValue()
     {
-        std::random_device rd{};
-        std::mt19937 gen{rd()};
-        std::uniform_int_distribution<> d{0, 100};
-        auto gen_value = [&](auto...) {
-            return 1e-2 * static_cast<ScaleDataType>(d(gen)) * ((d(gen) % 2 == 1) ? -1 : 1);
+        auto gen_value = [](auto...) {
+            return prng::gen_descreet_uniform_sign(static_cast<ScaleDataType>(1e-2), 100);
         };
         dy.generate(gen_value);
         bnScale.generate(gen_value);
         savedMean.generate(gen_value);
 
-        auto gen_var = [&](auto...) { return 1e-2 * (static_cast<MeanVarDataType>(d(gen)) + 1); };
+        auto gen_var = [](auto...) {
+            return static_cast<MeanVarDataType>(1e-2) * static_cast<MeanVarDataType>(prng::gen_0_to_B(100) + 1);
+        };
         savedInvVar.generate(gen_var);
 
         std::fill(dScale.begin(), dScale.end(), 0.);

From 6874015cad5fceef6d9b7067c31f74af819843cd Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Wed, 27 Sep 2023 15:19:40 +0000
Subject: [PATCH 26/28] bg/LWPMIOPEN-193_bn_back: handle all type in switch
 case of solver

---
 src/solver/batchnorm/backward_ck.cpp |  4 ++
 test/fusionHost.hpp                  | 98 ----------------------------
 test/gtest/test_operations.hpp       | 18 ++---
 3 files changed, 13 insertions(+), 107 deletions(-)

diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index 53f647f483..fba8724990 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -183,6 +183,8 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
     case miopenInt32:
     case miopenInt8:
     case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
     }
     return false;
@@ -235,6 +237,8 @@ ConvSolution BnCKBwdBackward::GetSolution(
     case miopenInt8:
     case miopenInt32:
     case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
     default:
         MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type");
     }
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index 95d3645c72..f3219048fb 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -36,20 +36,11 @@
 #include <miopen/miopen.h>
 #include <miopen/tensor.hpp>
 #include <utility>
-// #include "driver.hpp"
 #include "get_handle.hpp"
 #include "tensor_holder.hpp"
 #include "verify.hpp"
 #include <miopen/fusion_plan.hpp>
 
-// add ck guard
-#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
-
-#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp"
-
-
-
-
 template <class T>
 void convHostForward(const tensor<T>& input,
                      tensor<T>& output,
@@ -350,95 +341,6 @@ void batchNormSpatialHostBwdTrain(const tensor<XAndScaleDataType>& x_input,
     });           // for (channel)
 }
 
-template <typename XDataType,
-          typename DxDataType,
-          typename DyDataType,
-          typename AccDataType,
-          typename ScaleDataType,
-          typename DscaleDbiasDataType,
-          typename MeanVarDataType,
-          typename DLModule>
-void batchNormSpatialHostBwdTrainCK(DLModule& dl_module)
-{
-
-    using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
-
-    constexpr ck::index_t Rank         = 4;
-    constexpr ck::index_t NumReduceDim = 3;
-
-    using ReferenceBatchNormBwdInstance =
-            ck::tensor_operation::host::ReferenceBatchNormBwd<XDataType,
-                  DxDataType,
-                  DyDataType,
-                  AccDataType,
-                  ScaleDataType,
-                  DscaleDbiasDataType,
-                  MeanVarDataType,
-                                                              PassThroughOp,
-                                                              Rank,
-                                                              NumReduceDim>;
-
-        auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{};
-        std::array<int, NumReduceDim> arrReduceDims{0, 1, 2};
-        
-        std::array<ck::index_t, Rank - NumReduceDim> arrScaleBiasMeanVarLengths;
-        std::array<ck::index_t, Rank - NumReduceDim> arrScaleBiasMeanVarStrides;
-
-        arrScaleBiasMeanVarLengths[0] = dl_module.input.desc.GetLengths()[1]; // get channel
-        arrScaleBiasMeanVarStrides[0] = 1;
-
-        auto derivedBnDesc = miopen::TensorDescriptor{};
-        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
-                                         dl_module.input.desc,
-                                         dl_module.bn_mode);
-        
-        std::array<ck::index_t, Rank> xyLengths; // inOutLengths
-        std::array<ck::index_t, Rank> xyStrides;
-
-        std::copy(dl_module.input.desc.GetLengths().begin(),
-                  dl_module.input.desc.GetLengths().end(),
-                  xyLengths.begin());
-
-        std::copy(dl_module.input.desc.GetStrides().begin(),
-                  dl_module.input.desc.GetStrides().end(),
-                  xyStrides.begin());
-
-        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
-        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
-        
-        auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer(
-            xyLengths,
-            xyStrides,
-            xyStrides,
-            xyStrides,
-            arrReduceDims,
-            arrScaleBiasMeanVarLengths,
-            arrScaleBiasMeanVarStrides,
-            arrScaleBiasMeanVarStrides,
-            arrScaleBiasMeanVarStrides,
-            dl_module.input.data.data(),
-            dl_module.dy.data.data(),
-            dl_module.bnScale.data.data(),
-            dl_module.savedMean.data.data(),
-            dl_module.savedInvVar.data.data(),
-            dl_module.epsilon,
-            PassThroughOp{},
-            dl_module.ref_out.data.data(),
-            dl_module.dScale_ref.data.data(),
-            dl_module.dBias_ref.data.data());
-
-        if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get()))
-        {
-            std::cerr << "The runtime parameters not supported by the reference instance, exiting!"
-                      << std::endl;
-            exit(1);
-        };
-
-        auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer();
-
-        (void)invoker_ptr_ref->Run(argument_ptr_ref.get());
-}
-
 template <class T, class U>
 void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode,
                                        double gamma,
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index ad33833da2..5d1e62a5b8 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -44,18 +44,18 @@ template <typename XDataType,
           typename AccDataType,
           typename ScaleDataType,
           typename DscaleDbiasDataType,
-          typename MeanVarDataType, 
+          typename MeanVarDataType,
           typename DLModule>
 void ComputeCPUBNBwd(DLModule& dl_module)
 {
-    batchNormSpatialHostBwdTrainCK<XDataType,
-                  DxDataType,
-                  DyDataType,
-                  AccDataType,
-                  ScaleDataType,
-                  DscaleDbiasDataType,
-                  MeanVarDataType,
-                  DLModule>(dl_module);
+    batchNormSpatialHostBwdTrain(dl_module.input,
+                                 dl_module.dy,
+                                 dl_module.ref_out,
+                                 dl_module.bnScale,
+                                 dl_module.dScale_ref,
+                                 dl_module.dBias_ref,
+                                 dl_module.savedMean,
+                                 dl_module.savedInvVar);
 }
 
 template <typename T>

From 11f66e92a0cec484e171e5ace64e0b3c5ee7aaab Mon Sep 17 00:00:00 2001
From: Bibek Ghimire <bghimire@amd.com>
Date: Wed, 27 Sep 2023 15:25:52 +0000
Subject: [PATCH 27/28] bg/LWPMIOPEN-193_bn_back: clang format

---
 test/gtest/bn.hpp           | 14 +++++++-------
 test/gtest/bn_test_data.hpp |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index 2f0f67af16..a89e94718e 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -140,13 +140,13 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
             handle.Read<DxDataType>(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size());
 
         test::ComputeCPUBNBwd<XDataType,
-                  DxDataType,
-                  DyDataType,
-                  AccDataType,
-                  ScaleDataType,
-                  DscaleDbiasDataType,
-                  MeanVarDataType>(bn_bwd_test_data);
-        
+                              DxDataType,
+                              DyDataType,
+                              AccDataType,
+                              ScaleDataType,
+                              DscaleDbiasDataType,
+                              MeanVarDataType>(bn_bwd_test_data);
+
         // using tolerance = 1e-4 since this the tolerance CK uses
         test::CompareTensor<DxDataType>(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4);
         test::CompareTensor<DxDataType>(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4);
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index b4950d6af0..822369725e 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -314,7 +314,8 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
         savedMean.generate(gen_value);
 
         auto gen_var = [](auto...) {
-            return static_cast<MeanVarDataType>(1e-2) * static_cast<MeanVarDataType>(prng::gen_0_to_B(100) + 1);
+            return static_cast<MeanVarDataType>(1e-2) *
+                   static_cast<MeanVarDataType>(prng::gen_0_to_B(100) + 1);
         };
         savedInvVar.generate(gen_var);
 

From d5eb31cb97a0c170ccf3e70678d7340d73487ae2 Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Thu, 5 Oct 2023 00:08:47 -0500
Subject: [PATCH 28/28] bg/LWPMIOPEN-192: Integrate CK's batch norm forward
 training into non-tunable MIOpen solver  (#2386)

* bg/LWPMIOPEN-192: add batch norm foward CK kernel

* bg/LWPMIOPEN-192 : analyze cleanup

* fix a typo

* bg/LWPMIOPEN-192: fix review comments

* bg/LWPMIOPEN-192 : fix compile error

* bg/LWPMIOPEN-192 : fix clang tidy

---------

Co-authored-by: Jun Liu <Liu.Jun@amd.com>
---
 src/CMakeLists.txt                           |   1 +
 src/include/miopen/batchnorm/solvers.hpp     |  10 +
 src/ocl/batchnormocl.cpp                     |   3 +-
 src/solver.cpp                               |   1 +
 src/solver/batchnorm/forward_training_ck.cpp | 239 +++++++++++++++++++
 test/fusionHost.hpp                          |  10 +-
 test/gtest/bn.hpp                            |  91 ++++++-
 test/gtest/bn_fwd_train.cpp                  |  73 ++++++
 test/gtest/bn_test_data.hpp                  | 103 ++++++++
 test/gtest/test_operations.hpp               |  15 ++
 10 files changed, 539 insertions(+), 7 deletions(-)
 create mode 100644 src/solver/batchnorm/forward_training_ck.cpp
 create mode 100644 test/gtest/bn_fwd_train.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 554c8ff4ec..abc0679a8a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -164,6 +164,7 @@ set( MIOpen_Source
     solver/batchnorm/forward_per_activation_fused.cpp
     solver/batchnorm/forward_spatial_multiple.cpp
     solver/batchnorm/forward_spatial_single.cpp
+    solver/batchnorm/forward_training_ck.cpp
     solver/conv_asm_1x1u.cpp
     solver/conv_asm_1x1u_bias_activ_fused.cpp
     solver/conv_asm_1x1u_stride2.cpp
diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp
index 51fa1d1643..70d64bb204 100644
--- a/src/include/miopen/batchnorm/solvers.hpp
+++ b/src/include/miopen/batchnorm/solvers.hpp
@@ -152,6 +152,16 @@ struct BnCKBwdBackward final : BatchnormSolver
                              const miopen::batchnorm::ProblemDescription& problem) const override;
 };
 
+struct BnCKFwdTraining final : BatchnormSolver
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<BnCKFwdTraining>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::batchnorm::ProblemDescription& problem) const override;
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const miopen::batchnorm::ProblemDescription& problem) const override;
+};
+
 } // namespace batchnorm
 
 } // namespace solver
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index 7a20038c55..6147a827b8 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -131,7 +131,8 @@ void BatchNormForwardTraining(Handle& handle,
         return tmp;
     }();
 
-    const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdTrainingSpatialSingle,
+    const auto solvers = solver::SolverContainer<solver::batchnorm::BnCKFwdTraining,
+                                                 solver::batchnorm::BnFwdTrainingSpatialSingle,
                                                  solver::batchnorm::BnFwdTrainingSpatialMultiple,
                                                  solver::batchnorm::BnFwdTrainingPerActivation>{};
 
diff --git a/src/solver.cpp b/src/solver.cpp
index ef90f508b2..4cd680dd9c 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -570,6 +570,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
         registry, ++id, ConvHipImplicitGemm3DGroupBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM);
     Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId());
     Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId());
+    Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdTraining{}.SolverDbId());
 
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp
new file mode 100644
index 0000000000..a65cec14a9
--- /dev/null
+++ b/src/solver/batchnorm/forward_training_ck.cpp
@@ -0,0 +1,239 @@
+
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <miopen/batchnorm/solvers.hpp>
+#include <miopen/batchnorm/invoke_params.hpp>
+#include <miopen/batch_norm.hpp>
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+#include <miopen/solver/ck_utility_common.hpp>
+#include <ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp>
+#include <miopen/solver/implicitgemm_ck_util.hpp>
+#endif
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING)
+
+namespace miopen {
+namespace solver {
+namespace batchnorm {
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+
+using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
+using index_t       = int32_t;
+
+constexpr index_t Rank                  = 4;
+constexpr index_t NumBatchNormReduceDim = 3;
+
+using F16  = ck::half_t;
+using F32  = float;
+using F64  = double;
+using BF16 = ushort;
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+using DeviceOpBNFwdTrainingPtrs =
+    ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        ck::tensor_operation::device::DeviceBatchNormFwd<XDataType,
+                                                         YDataType,
+                                                         AccDataType,
+                                                         ScaleDataType,
+                                                         BiasDataType,
+                                                         MeanVarDataType,
+                                                         PassThroughOp,
+                                                         Rank,
+                                                         NumBatchNormReduceDim>>;
+
+struct CKArgsBNormFwdTraining
+{
+    CKArgsBNormFwdTraining(const miopen::batchnorm::ProblemDescription& problem)
+    {
+        std::copy(problem.GetXDesc().GetLengths().begin(),
+                  problem.GetXDesc().GetLengths().end(),
+                  xyLengths.begin());
+
+        std::copy(problem.GetXDesc().GetStrides().begin(),
+                  problem.GetXDesc().GetStrides().end(),
+                  xyStrides.begin());
+        arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel
+        arrScaleBiasMeanVarStrides[0] = 1;
+
+        // prep for CK
+        std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
+        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+    }
+
+    CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default;
+    CKArgsBNormFwdTraining(CKArgsBNormFwdTraining&&)      = default;
+    CKArgsBNormFwdTraining& operator=(const CKArgsBNormFwdTraining&) = default;
+
+    template <typename InvokerPtr, typename InvokerParams>
+    auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const
+    {
+        return invoker_ptr->MakeArgumentPointer(xyLengths,
+                                                xyStrides,
+                                                xyStrides,
+                                                reduceDims,
+                                                arrScaleBiasMeanVarLengths,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                arrScaleBiasMeanVarStrides,
+                                                data_ctx.x,
+                                                data_ctx.bnScale,
+                                                data_ctx.bnBias,
+                                                data_ctx.epsilon,
+                                                PassThroughOp{},
+                                                data_ctx.y,
+                                                data_ctx.resultSaveMean,
+                                                data_ctx.resultSaveInvVariance,
+                                                data_ctx.expAvgFactor,
+                                                data_ctx.resultRunningMean,
+                                                data_ctx.resultRunningVariance);
+    }
+
+    template <typename ConvPtr>
+    bool IsSupportedBy(const ConvPtr& invoker_ptr) const
+    {
+        auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::InvokeParams{});
+        return invoker_ptr->IsSupportedArgument(arg_ptr.get());
+    }
+
+    std::array<ck::index_t, Rank> xyLengths;
+    std::array<ck::index_t, Rank> xyStrides;
+    std::vector<int> invariantDims;
+
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
+
+    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+};
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem)
+{
+    return IsCKApplicable<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                    YDataType,
+                                                    AccDataType,
+                                                    ScaleDataType,
+                                                    BiasDataType,
+                                                    MeanVarDataType>,
+                          CKArgsBNormFwdTraining>(problem);
+}
+#endif
+
+bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context,
+                                   const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL
+    std::ignore = context;
+    std::ignore = fdesc_problem;
+    return false;
+#else
+    if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{}))
+        return false;
+    if(!bn_problem.IsLayoutNHWC())
+        return false;
+    if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
+        return false;
+
+    switch(bn_problem.GetXDesc().GetType())
+    {
+    case miopenHalf: return CheckCKApplicability<F16, F16, F32, F16, F16, F32>(bn_problem);
+    case miopenFloat: return CheckCKApplicability<F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return CheckCKApplicability<F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenBFloat16: return CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
+    case miopenInt32:
+    case miopenInt8:
+    case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
+    default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type");
+    }
+    return false;
+#endif
+}
+
+template <typename XDataType,
+          typename YDataType,
+          typename AccDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem)
+{
+    const auto& valid_kernel_ids = FillValidKernelsIDs<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                                                 YDataType,
+                                                                                 AccDataType,
+                                                                                 ScaleDataType,
+                                                                                 BiasDataType,
+                                                                                 MeanVarDataType>,
+                                                       CKArgsBNormFwdTraining>(bn_problem);
+    assert(!valid_kernel_ids.empty());
+    const auto& kernel_id = valid_kernel_ids[0];
+    return InitAnyInvokerFactory<DeviceOpBNFwdTrainingPtrs<XDataType,
+                                                           YDataType,
+                                                           AccDataType,
+                                                           ScaleDataType,
+                                                           BiasDataType,
+                                                           MeanVarDataType>,
+                                 CKArgsBNormFwdTraining,
+                                 miopen::batchnorm::InvokeParams>(bn_problem, kernel_id);
+}
+
+ConvSolution BnCKFwdTraining::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const
+{
+#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
+    switch(bn_problem.GetXDesc().GetType())
+    {
+
+    case miopenFloat: return MakeAnyInvokerFactory<F32, F32, F32, F32, F32, F32>(bn_problem);
+    case miopenDouble: return MakeAnyInvokerFactory<F64, F64, F64, F64, F64, F64>(bn_problem);
+    case miopenHalf: return MakeAnyInvokerFactory<F16, F16, F32, F16, F16, F32>(bn_problem);
+    case miopenBFloat16: return MakeAnyInvokerFactory<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
+    case miopenInt8:
+    case miopenInt32:
+    case miopenInt8x4:
+    case miopenBFloat8:
+    case miopenFloat8:
+    default:
+        MIOPEN_THROW(miopenStatusInternalError, "BnCKFwdTraining operation not for this data type");
+    }
+#endif
+    return {};
+}
+
+} // namespace batchnorm
+} // namespace solver
+} // namespace miopen
diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp
index f3219048fb..5374abd1fa 100644
--- a/test/fusionHost.hpp
+++ b/test/fusionHost.hpp
@@ -202,17 +202,17 @@ void batchNormPerActivHostInference(const tensor<T>& input,
     });
 }
 
-template <class T, class U>
+template <class T, class U, class V = U>
 void batchNormSpatialHostFwdTrain(const tensor<T>& input,
                                   tensor<T>& out,
                                   const tensor<U>& scale,
                                   const tensor<U>& bias,
                                   double epsilon,
                                   double expAvgFactor,
-                                  tensor<U>& saveMean,
-                                  tensor<U>& saveInvVar,
-                                  tensor<U>& runMean,
-                                  tensor<U>& runVar)
+                                  tensor<V>& saveMean,
+                                  tensor<V>& saveInvVar,
+                                  tensor<V>& runMean,
+                                  tensor<V>& runVar)
 {
 
     int height, width, n_batch, channels;
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index a89e94718e..22f8391fe6 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -84,7 +84,6 @@ struct BNInferTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miop
         bn_infer_test_data;
     miopenTensorLayout_t tensor_layout;
 };
-
 template <typename XDataType,
           typename DxDataType,
           typename DyDataType,
@@ -166,3 +165,93 @@ struct BNBwdTest : public ::testing::TestWithParam<std::tuple<BNTestCase, miopen
         bn_bwd_test_data;
     miopenTensorLayout_t tensor_layout;
 };
+
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType>
+struct BNFwdTrainTest
+    : public ::testing::TestWithParam<std::tuple<BNTestCase, miopenTensorLayout_t>>
+{
+protected:
+    void SetUp() override
+    {
+        test_skipped                       = false;
+        std::tie(bn_config, tensor_layout) = GetParam();
+        bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout);
+
+        auto&& handle = get_handle();
+        miopenBatchNormalizationForwardTraining(&handle,
+                                                bn_config.mode,
+                                                &bn_fwd_train_test_data.alpha,
+                                                &bn_fwd_train_test_data.beta,
+                                                &bn_fwd_train_test_data.input.desc,
+                                                bn_fwd_train_test_data.in_dev.get(),
+                                                &bn_fwd_train_test_data.output.desc,
+                                                bn_fwd_train_test_data.out_dev.get(),
+                                                &bn_fwd_train_test_data.scale.desc,
+                                                bn_fwd_train_test_data.scale_dev.get(),
+                                                bn_fwd_train_test_data.shift_dev.get(),
+                                                bn_fwd_train_test_data.averageFactor,
+                                                bn_fwd_train_test_data.runMean_dev.get(),
+                                                bn_fwd_train_test_data.runVariance_dev.get(),
+                                                bn_fwd_train_test_data.epsilon,
+                                                bn_fwd_train_test_data.saveMean_dev.get(),
+                                                bn_fwd_train_test_data.saveVariance_dev.get());
+
+        std::fill(bn_fwd_train_test_data.output.begin(),
+                  bn_fwd_train_test_data.output.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
+        std::fill(bn_fwd_train_test_data.saveMean_ref.begin(),
+                  bn_fwd_train_test_data.saveMean_ref.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
+        std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(),
+                  bn_fwd_train_test_data.saveVariance_ref.end(),
+                  std::numeric_limits<YDataType>::quiet_NaN());
+    }
+
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+        auto&& handle                      = get_handle();
+        bn_fwd_train_test_data.output.data = handle.Read<YDataType>(
+            bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size());
+
+        bn_fwd_train_test_data.saveMean.data = handle.Read<MeanVarDataType>(
+            bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size());
+        bn_fwd_train_test_data.saveVariance.data =
+            handle.Read<MeanVarDataType>(bn_fwd_train_test_data.saveVariance_dev,
+                                         bn_fwd_train_test_data.saveVariance_ref.data.size());
+        bn_fwd_train_test_data.runMean.data = handle.Read<MeanVarDataType>(
+            bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size());
+        bn_fwd_train_test_data.runVariance.data =
+            handle.Read<MeanVarDataType>(bn_fwd_train_test_data.runVariance_dev,
+                                         bn_fwd_train_test_data.runVariance_ref.data.size());
+        test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data);
+
+        // 4e-3 is tolerance used by CK kernel.
+        test::CompareTensor<YDataType>(
+            bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3);
+        test::CompareTensor<MeanVarDataType>(
+            bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3);
+    }
+
+    BNTestCase bn_config;
+    bool test_skipped = false;
+    BNFwdTrainTestData<XDataType,
+                       YDataType,
+                       ScaleDataType,
+                       BiasDataType,
+                       MeanVarDataType,
+                       BNTestCase>
+        bn_fwd_train_test_data;
+    miopenTensorLayout_t tensor_layout;
+};
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
new file mode 100644
index 0000000000..4a4dd4c728
--- /dev/null
+++ b/test/gtest/bn_fwd_train.cpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "bn.hpp"
+
+struct BNFwdTrainTestHalf
+    : BNFwdTrainTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
+{
+};
+
+struct BNFwdTrainTestFloat : BNFwdTrainTest<float, float, float, float, float>
+{
+};
+
+struct BNFwdTrainTestDouble : BNFwdTrainTest<double, double, double, double, double>
+{
+};
+
+struct BNFwdTrainTestBFloat16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, double>
+{
+};
+
+TEST_P(BNFwdTrainTestHalf, BnFwdTrainCKHalf) {}
+
+TEST_P(BNFwdTrainTestFloat, BnFwdTrainCKFloat) {}
+
+// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig
+// only supports half and float
+TEST_P(BNFwdTrainTestDouble, DISABLED_BnFwdTrainCKDouble) {}
+TEST_P(BNFwdTrainTestBFloat16, DISABLED_BnFwdTrainCKBFloat16) {}
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestHalfNHWCSuite,
+                         BNFwdTrainTestHalf,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite,
+                         BNFwdTrainTestFloat,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite,
+                         BNFwdTrainTestDouble,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
+
+INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite,
+                         BNFwdTrainTestBFloat16,
+                         testing::Combine(testing::ValuesIn(Network1()),
+                                          testing::Values(miopenTensorNHWC)));
diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp
index 822369725e..f65c694281 100644
--- a/test/gtest/bn_test_data.hpp
+++ b/test/gtest/bn_test_data.hpp
@@ -338,3 +338,106 @@ struct BNBwdTestData : public BNTestData<XDataType, DyDataType, TConfig>
         dBias_dev  = handle.Write(dBias.data);
     }
 };
+
+template <typename XDataType,
+          typename YDataType,
+          typename ScaleDataType,
+          typename BiasDataType,
+          typename MeanVarDataType,
+          typename TConfig>
+struct BNFwdTrainTestData : public BNTestData<XDataType, YDataType, TConfig>
+{
+    void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout)
+    {
+        BNTestData<XDataType, YDataType, TConfig>::SetUpImpl(config, t_layout);
+        CreateTensors();
+        InitTensorsWithRandValue();
+        WriteToGPU();
+    }
+
+    tensor<ScaleDataType> scale;
+    tensor<BiasDataType> shift;
+    tensor<MeanVarDataType> saveMean;
+    tensor<MeanVarDataType> saveVariance;
+    tensor<MeanVarDataType> runMean;
+    tensor<MeanVarDataType> runVariance;
+
+    tensor<MeanVarDataType> saveMean_ref;
+    tensor<MeanVarDataType> saveVariance_ref;
+    tensor<MeanVarDataType> runMean_ref;
+    tensor<MeanVarDataType> runVariance_ref;
+
+    miopen::Allocator::ManageDataPtr scale_dev;
+    miopen::Allocator::ManageDataPtr shift_dev; // bias
+    miopen::Allocator::ManageDataPtr saveMean_dev;
+    miopen::Allocator::ManageDataPtr saveVariance_dev;
+    miopen::Allocator::ManageDataPtr runMean_dev;
+    miopen::Allocator::ManageDataPtr runVariance_dev;
+    double epsilon          = 1.0e-5;
+    double averageFactor    = 0.1;
+    float alpha             = static_cast<float>(1.0f);
+    float beta              = static_cast<float>(0);
+    const float activ_alpha = static_cast<double>(0.5f);
+    const float activ_beta  = static_cast<double>(0.5f);
+    const float activ_gamma = static_cast<double>(0.5f);
+
+private:
+    void CreateTensors()
+    {
+        auto derivedBnDesc = miopen::TensorDescriptor{};
+        miopen::DeriveBNTensorDescriptor(derivedBnDesc,
+                                         BNTestData<XDataType, YDataType, TConfig>::input.desc,
+                                         BNTestData<XDataType, YDataType, TConfig>::bn_mode);
+        scale    = tensor<ScaleDataType>{miopen_type<ScaleDataType>{},
+                                      BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                      derivedBnDesc.GetLengths()};
+        shift    = tensor<BiasDataType>{miopen_type<BiasDataType>{},
+                                     BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                     derivedBnDesc.GetLengths()};
+        saveMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                           BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                           derivedBnDesc.GetLengths()};
+        saveVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+        runMean = tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                          BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                          derivedBnDesc.GetLengths()};
+        runVariance =
+            tensor<MeanVarDataType>{miopen_type<MeanVarDataType>{},
+                                    BNTestData<XDataType, YDataType, TConfig>::tensor_layout,
+                                    derivedBnDesc.GetLengths()};
+    }
+
+    void InitTensorsWithRandValue()
+    {
+        auto gen_value = [](auto...) {
+            return prng::gen_descreet_uniform_sign(static_cast<ScaleDataType>(1e-2), 100);
+        };
+        scale.generate(gen_value);
+        shift.generate(gen_value);
+
+        auto gen_var = [](auto...) {
+            return static_cast<MeanVarDataType>(1e-2) *
+                   static_cast<MeanVarDataType>(prng::gen_0_to_B(100) + 1);
+        };
+        runMean.generate(gen_var);
+        runVariance.generate(gen_var);
+
+        saveMean_ref     = saveMean;
+        saveVariance_ref = saveVariance;
+        runMean_ref      = runMean;
+        runVariance_ref  = runVariance;
+    }
+    void WriteToGPU()
+    {
+        auto&& handle    = get_handle();
+        scale_dev        = handle.Write(scale.data);
+        shift_dev        = handle.Write(shift.data);
+        saveMean_dev     = handle.Write(saveMean.data);
+        saveVariance_dev = handle.Write(saveVariance.data);
+        runMean_dev      = handle.Write(runMean.data);
+        runVariance_dev  = handle.Write(runVariance.data);
+    }
+};
diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp
index 5d1e62a5b8..da41212302 100644
--- a/test/gtest/test_operations.hpp
+++ b/test/gtest/test_operations.hpp
@@ -58,6 +58,21 @@ void ComputeCPUBNBwd(DLModule& dl_module)
                                  dl_module.savedInvVar);
 }
 
+template <typename DLModule>
+void ComputeCPUBNFwdTrain(DLModule& dl_module)
+{
+    batchNormSpatialHostFwdTrain(dl_module.input,
+                                 dl_module.ref_out,
+                                 dl_module.scale,
+                                 dl_module.shift,
+                                 dl_module.epsilon,
+                                 dl_module.averageFactor,
+                                 dl_module.saveMean_ref,
+                                 dl_module.saveVariance_ref,
+                                 dl_module.runMean_ref,
+                                 dl_module.runVariance_ref);
+}
+
 template <typename T>
 void CompareTensor(const tensor<T>& output,
                    const tensor<T>& ref_out,