Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OCL][MI100][MI200] Fix iGemm ASM GTC XDLOPS failures with OCL backend (Staging 95b58f72f) (#1317) and Implement abstraction for multi-buffer workspace (#1326) #1327

Merged
merged 24 commits into from
Dec 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9905fd1
add a class to represent workspace buffer traits
shaojiewang Dec 7, 2021
6945137
fix compile warning
shaojiewang Dec 7, 2021
6d1e040
fix compile warning
shaojiewang Dec 7, 2021
4836817
add workspace buffer alignment for bwd and wrw
shaojiewang Dec 7, 2021
d8b7770
fix clang tidy warning
shaojiewang Dec 7, 2021
ba20cee
fix clang tidy warning
shaojiewang Dec 7, 2021
b977cec
Merge branch 'develop' into fix_issue#1317_#1326
shaojiewang Dec 7, 2021
69e67a4
comment out W/A 1317
shaojiewang Dec 7, 2021
e325659
add test cases for transpose+asm_igemm kernels
shaojiewang Dec 7, 2021
1fc0119
rename transpose test target
shaojiewang Dec 7, 2021
64b97f1
remove WORKAROUND_ISSUE_1317 macro
shaojiewang Dec 7, 2021
65eb884
fix ctest bug for transpose+asm igemm case: weights c=c/groups
shaojiewang Dec 8, 2021
715ae58
Merge branch 'develop' into fix_issue#1317_#1326
shaojiewang Dec 13, 2021
de418a0
Update src/conv/invokers/impl_gemm_dynamic.cpp
shaojiewang Dec 14, 2021
47ffa29
rename new test cases
shaojiewang Dec 16, 2021
d9fcba5
Merge branch 'fix_issue#1317_#1326' of https://github.com/ROCmSoftwar…
shaojiewang Dec 16, 2021
c7f491a
add constexpr for const var
shaojiewang Dec 16, 2021
d88835f
move multi buffer traits to buffer_info.hpp/cpp
shaojiewang Dec 16, 2021
d2abd57
remove useless variable from MultiBufferWorkspaceTraits
shaojiewang Dec 17, 2021
c5e8574
Merge branch 'develop' into fix_issue#1317_#1326
junliume Dec 23, 2021
37aea02
Fix issue caused by merge conflicts
junliume Dec 23, 2021
64a328b
fix another duplicate caused by merge conflict
junliume Dec 23, 2021
dd46f1b
remove duplicated cases
shaojiewang Dec 28, 2021
20c829b
Revert "[tests] Disable test_regression_opencl_float_mi100 if WORKARO…
shaojiewang Dec 28, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions src/buffer_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,27 @@ BuffInfo::BuffInfo(MemLayout_t layout, int nk, int c, int h, int w, int g, int _
}
}

MultiBufferWorkspaceTraits::MultiBufferWorkspaceTraits(std::initializer_list<size_t> v_size_,
                                                       size_t alignment_)
    : v_size(v_size_), alignment(alignment_)
{
    // Precompute the start offset of every sub-buffer. Each buffer is padded
    // up to the next multiple of `alignment`, so every recorded offset (and
    // the final total in v_offset.back()) is alignment-aligned.
    size_t running_offset = 0;
    v_offset.push_back(running_offset);
    for(const auto& buf_size : v_size)
    {
        // Round buf_size up to a multiple of alignment, then advance.
        const size_t aligned_size = ((buf_size + alignment - 1) / alignment) * alignment;
        running_offset += aligned_size;
        v_offset.push_back(running_offset);
    }
}

size_t MultiBufferWorkspaceTraits::GetSize() const { return v_offset.back(); }

// Byte offset of the sub-buffer at `index`; throws on an out-of-range index
// (a caller bug), rather than reading past the end of v_offset.
size_t MultiBufferWorkspaceTraits::GetOffset(size_t index) const
{
    if(index < v_offset.size())
        return v_offset[index];
    MIOPEN_THROW("index given overflows");
}

} // namespace miopen
36 changes: 24 additions & 12 deletions src/conv/invokers/impl_gemm_dynamic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,8 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.in_data_type, n, c, hi, wi);
Expand All @@ -567,9 +569,6 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -579,9 +578,16 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
trans_output_idx = idx++;
}

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * k * ho * wo : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * k * ho * wo : 0;

MultiBufferWorkspaceTraits wt(
{trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

Expand Down Expand Up @@ -849,6 +855,8 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionNhwc2Default trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
Expand All @@ -875,9 +883,6 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -887,9 +892,16 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
trans_output_idx = idx++;
}

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * c * hi * wi : 0;
const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * c * hi * wi : 0;

MultiBufferWorkspaceTraits wt(
{trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

Expand Down
12 changes: 12 additions & 0 deletions src/include/miopen/buffer_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include <string>
#include <cassert>
#include <vector>

namespace miopen {

Expand Down Expand Up @@ -307,6 +308,17 @@ struct WinogradBufferInfo
}
};

// Describes a single workspace carved into several sub-buffers, each padded so
// that every sub-buffer starts at an offset that is a multiple of `alignment`.
// Construct with the raw byte sizes of the sub-buffers; then GetOffset(i)
// yields buffer i's byte offset and GetSize() the total workspace footprint.
struct MultiBufferWorkspaceTraits
{
    MultiBufferWorkspaceTraits(std::initializer_list<size_t> v_size_, size_t alignment_);
    // Total size in bytes of the whole workspace (sum of aligned buffer sizes).
    size_t GetSize() const;
    // Byte offset of sub-buffer `index`; throws when index is out of range.
    size_t GetOffset(size_t index) const;

    // Requested (unpadded) byte sizes of the sub-buffers, in order.
    std::vector<size_t> v_size;
    // v_offset[i] is the start of buffer i; v_offset.back() is the total size.
    std::vector<size_t> v_offset;
    // Alignment in bytes applied to the start of every sub-buffer.
    size_t alignment;
};

} // namespace miopen

#endif // GUARD_MIOPEN_BUFFER_INFO_HPP_
2 changes: 0 additions & 2 deletions src/include/miopen/conv/asm_implicit_gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
#include <vector>
#include <limits>

#define WORKAROUND_ISSUE_1317 (MIOPEN_BACKEND_OPENCL)

namespace miopen {

namespace solver {
Expand Down
55 changes: 30 additions & 25 deletions src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -867,11 +867,6 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx

bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
return false;

Expand Down Expand Up @@ -904,17 +899,25 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(const ConvolutionC
size_t
ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionContext& ctx) const
{
const auto& hi = ctx.out_height;
const auto& wi = ctx.out_width;
const auto& n = ctx.batch_sz;
const auto& k = ctx.n_inputs;
const auto& c = ctx.n_outputs;
const auto& ho = ctx.in_height;
const auto& wo = ctx.in_width;
const auto& y = ctx.kernel_size_h;
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();
const auto& hi = ctx.out_height;
const auto& wi = ctx.out_width;
const auto& n = ctx.batch_sz;
const auto& k = ctx.n_inputs;
const auto& c = ctx.n_outputs;
const auto& ho = ctx.in_height;
const auto& wo = ctx.in_width;
const auto& y = ctx.kernel_size_h;
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

size_t workspace_size = 0;
if(is_nchw)
{
Expand All @@ -927,20 +930,22 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
x); // group * k_per_group as batch for weight
TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
size_trans_output = trans_output.GetSize();
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * c * hi * wi;
size_tensor_cast =
miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * c * hi * wi;

MultiBufferWorkspaceTraits wt(
{size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}
Expand Down
33 changes: 19 additions & 14 deletions src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,14 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();
size_t workspace_size = 0;

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{

Expand All @@ -772,31 +780,28 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
TransposeSolutionNhwc2Default trans_output(ctx, ctx.out_data_type, n, k, ho, wo);

if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
size_trans_output = trans_output.GetSize();
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * k * ho * wo;
size_tensor_cast =
miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* n * k * ho * wo;

MultiBufferWorkspaceTraits wt(
{size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}

bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
return false;

Expand Down
47 changes: 29 additions & 18 deletions src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ static inline std::size_t GetTypeSize(const std::string& s)
{
if(s == "fp32")
return miopen::GetTypeSize(miopenFloat);
if (s == "fp16")
if(s == "fp16")
return miopen::GetTypeSize(miopenHalf);
else
return miopen::GetTypeSize(miopenBFloat16);
Expand Down Expand Up @@ -795,11 +795,6 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx

bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(const ConvolutionContext& ctx) const
{
#if WORKAROUND_ISSUE_1317
if(ctx.IsLayoutDefault())
if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
return false;
#endif
if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
return false;

Expand Down Expand Up @@ -891,6 +886,14 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
const auto& x = ctx.kernel_size_w;
const auto& group = ctx.group_counts;
const auto is_nchw = ctx.IsLayoutDefault();

size_t size_trans_input = 0;
size_t size_trans_weight = 0;
size_t size_trans_output = 0;
size_t size_tensor_cast = 0;

constexpr size_t buf_alignment = 256;

size_t workspace_size = 0;
if(is_nchw)
{
Expand All @@ -903,20 +906,22 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
x); // group * k_per_group as batch for weight
TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
if(!trans_input.IsSkippable())
workspace_size += trans_input.GetSize();
size_trans_input = trans_input.GetSize();
if(!trans_weight.IsSkippable())
workspace_size += trans_weight.GetSize();
size_trans_weight = trans_weight.GetSize();
if(!trans_output.IsSkippable())
workspace_size += trans_output.GetSize();
size_trans_output = trans_output.GetSize();

// 4 bytes alignment to do atomic add
workspace_size = ((workspace_size + 3) >> 2) << 2;
}

if(!ctx.IsFp32())
workspace_size += miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
size_tensor_cast = miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
// kernel is FP32, when using FP32 atomic
* (k / group) * c * y * x;
* (k / group) * c * y * x;

MultiBufferWorkspaceTraits wt({size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast}, buf_alignment);
workspace_size = wt.GetSize();

return workspace_size;
}

Expand Down Expand Up @@ -1027,6 +1032,9 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
int trans_input_idx = -1;
int trans_weight_idx = -1;
int trans_output_idx = -1;

constexpr size_t buf_alignment = 256;

if(is_nchw)
{
TransposeSolutionDefault2Nhwc trans_input(ctx, ctx.out_data_type, n, c, hi, wi);
Expand Down Expand Up @@ -1065,9 +1073,6 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();

trans_weight_offset = trans_input_offset + trans_input_size;
trans_output_offset = trans_weight_offset + trans_weight_size;

int idx = 0;
if(!trans_input_skippable)
trans_input_idx = idx++;
Expand All @@ -1079,11 +1084,17 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(

MIOPEN_LOG_I2(SolverDbId(*this) << ": " << config.ToString() << msg.str());

// 4 bytes alignment to do atomic add
const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0;
const size_t cast_size = need_cast ?
miopen::GetTypeSize(miopenFloat) * k * (c / group) * y * x : 0;

MultiBufferWorkspaceTraits wt({trans_input_size, trans_weight_size, trans_output_size, cast_size}, buf_alignment);

trans_input_offset = wt.GetOffset(0);
trans_weight_offset = wt.GetOffset(1);
trans_output_offset = wt.GetOffset(2);

const size_t cast_offset = wt.GetOffset(3);

const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

const TensorDescriptor cast_desc(miopenFloat, ctx.conv_problem.GetWeights().GetLengths(), ctx.conv_problem.GetWeights().GetStrides());
Expand Down
Loading