Skip to content

Commit

Permalink
[NHWC] Add gpu reference kernels. Fix MIOpenDriver (#728)
Browse files Browse the repository at this point in the history
* support nhwc/ndhwc, fwd/bwd/wrw, fp32/fp16/bf16, group conv, for reference kernel.
* fix MIOpenDriver so that CPU NHWC convolution produces correct results.
* refactor LayoutToStrides() into test/tensor_layout.hpp, so that both the driver and the test code can call this function
* MIOPEN_ENABLE_LOGGING_CMD=1:  dump layout info, if it is not NCHW (current default).
  • Loading branch information
carlushuang authored Mar 18, 2021
1 parent 7bd5999 commit 1c72099
Show file tree
Hide file tree
Showing 12 changed files with 2,232 additions and 294 deletions.
47 changes: 31 additions & 16 deletions driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,9 +1019,12 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
break;
}

warmup_in = tensor<warmup_Tgpu>(miopen::deref(warmupInputTensor).GetLengths());
warmup_wei = tensor<warmup_Tgpu>(miopen::deref(warmupWeightTensor).GetLengths());
warmup_out = tensor<warmup_Tgpu>(miopen::deref(warmupOutputTensor).GetLengths());
warmup_in = tensor<warmup_Tgpu>(miopen::deref(warmupInputTensor).GetLengths(),
miopen::deref(warmupInputTensor).GetStrides());
warmup_wei = tensor<warmup_Tgpu>(miopen::deref(warmupWeightTensor).GetLengths(),
miopen::deref(warmupWeightTensor).GetStrides());
warmup_out = tensor<warmup_Tgpu>(miopen::deref(warmupOutputTensor).GetLengths(),
miopen::deref(warmupOutputTensor).GetStrides());

warmup_in_dev =
std::unique_ptr<GPUMem>(new GPUMem(ctx, warmup_in_sz, sizeof(warmup_Tgpu)));
Expand Down Expand Up @@ -1105,13 +1108,17 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
}

if(is_fwd || is_wrw)
in = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths());
in = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
if(is_fwd || is_bwd)
wei = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths());
wei = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());
if(is_fwd)
out = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths());
out = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
if(is_bwd || is_wrw)
dout = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths());
dout = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());

if(is_bwd)
din = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
Expand All @@ -1127,9 +1134,12 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
new GPUMem(ctx, GetTensorSize(weightTensor_vect4), sizeof(Tgpu)));
}

outhost = tensor<Tref>(miopen::deref(outputTensor).GetLengths());
din_host = tensor<Tref>(miopen::deref(inputTensor).GetLengths());
dwei_host = tensor<Tref>(miopen::deref(weightTensor).GetLengths());
outhost = tensor<Tref>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
din_host = tensor<Tref>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
dwei_host = tensor<Tref>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());

std::string inFileName = inflags.GetValueStr("in_data");
std::string weiFileName = inflags.GetValueStr("weights");
Expand Down Expand Up @@ -1236,9 +1246,11 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
size_t b_sz = GetTensorSize(biasTensor);
b_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, b_sz, sizeof(Tgpu)));
db_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, b_sz, sizeof(Tgpu)));
b = tensor<Tgpu>(miopen::deref(biasTensor).GetLengths());
db = std::vector<Tgpu>(b_sz, static_cast<Tgpu>(0));
db_host = tensor<Tref>(miopen::deref(biasTensor).GetLengths());
b = tensor<Tgpu>(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
db = std::vector<Tgpu>(b_sz, static_cast<Tgpu>(0));
db_host = tensor<Tref>(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
for(int i = 0; i < b_sz; i++)
{
b.data[i] = static_cast<Tgpu>(i % 8) +
Expand Down Expand Up @@ -2042,7 +2054,8 @@ int ConvDriver<Tgpu, Tref>::RunForwardGPUReference()
out_dev->FromGPU(GetStream(), outhost.data.data());
else
{
auto out_tmp = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths());
auto out_tmp = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
out_dev->FromGPU(GetStream(), out_tmp.data.data());
for(int i = 0; i < out_tmp.data.size(); i++)
{
Expand Down Expand Up @@ -2958,7 +2971,8 @@ int ConvDriver<Tgpu, Tref>::RunBackwardWeightsGPUReference()
dwei_dev->FromGPU(GetStream(), dwei_host.data.data());
else
{
auto dwei_tmp = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths());
auto dwei_tmp = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());
dwei_dev->FromGPU(GetStream(), dwei_tmp.data.data());
for(int i = 0; i < dwei_tmp.data.size(); i++)
{
Expand Down Expand Up @@ -3006,7 +3020,8 @@ int ConvDriver<Tgpu, Tref>::RunBackwardDataGPUReference()
din_dev->FromGPU(GetStream(), din_host.data.data());
else
{
auto din_tmp = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths());
auto din_tmp = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
din_dev->FromGPU(GetStream(), din_tmp.data.data());
for(int i = 0; i < din_tmp.data.size(); i++)
{
Expand Down
47 changes: 3 additions & 44 deletions driver/tensor_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include <miopen/tensor_extra.hpp>
#include <numeric>
#include <vector>
#include "../test/tensor_layout.hpp"

std::vector<int> GetTensorLengths(miopenTensorDescriptor_t& tensor)
{
Expand Down Expand Up @@ -108,48 +109,6 @@ int SetTensorNd(miopenTensorDescriptor_t t,
return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), nullptr);
}

/// Computes the strides of a tensor whose dimension lengths are given in
/// `len_layout` order, as if the data were laid out in memory in `layout`
/// order.
///
/// @param len        Dimension lengths, ordered according to `len_layout`.
/// @param len_layout Layout string naming each entry of `len` (e.g. "NCHW").
/// @param layout     Desired memory layout (e.g. "NHWC"); must be a
///                   permutation containing every character of `len_layout`.
/// @param strides    Output: one stride per `len_layout` character, appended.
/// @throws           If a `len_layout` character does not occur in `layout`.
void LayoutToStrides(const std::vector<int>& len,
                     const std::string& len_layout,
                     const std::string& layout,
                     std::vector<int>& strides)
{
    // Pair each dimension character with its length.
    std::map<char, int> dim_to_len;
    for(std::size_t i = 0; i < len.size(); ++i)
        dim_to_len[len_layout[i]] = len[i];

    // The stride of a dimension equals the product of the lengths of all
    // dimensions that follow it in the memory layout.
    for(const char cur_layout_char : len_layout)
    {
        const auto pos = layout.find(cur_layout_char);
        if(pos == std::string::npos)
        {
            MIOPEN_THROW(std::string("mismatched layout string, unexpect char: ")
                             .append(1, cur_layout_char));
        }
        int stride = 1;
        for(auto it = layout.begin() + pos + 1; it != layout.end(); ++it)
            stride *= dim_to_len[*it];
        strides.push_back(stride);
    }
}

/// Returns the canonical (default) dimension ordering for a tensor of the
/// given rank, or an empty string when no default layout convention exists.
///
/// @param size  Number of tensor dimensions.
/// @return "NCHW" for 4-D tensors, "NCDHW" for 5-D tensors, "" otherwise.
std::string GetDefaultTensorLayout(int size)
{
    // 4-D convolutions default to NCHW. 5-D (3-D spatial / volumetric)
    // convolutions default to NCDHW, matching the NCDHW layouts used by the
    // 3-D code paths elsewhere in this change; previously only rank 4 was
    // handled, so 5-D tensors silently fell back to the stride-less path.
    if(size == 4)
        return "NCHW";
    if(size == 5)
        return "NCDHW";
    return "";
}

int SetTensorNd(miopenTensorDescriptor_t t,
std::vector<int>& len,
const std::string& layout,
Expand All @@ -166,14 +125,14 @@ int SetTensorNd(miopenTensorDescriptor_t t,
}

// Dimension lengths vector 'len' comes with a default layout.
std::string len_layout = GetDefaultTensorLayout(layout.size());
std::string len_layout = tensor_layout_get_default(layout.size());
if(len_layout.empty())
{
return SetTensorNd(t, len, data_type);
}

std::vector<int> strides;
LayoutToStrides(len, len_layout, layout, strides);
tensor_layout_to_strides(len, len_layout, layout, strides);

return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), strides.data());
}
Expand Down
33 changes: 26 additions & 7 deletions src/convolution_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ enum class ConvDirection
static void LogCmdConvolution(const miopenTensorDescriptor_t xDesc,
const miopenTensorDescriptor_t wDesc,
const miopenConvolutionDescriptor_t convDesc,
const miopenTensorDescriptor_t yDesc,
const ConvDirection conv_dir,
const bool is_immediate)
{
Expand Down Expand Up @@ -343,6 +344,15 @@ static void LogCmdConvolution(const miopenTensorDescriptor_t xDesc,
<< " -v " << miopen::deref(convDesc).GetConvStrides()[1]
<< " -l " << miopen::deref(convDesc).GetConvDilations()[0]
<< " -j " << miopen::deref(convDesc).GetConvDilations()[1]; // clang-format on
std::string x_layout = miopen::deref(xDesc).GetLayout("NCHW");
std::string w_layout = miopen::deref(wDesc).GetLayout("NCHW");
std::string y_layout = miopen::deref(yDesc).GetLayout("NCHW");
if(x_layout != "NCHW")
ss << " --in_layout " << x_layout;
if(w_layout != "NCHW")
ss << " --fil_layout " << w_layout;
if(y_layout != "NCHW")
ss << " --out_layout " << y_layout;
}
else if(miopen::deref(convDesc).GetSpatialDimension() == 3)
{
Expand All @@ -365,6 +375,15 @@ static void LogCmdConvolution(const miopenTensorDescriptor_t xDesc,
<< " -l " << miopen::deref(convDesc).GetConvDilations()[1]
<< " -j " << miopen::deref(convDesc).GetConvDilations()[2]
<< " --spatial_dim 3"; // clang-format on
std::string x_layout = miopen::deref(xDesc).GetLayout("NCDHW");
std::string w_layout = miopen::deref(wDesc).GetLayout("NCDHW");
std::string y_layout = miopen::deref(yDesc).GetLayout("NCDHW");
if(x_layout != "NCDHW")
ss << " --in_layout " << x_layout;
if(w_layout != "NCDHW")
ss << " --fil_layout " << w_layout;
if(y_layout != "NCDHW")
ss << " --out_layout " << y_layout;
}
ss << " -m " << (miopen::deref(convDesc).mode == 1 ? "trans" : "conv") // clang-format off
<< " -g " << miopen::deref(convDesc).group_count
Expand Down Expand Up @@ -480,7 +499,7 @@ extern "C" miopenStatus_t miopenConvolutionForward(miopenHandle_t handle,
y,
workSpace,
workSpaceSize);
LogCmdConvolution(xDesc, wDesc, convDesc, ConvDirection::Fwd, false);
LogCmdConvolution(xDesc, wDesc, convDesc, yDesc, ConvDirection::Fwd, false);

/// workaround for previous trans conv logic
if(miopen::deref(convDesc).mode == miopenTranspose)
Expand Down Expand Up @@ -674,7 +693,7 @@ miopenConvolutionForwardImmediate(miopenHandle_t handle,
{
MIOPEN_LOG_FUNCTION(
handle, wDesc, w, xDesc, x, convDesc, yDesc, y, workSpace, workSpaceSize, solution_id);
LogCmdConvolution(xDesc, wDesc, convDesc, ConvDirection::Fwd, true);
LogCmdConvolution(xDesc, wDesc, convDesc, yDesc, ConvDirection::Fwd, true);

return miopen::try_([&] {
if(miopen::deref(convDesc).mode == miopenTranspose)
Expand Down Expand Up @@ -828,7 +847,7 @@ miopenConvolutionBackwardDataImmediate(miopenHandle_t handle,
{
MIOPEN_LOG_FUNCTION(
handle, dyDesc, wDesc, convDesc, dxDesc, workSpace, workSpaceSize, solution_id);
LogCmdConvolution(dxDesc, wDesc, convDesc, ConvDirection::Bwd, true);
LogCmdConvolution(dxDesc, wDesc, convDesc, dyDesc, ConvDirection::Bwd, true);
return miopen::try_([&] {
if(miopen::deref(convDesc).mode == miopenTranspose)
miopen::deref(convDesc).ConvolutionForwardImmediate(miopen::deref(handle),
Expand Down Expand Up @@ -978,7 +997,7 @@ miopenConvolutionBackwardWeightsImmediate(miopenHandle_t handle,
{
MIOPEN_LOG_FUNCTION(
handle, dyDesc, dy, xDesc, x, convDesc, dwDesc, dw, workSpace, workSpaceSize, solution_id);
LogCmdConvolution(xDesc, dwDesc, convDesc, ConvDirection::WrW, true);
LogCmdConvolution(xDesc, dwDesc, convDesc, dyDesc, ConvDirection::WrW, true);
return miopen::try_([&] {
if(miopen::deref(convDesc).mode == miopenTranspose)
miopen::deref(convDesc).ConvolutionWrwImmediate(miopen::deref(handle),
Expand Down Expand Up @@ -1108,7 +1127,7 @@ miopenConvolutionBackwardData(miopenHandle_t handle,
dx,
workSpace,
workSpaceSize);
LogCmdConvolution(dxDesc, wDesc, convDesc, ConvDirection::Bwd, false);
LogCmdConvolution(dxDesc, wDesc, convDesc, dyDesc, ConvDirection::Bwd, false);

/// workaround for previous trans conv logic
if(miopen::deref(convDesc).mode == miopenTranspose)
Expand Down Expand Up @@ -1221,7 +1240,7 @@ miopenFindConvolutionBackwardWeightsAlgorithm(miopenHandle_t handle,
workSpace,
workSpaceSize,
exhaustiveSearch);
LogCmdConvolution(xDesc, dwDesc, convDesc, ConvDirection::WrW, false);
LogCmdConvolution(xDesc, dwDesc, convDesc, dyDesc, ConvDirection::WrW, false);

return miopen::try_([&] {
miopen::deref(convDesc).FindConvBwdWeightsAlgorithm(
Expand Down Expand Up @@ -1273,7 +1292,7 @@ miopenConvolutionBackwardWeights(miopenHandle_t handle,
dw,
workSpace,
workSpaceSize);
LogCmdConvolution(xDesc, dwDesc, convDesc, ConvDirection::WrW, false);
LogCmdConvolution(xDesc, dwDesc, convDesc, dyDesc, ConvDirection::WrW, false);

return miopen::try_([&] {
miopen::deref(convDesc).ConvolutionBackwardWeights(
Expand Down
2 changes: 2 additions & 0 deletions src/include/miopen/problem_description.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ struct ProblemDescription

bool IsLayoutDefault() const;

bool IsLayoutNHWC() const;

template <class Self>
static void Visit(Self&& self, std::function<void(int, std::string)> f)
{
Expand Down
Loading

0 comments on commit 1c72099

Please sign in to comment.