
Merge branch 'develop' into wip-rocmtest-trial-vega10
# RESOLVED Conflicts:
#	Jenkinsfile
atamazov committed Apr 14, 2021
2 parents 9a642d4 + 6dc32fe commit 05c96e7
Showing 19 changed files with 299 additions and 40 deletions.
10 changes: 5 additions & 5 deletions Jenkinsfile
@@ -296,7 +296,7 @@ pipeline {
ulimit -c unlimited
cd build
CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug -DMIOPEN_TEST_FLAGS=--disable-verification-cache ..
CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check
CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check
"""

}
@@ -342,7 +342,7 @@ pipeline {
ulimit -c unlimited
cd build
CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_USE_COMGR=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=debug -DMIOPEN_TEST_FLAGS='--verbose --disable-verification-cache' ..
CTEST_PARALLEL_LEVEL=2 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check
CTEST_PARALLEL_LEVEL=2 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check
"""
}
steps{
@@ -368,7 +368,7 @@ pipeline {
ulimit -c unlimited
cd build
CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DBUILD_SHARED_LIBS=Off -DCMAKE_BUILD_TYPE=release -DMIOPEN_TEST_FLAGS=--disable-verification-cache ..
CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check
CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 make -j\$(nproc) check
"""
}
steps{
@@ -400,7 +400,7 @@ pipeline {
cd build
CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release ..
make -j test_conv2d
MIOPEN_FIND_MODE=normal CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 bin/test_conv2d --disable-verification-cache
MIOPEN_FIND_MODE=normal CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 bin/test_conv2d --disable-verification-cache
"""
}
steps{
@@ -427,7 +427,7 @@ pipeline {
cd build
CXX=/opt/rocm/llvm/bin/clang++ cmake -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release ..
make -j test_conv2d
MIOPEN_FIND_MODE=fast CTEST_PARALLEL_LEVEL=4 MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM=0 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 bin/test_conv2d --disable-verification-cache
MIOPEN_FIND_MODE=fast CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 bin/test_conv2d --disable-verification-cache
"""
}
steps{
1 change: 1 addition & 0 deletions README.md
@@ -20,6 +20,7 @@ MIOpen supports two programming models -
* [Half](http://half.sourceforge.net/) - IEEE 754-based half-precision floating point library
* [Boost](http://www.boost.org/) at version 1.72 (other versions are not supported)
* MIOpen uses `boost-system` and `boost-filesystem` packages to enable persistent [kernel cache](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/cache.html)
* [SQLite3](https://sqlite.org/index.html) for reading and writing the performance database
* [MIOpenTENSILE](https://github.com/ROCmSoftwarePlatform/MIOpenTensile) Users can enable this library using the cmake configuration flag `-DMIOPEN_USE_MIOPENTENSILE=On`.
* [rocBlas](https://github.com/ROCmSoftwarePlatform/rocBLAS)
* Minimum version branch for pre-ROCm 3.5 [master-rocm-2.10](https://github.com/ROCmSoftwarePlatform/rocBLAS/tree/master-rocm-2.10)
8 changes: 8 additions & 0 deletions driver/lrn_driver.hpp
@@ -250,6 +250,14 @@ int LRNDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
{
scale = std::vector<Tgpu>(workSpaceNbVal, static_cast<Tgpu>(0));
scalehost = std::vector<Tref>(workSpaceNbVal, static_cast<Tref>(0));
if(inflags.GetValueInt("forw") == 2)
{
for(int i = 0; i < scale.size(); i++)
{
scale[i] = RAN_GEN<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
scalehost[i] = Tref(scale[i]);
}
}
}
din = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
dout = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
6 changes: 3 additions & 3 deletions driver/tensor_driver.hpp
@@ -31,9 +31,9 @@
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include <miopen/tensor_extra.hpp>
#include <miopen/tensor_layout.hpp>
#include <numeric>
#include <vector>
#include "../test/tensor_layout.hpp"

std::vector<int> GetTensorLengths(miopenTensorDescriptor_t& tensor)
{
@@ -125,14 +125,14 @@ int SetTensorNd(miopenTensorDescriptor_t t,
}

// Dimension lengths vector 'len' comes with a default layout.
std::string len_layout = tensor_layout_get_default(layout.size());
std::string len_layout = miopen::tensor_layout_get_default(layout.size());
if(len_layout.empty())
{
return SetTensorNd(t, len, data_type);
}

std::vector<int> strides;
tensor_layout_to_strides(len, len_layout, layout, strides);
miopen::tensor_layout_to_strides(len, len_layout, layout, strides);

return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), strides.data());
}
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -153,6 +153,7 @@ set( MIOpen_Source
include/miopen/mlir_build.hpp
include/miopen/oclkernel.hpp
include/miopen/tensor.hpp
include/miopen/tensor_layout.hpp
include/miopen/tensor_ops.hpp
include/miopen/pooling.hpp
include/miopen/lrn.hpp
24 changes: 24 additions & 0 deletions src/conv/problem_description.cpp
@@ -28,6 +28,7 @@

#include <miopen/conv/data_invoke_params.hpp>
#include <miopen/conv/wrw_invoke_params.hpp>
#include <miopen/tensor_layout.hpp>

#include <sstream>

@@ -59,6 +60,29 @@ std::ostream& operator<<(std::ostream& stream, std::function<void(std::ostream&)
return stream;
}

void ProblemDescription::HeuristicUpdateLayouts()
{
const std::string labels = tensor_layout_get_default(in_layout.size());

static const std::vector<std::string> supported_layouts = {"NCHW", "NHWC", "NCDHW"};
for(const std::string& layout : supported_layouts)
{
// Skip layouts that don't match the number of dimensions
if(layout.size() != labels.size())
continue;

if(in.IsPossibleLayout(labels, layout) && out.IsPossibleLayout(labels, layout) &&
weights.IsPossibleLayout(labels, layout))
{
in_layout = layout;
weights_layout = layout;
out_layout = layout;
return;
}
}
// If we did not find a consistent layout, leave them as-is
}

void ProblemDescription::BuildConfKey(std::string& conf_key) const
{
std::ostringstream ss;
26 changes: 22 additions & 4 deletions src/convolution.cpp
@@ -36,6 +36,7 @@
#include <miopen/mlo_internal.hpp>
#include <miopen/solver.hpp>
#include <miopen/tensor.hpp>
#include <miopen/tensor_layout.hpp>
#include <miopen/algorithm.hpp>

#include <cassert>
@@ -140,9 +141,11 @@ const std::vector<int>& ConvolutionDescriptor::GetTransposeConvPads() const

int ConvolutionDescriptor::GetGroupCount() const { return group_count; }

TensorDescriptor ConvolutionDescriptor::GetForwardOutputTensor(const TensorDescriptor& xDesc,
const TensorDescriptor& wDesc,
miopenDataType_t yType) const
TensorDescriptor
ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor& xDesc,
const TensorDescriptor& wDesc,
const std::string& yLayout,
miopenDataType_t yType) const
{
const std::size_t spatial_dim = GetSpatialDimension();

@@ -255,10 +258,25 @@ TensorDescriptor ConvolutionDescriptor::GetForwardOutputTensor(const TensorDescr
out_lens[0] = in_n;
out_lens[1] = out_c;

const std::string default_layout = tensor_layout_get_default(xDesc.GetSize());
std::vector<std::size_t> out_strides;
tensor_layout_to_strides(out_lens, default_layout, yLayout, out_strides);

return TensorDescriptor((xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4
? (yType == miopenInt32 ? yType : miopenFloat)
: xDesc.GetType()),
out_lens);
out_lens,
out_strides);
}

TensorDescriptor ConvolutionDescriptor::GetForwardOutputTensor(const TensorDescriptor& xDesc,
const TensorDescriptor& wDesc,
miopenDataType_t yType) const
{
// output layout same as input
const std::string default_layout = tensor_layout_get_default(xDesc.GetSize());
const std::string in_layout = xDesc.GetLayout(default_layout);
return GetForwardOutputTensorWithLayout(xDesc, wDesc, in_layout, yType);
}

/// There is assumption that if Winograd is applicable and granularity loss is low, then there is no
4 changes: 4 additions & 0 deletions src/include/miopen/conv/problem_description.hpp
@@ -144,6 +144,7 @@ struct ProblemDescription
direction(direction_),
bias(bias_)
{
HeuristicUpdateLayouts();
}

// Conv descriptor getters
@@ -303,6 +304,9 @@ struct ProblemDescription
}

bool IsLayoutDefault() const;

void HeuristicUpdateLayouts();

void BuildConfKey(std::string& conf_key) const;

NetworkConfig BuildConfKey() const
5 changes: 5 additions & 0 deletions src/include/miopen/convolution.hpp
@@ -99,6 +99,11 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor

int GetGroupCount() const;

TensorDescriptor GetForwardOutputTensorWithLayout(const TensorDescriptor& xDesc,
const TensorDescriptor& wDesc,
const std::string& yLayout,
miopenDataType_t yType = miopenFloat) const;

TensorDescriptor GetForwardOutputTensor(const TensorDescriptor& xDesc,
const TensorDescriptor& wDesc,
miopenDataType_t yType = miopenFloat) const;
17 changes: 11 additions & 6 deletions src/include/miopen/tensor.hpp
@@ -32,6 +32,7 @@
#include <miopen/each_args.hpp>
#include <miopen/returns.hpp>
#include <miopen/errors.hpp>
#include <miopen/functional.hpp>

#include <algorithm>
#include <cassert>
@@ -175,13 +176,17 @@ struct TensorDescriptor : miopenTensorDescriptor

std::string ToString() const;

template <class Vector, class Op>
static inline std::vector<int64_t> sort_permutation(const Vector& data, Op op)
bool IsPossibleLayout(const std::string& labels, const std::string& layout) const;

static inline std::vector<int64_t> find_permutation(const std::vector<std::size_t>& lens,
const std::vector<std::size_t>& strides)
{
std::vector<std::int64_t> result(data.size());
std::vector<std::int64_t> result(lens.size());
std::iota(result.begin(), result.end(), 0);
std::sort(
result.begin(), result.end(), [&](auto x, auto y) { return op(data[x], data[y]); });
std::stable_sort(
result.begin(),
result.end(),
by(std::greater<>{}, [&](auto x) { return std::make_tuple(strides[x], lens[x]); }));
return result;
}

@@ -196,7 +201,7 @@ struct TensorDescriptor : miopenTensorDescriptor
// Copy construct the result string from labels. This allocates the space at one go
// and is faster than calling push_back in transform.
auto result = labels;
auto p = sort_permutation(strides, std::greater<>{});
auto p = find_permutation(lens, strides);
std::transform(p.begin(), p.end(), result.begin(), [&](auto i) { return labels[i]; });
return result;
}
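For illustration only (not part of this commit), a minimal sketch of what the new stable, stride-major find_permutation buys. It assumes the vector-based TensorDescriptor constructor and the GetLayout(labels) helper referenced elsewhere in this change; the 2x3x4x5 shape and its strides are hypothetical.

// Illustration: a descriptor whose strides describe packed NHWC storage
// should now report the "NHWC" label order from GetLayout("NCHW").
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include <cassert>
#include <vector>

int main()
{
    const std::vector<std::size_t> lens    = {2, 3, 4, 5};   // N, C, H, W lengths
    const std::vector<std::size_t> strides = {60, 1, 15, 3}; // strides of a packed NHWC buffer
    miopen::TensorDescriptor desc(miopenFloat, lens, strides);
    // find_permutation orders dimensions by descending stride (ties broken by length,
    // and stably), so the label string is permuted into the actual memory order.
    assert(desc.GetLayout("NCHW") == "NHWC");
    return 0;
}

Replacing the stride-only sort with std::stable_sort keyed on (stride, length) keeps the result deterministic when length-1 dimensions share a stride, which the old sort_permutation over strides alone did not guarantee.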
84 changes: 84 additions & 0 deletions src/include/miopen/tensor_layout.hpp
@@ -0,0 +1,84 @@
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_TENSOR_LAYOUT_HPP
#define GUARD_TENSOR_LAYOUT_HPP

#include <miopen/errors.hpp>
#include <map>
#include <algorithm>
#include <vector>
#include <string>
#include <iterator>

namespace miopen {

template <typename T>
void tensor_layout_to_strides(const std::vector<T>& len,
const std::string& len_layout,
const std::string& layout,
std::vector<T>& strides)
{
// Bind the layout and the dimension lengths together into a map.
std::map<char, T> dim_to_len;
std::transform(len.begin(),
len.end(),
len_layout.begin(),
std::inserter(dim_to_len, dim_to_len.end()),
[](T l, char dim) { return std::make_pair(dim, l); });

// Now construct the strides according to the layout by multiplying the
// dimension lengths together.
std::transform(len_layout.begin(),
len_layout.end(),
std::back_inserter(strides),
[&layout, &dim_to_len](char cur_layout_char) {
auto pos = layout.find(cur_layout_char);
if(pos == std::string::npos)
{
MIOPEN_THROW(std::string("mismatched layout string, unexpect char: ")
.append(1, cur_layout_char));
}
return std::accumulate(layout.begin() + pos + 1,
layout.end(),
1,
[&dim_to_len](T accumulator, char l) {
return accumulator * dim_to_len[l];
});
});
}

inline std::string tensor_layout_get_default(int size)
{
if(size == 4)
return "NCHW";
if(size == 5)
return "NCDHW";
return "";
}

} // namespace miopen

#endif
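
For reference, a minimal usage sketch of the two helpers defined above (illustration only, not part of this commit; the 2x3x4x5 shape and the NHWC target layout are hypothetical).

// Illustration: derive strides for NHWC storage from lengths given in NCHW order.
#include <miopen/tensor_layout.hpp>
#include <cassert>
#include <numeric>
#include <string>
#include <vector>

int main()
{
    const std::vector<std::size_t> len = {2, 3, 4, 5};                   // N, C, H, W lengths
    const std::string len_layout = miopen::tensor_layout_get_default(4); // "NCHW"

    std::vector<std::size_t> strides;
    miopen::tensor_layout_to_strides(len, len_layout, "NHWC", strides);

    // Each stride is the product of the lengths of the dimensions that follow it
    // in the requested layout: N -> H*W*C = 60, C -> 1, H -> W*C = 15, W -> C = 3.
    assert((strides == std::vector<std::size_t>{60, 1, 15, 3}));
    return 0;
}

This is the same pair of calls that SetTensorNd in driver/tensor_driver.hpp and the convolution and pooling output-descriptor changes in this commit rely on.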
9 changes: 8 additions & 1 deletion src/pooling.cpp
@@ -26,6 +26,7 @@
#include <miopen/pooling.hpp>
#include <miopen/logger.hpp>
#include <miopen/tensor.hpp>
#include <miopen/tensor_layout.hpp>
#include <miopen/datatype.hpp>

#include <cassert>
@@ -216,7 +217,13 @@ TensorDescriptor PoolingDescriptor::GetForwardOutputTensor(const TensorDescripto
{
std::vector<int> out_dim(xDesc.GetSize());
GetForwardOutputDimNd(xDesc, xDesc.GetSize(), out_dim.data());
return TensorDescriptor(xDesc.GetType(), out_dim);

const std::string default_layout = tensor_layout_get_default(xDesc.GetSize());
const std::string in_layout = xDesc.GetLayout(default_layout);
std::vector<int> out_strides;
tensor_layout_to_strides(out_dim, default_layout, in_layout, out_strides);

return TensorDescriptor(xDesc.GetType(), out_dim, out_strides);
}

std::size_t PoolingDescriptor::GetWorkSpaceSize(const TensorDescriptor& yDesc) const
8 changes: 8 additions & 0 deletions src/tensor.cpp
@@ -28,6 +28,7 @@
#include <miopen/errors.hpp>
#include <miopen/logger.hpp>
#include <miopen/tensor.hpp>
#include <miopen/tensor_layout.hpp>
#include <numeric>
#include <string>

@@ -122,6 +123,13 @@ std::size_t TensorDescriptor::GetElementSpace() const
1;
}

bool TensorDescriptor::IsPossibleLayout(const std::string& labels, const std::string& layout) const
{
std::vector<size_t> derived_strides;
tensor_layout_to_strides(lens, labels, layout, derived_strides);
return derived_strides == strides;
}

std::size_t TensorDescriptor::GetNumBytes() const
{
std::size_t typesize = 0;
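Finally, a hedged example of the new IsPossibleLayout predicate (illustration only, not part of this commit; the same hypothetical 2x3x4x5 tensor as before). A descriptor with packed NHWC strides satisfies "NHWC" but not "NCHW", which is the consistency check that ProblemDescription::HeuristicUpdateLayouts applies to the input, weight, and output descriptors.

// Illustration: IsPossibleLayout(labels, layout) is true only if the strides
// derived for 'layout' match the descriptor's actual strides.
#include <miopen/miopen.h>
#include <miopen/tensor.hpp>
#include <cassert>
#include <vector>

int main()
{
    const std::vector<std::size_t> lens    = {2, 3, 4, 5};   // N, C, H, W lengths
    const std::vector<std::size_t> strides = {60, 1, 15, 3}; // packed NHWC strides
    miopen::TensorDescriptor desc(miopenFloat, lens, strides);

    assert(desc.IsPossibleLayout("NCHW", "NHWC"));  // matches NHWC storage
    assert(!desc.IsPossibleLayout("NCHW", "NCHW")); // not packed NCHW
    return 0;
}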