apache · eric-haibin-lin · Oct 8, 2018 · Oct 2, 2018
@@ -158,7 +158,17 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   - Performance tests are run to pick the convolution algo when value is 1 or 2
   - Value of 1 chooses the best algo in a limited workspace
   - Value of 2 chooses the fastest algo whose memory requirements may be larger than the default workspace threshold
-
+
+* MXNET_CUDA_ALLOW_TENSOR_CORE
+  - 0(false) or 1(true) ```(default=1)```
+	- If set to '0', disallows Tensor Core use in CUDA ops.
+	- If set to '1', allows Tensor Core use in CUDA ops.
+  - This variable can only be set once in a session.
+
+* MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
+  - 0(false) or 1(true) ```(default=0)```
+	- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
+	- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
 
 * MXNET_GLUON_REPO
   - Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'```

diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
@@ -374,6 +374,22 @@ inline bool GetEnvAllowTensorCore() {
   return allow_tensor_core;
 }
 
+// The policy if the user hasn't set the environment variable
+// CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
+#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT false
+
+/*!
+ * \brief Returns global policy for TensorCore implicit type casting
+ */
+inline bool GetEnvAllowTensorCoreConversion() {
+  // Use of optional<bool> here permits: "0", "1", "true" and "false" to all be
+  // legal.
+  bool default_value = MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT;
+  return dmlc::GetEnv("MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION",
+                      dmlc::optional<bool>(default_value))
+      .value();
+}
+
 #if CUDA_VERSION >= 9000
 // Sets the cuBLAS math mode that determines the 'allow TensorCore' policy.  Returns previous.
 inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {

diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
@@ -496,6 +496,11 @@ class CuDNNRNNOp : public Operator{
         if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) {
           math_type = CUDNN_TENSOR_OP_MATH;
         }
+      #if CUDNN_VERSION >= 7200
+            if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() &&
+                (DataType<DType>::kFlag != kFloat16))
+              math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION;
+      #endif
         CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type));
       #endif
       // Get temp space sizes

diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -543,6 +543,11 @@ class CuDNNConvolutionOp {
     #if CUDNN_MAJOR >= 7
       cudnnMathType_t math_type = cudnn_tensor_core_ ? CUDNN_TENSOR_OP_MATH
                                                     : CUDNN_DEFAULT_MATH;
+      #if CUDNN_VERSION >= 7200
+            if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() &&
+                (DataType<DType>::kFlag != kFloat16))
+              math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION;
+      #endif
       CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, math_type));
       CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, math_type));
       CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, math_type));