From 2c6b13aad9fa1f7a79ac1995618b47bcb9f19656 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Sun, 30 Jun 2019 22:35:59 +0200
Subject: [PATCH] NVIDIA: relax 'threads' config

- with #2443 the possible maximum number of threads per block was
reduced because phase3 used 16 threads per share (this is now
automatically avoided if the thread limit is exceeded)
- warn the user if `threads` is too large and adjust it to a valid value
---
 .../backend/nvidia/nvcc_code/cryptonight.hpp  |  1 +
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 27 ++++++++++++++-----
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 19 ++++++++++++-
 3 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index 78abd7a3d..29e29d12c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -18,6 +18,7 @@ typedef struct
 	int device_threads;
 	int device_bfactor;
 	int device_bsleep;
+	int device_maxThreadsPerBlock;
 	int syncMode;
 	bool memMode;
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 4c4698b4b..53345c95c 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -895,12 +895,19 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 		roundsPhase3 *= 2;
 	}
 
+	int blockSizePhase3 = block8.x; // fallback launch shape, kept when doubling would exceed the device limit
+	int gridSizePhase3 = grid.x;
+	if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) // only double the block if the device allows it
+	{
+		blockSizePhase3 *= 2;
+		gridSizePhase3 = (grid.x + 1) / 2; // twice the threads per block -> half the blocks, rounded up
+	}
 	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-											  (grid.x + 1) / 2,
-											  block8.x * 2,
-											  2  * block8.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  gridSizePhase3,
+											  blockSizePhase3,
+											  blockSizePhase3 * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 											  ITERATIONS,
 											  MEM,
 											  ctx->device_blocks * ctx->device_threads,
@@ -966,12 +973,20 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_
 		roundsPhase3 *= 2;
 	}
 
+	int blockSizePhase3 = block8.x; // fallback launch shape, kept when doubling would exceed the device limit
+	int gridSizePhase3 = grid.x;
+	if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) // only double the block if the device allows it
+	{
+		blockSizePhase3 *= 2;
+		gridSizePhase3 = (grid.x + 1) / 2; // twice the threads per block -> half the blocks, rounded up
+	}
+
 	for(int i = 0; i < roundsPhase3; i++)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<ALGO><<<
-											  (grid.x + 1) / 2,
-											  block8.x * 2 ,
-											  2 * block8.x * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
+											  gridSizePhase3,
+											  blockSizePhase3,
+											  blockSizePhase3 * sizeof(uint32_t) * static_cast<int>(ctx->device_arch[0] < 3)>>>(
 											  ITERATIONS,
 											  MEM / 4,
 											  ctx->device_blocks * ctx->device_threads,
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index aa7c17057..8c1bb8b5d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -560,12 +560,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	ctx->device_mpcount = props.multiProcessorCount;
 	ctx->device_arch[0] = props.major;
 	ctx->device_arch[1] = props.minor;
+	ctx->device_maxThreadsPerBlock = props.maxThreadsPerBlock;
 
 	const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1];
 
 	ctx->name = std::string(props.name);
 
-	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... ",
+	printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"...\n",
 		version / 1000, (version % 1000 / 10),
 		CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10,
 		ctx->device_id, gpuArch, ctx->device_name);
@@ -803,6 +804,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 			}
 		}
 	}
+
+	if(useCryptonight_gpu)
+	{
+		// cryptonight_gpu used 16 threads per share
+		if(ctx->device_threads * 16 > ctx->device_maxThreadsPerBlock)
+		{
+			ctx->device_threads = ctx->device_maxThreadsPerBlock / 16;
+			printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads);
+		}
+	}
+	else if(ctx->device_threads * 8 > ctx->device_maxThreadsPerBlock)
+	{
+		// by default cryptonight CUDA implementations uses 8 threads per thread for some kernel
+		ctx->device_threads = ctx->device_maxThreadsPerBlock / 8;
+		printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads);
+	}
 	printf("device init succeeded\n");
 
 	return 0;