From 2c6b13aad9fa1f7a79ac1995618b47bcb9f19656 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sun, 30 Jun 2019 22:35:59 +0200 Subject: [PATCH] NVIDIA: relax 'threads' config - with #2443 the possible maximum number of threads per block was reduced because phase3 used 16 threads per share (is automatically avoided if thread limit is exceeded) - warn user if `threads` is too large and adjust to a valid value --- .../backend/nvidia/nvcc_code/cryptonight.hpp | 1 + xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 27 ++++++++++++++----- .../backend/nvidia/nvcc_code/cuda_extra.cu | 19 ++++++++++++- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 78abd7a3d..29e29d12c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -18,6 +18,7 @@ typedef struct int device_threads; int device_bfactor; int device_bsleep; + int device_maxThreadsPerBlock; int syncMode; bool memMode; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 4c4698b4b..53345c95c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -895,12 +895,19 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo roundsPhase3 *= 2; } + int blockSizePhase3 = block8.x; + int gridSizePhase3 = grid.x; + if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) + { + blockSizePhase3 *= 2; + gridSizePhase3 = (gridSizePhase3 + 1) / 2; /* ceil(grid.x / 2): blocks are twice as large, so half as many are launched */ + } for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - (grid.x + 1) / 2, - block8.x * 2, - 2 * block8.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + gridSizePhase3, + blockSizePhase3, + blockSizePhase3 * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM, 
ctx->device_blocks * ctx->device_threads, @@ -966,12 +973,20 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_ roundsPhase3 *= 2; } + int blockSizePhase3 = block8.x; + int gridSizePhase3 = grid.x; + if(blockSizePhase3 * 2 <= ctx->device_maxThreadsPerBlock) + { + blockSizePhase3 *= 2; + gridSizePhase3 = (gridSizePhase3 + 1) / 2; /* ceil(grid.x / 2): blocks are twice as large, so half as many are launched */ + } + for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - (grid.x + 1) / 2, - block8.x * 2 , - 2 * block8.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + gridSizePhase3, + blockSizePhase3, + blockSizePhase3 * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM / 4, ctx->device_blocks * ctx->device_threads, diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index aa7c17057..8c1bb8b5d 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -560,12 +560,13 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->device_mpcount = props.multiProcessorCount; ctx->device_arch[0] = props.major; ctx->device_arch[1] = props.minor; + ctx->device_maxThreadsPerBlock = props.maxThreadsPerBlock; const int gpuArch = ctx->device_arch[0] * 10 + ctx->device_arch[1]; ctx->name = std::string(props.name); - printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"... 
", + printf("CUDA [%d.%d/%d.%d] GPU#%d, device architecture %d: \"%s\"...\n", version / 1000, (version % 1000 / 10), CUDART_VERSION / 1000, (CUDART_VERSION % 1000) / 10, ctx->device_id, gpuArch, ctx->device_name); @@ -803,6 +804,22 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } } } + + if(useCryptonight_gpu) + { + // cryptonight_gpu uses 16 threads per share + if(ctx->device_threads * 16 > ctx->device_maxThreadsPerBlock) + { + ctx->device_threads = ctx->device_maxThreadsPerBlock / 16; + printf("WARNING: 'threads' configuration too large, value adjusted to %i\n", ctx->device_threads); + } + } + else if(ctx->device_threads * 8 > ctx->device_maxThreadsPerBlock) + { + // by default the cryptonight CUDA implementation uses 8 CUDA threads per configured thread for some kernels + ctx->device_threads = ctx->device_maxThreadsPerBlock / 8; + printf("WARNING: 'threads' configuration too large, value adjusted to %i\n", ctx->device_threads); + } printf("device init succeeded\n"); return 0;