Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVIDIA] fix possible out of memory with auto cfg #2538

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 11 additions & 17 deletions xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
if(props.multiProcessorCount <= 6)
ctx->device_bfactor += 2;
}

// for most algorithms we use 8 threads per hash
uint32_t threadsPerHash = 8;

if(ctx->device_threads == -1)
{
/* sm_20 devices can only run 512 threads per cuda block
Expand All @@ -642,9 +646,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
*/
const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024;

// for most algorithms we use 8 threads per hash
uint32_t threadsPerHash = 8;

// phase2_gpu uses 16 threads per hash
if(useCryptonight_gpu)
threadsPerHash = 16;
Expand Down Expand Up @@ -789,6 +790,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
// 8 is chosen by checking the occupancy calculator
size_t blockOptimal = 8 * ctx->device_mpcount;

if(gpuArch == 30)
blockOptimal = 8 * ctx->device_mpcount;
// the following values are calculated with CUDA10 and the occupancy calculator
if(gpuArch == 35 || gpuArch / 10 == 5 || gpuArch / 10 == 6)
blockOptimal = 7 * ctx->device_mpcount;
Expand All @@ -798,26 +801,17 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
blockOptimal = 6 * ctx->device_mpcount;

if(blockOptimal * threads * hashMemSize < limitedMemory)
{
ctx->device_threads = threads;
ctx->device_blocks = blockOptimal;
}
else
ctx->device_blocks = limitedMemory / hashMemSize / threads; // round to a memory fitting value
ctx->device_threads = threads;
}
}

if(useCryptonight_gpu)
{
// cryptonight_gpu used 16 threads per share
if(ctx->device_threads * 16 > ctx->device_maxThreadsPerBlock)
{
ctx->device_threads = ctx->device_maxThreadsPerBlock / 16;
printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads);
}
}
else if(ctx->device_threads * 8 > ctx->device_maxThreadsPerBlock)
if(ctx->device_threads * threadsPerHash > ctx->device_maxThreadsPerBlock)
{
// by default the cryptonight CUDA implementation uses 8 threads per hash for some kernels
ctx->device_threads = ctx->device_maxThreadsPerBlock / 8;
ctx->device_threads = ctx->device_maxThreadsPerBlock / threadsPerHash;
printf("WARNING: 'threads' configuration to large, value adjusted to %i\n", ctx->device_threads);
}
printf("device init succeeded\n");
Expand Down