Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMD: optimize VEGA auto suggestion #2494

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions xmrstak/backend/amd/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,21 @@ class autoAdjust
}
}

// check if cryptonight_monero_v8 is selected for the user or dev pool
bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());

// true for all cryptonight_heavy derivates since we check the user and dev pool
bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();

// true for cryptonight_gpu as main user pool algorithm
bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;

bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;

bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;

// 8 threads per block (this is a good value for the most gpus)
uint32_t default_workSize = 8;
size_t minFreeMem = 128u * byteToMiB;
/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
* sowing down the memory performance because of TLB cache misses
Expand All @@ -130,6 +145,9 @@ class autoAdjust
* to avoid out of memory errors
*/
maxThreads = 2024u;

if(useCryptonight_gpu)
default_workSize = 16u;
}

// NVIDIA optimizations
Expand All @@ -142,19 +160,6 @@ class autoAdjust
minFreeMem = 512u * byteToMiB;
}

// check if cryptonight_monero_v8 is selected for the user or dev pool
bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());

// true for all cryptonight_heavy derivates since we check the user and dev pool
bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();

// true for cryptonight_gpu as main user pool algorithm
bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;

bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;

bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;

// set strided index to default
ctx.stridedIndex = 1;

Expand Down Expand Up @@ -203,11 +208,11 @@ class autoAdjust
size_t perThread = hashMemSize + 240u;
size_t maxIntensity = memPerThread / perThread;
size_t possibleIntensity = std::min(maxThreads, maxIntensity);
// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
// map intensity to a multiple of the compute unit count, default_workSize is the number of threads per work group
size_t intensity = (possibleIntensity / (default_workSize * ctx.computeUnits)) * ctx.computeUnits * default_workSize;
// in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units
if(numThreads == 2)
intensity = (possibleIntensity / 8) * 8;
intensity = (possibleIntensity / default_workSize) * default_workSize;

//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
if(intensity == 0)
Expand All @@ -225,9 +230,8 @@ class autoAdjust
conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n";
conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
// set 8 threads per block (this is a good value for the most gpus)
conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
" \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(default_workSize) + ",\n" +
" \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
" \"unroll\" : " +
std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +
Expand Down