Skip to content

Commit

Permalink
Merge pull request #80 from ShijieYan/master
Browse files Browse the repository at this point in the history
add double-buffer to solve fangq/mcx#41
  • Loading branch information
fangq authored Sep 21, 2022
2 parents c3b15b2 + e661011 commit 4e1999e
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 19 deletions.
13 changes: 7 additions & 6 deletions src/mmc_cl_host.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/

cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer

cl_float* field, *dref = NULL;

Expand All @@ -119,7 +120,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
0.f,
#endif
mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z}},
cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
(MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
Expand Down Expand Up @@ -238,7 +239,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
fullload = totalcucore;
}

field = (cl_float*)calloc(sizeof(cl_float) * meshlen, cfg->maxgate);
field = (cl_float*)calloc(sizeof(cl_float) * meshlen * 2, cfg->maxgate);
dref = (cl_float*)calloc(sizeof(cl_float) * mesh->nf, cfg->maxgate);
Pdet = (float*)calloc(cfg->maxdetphoton * sizeof(float), hostdetreclen);

Expand Down Expand Up @@ -303,7 +304,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
}

OCL_ASSERT(((gseed[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN, Pseed, &status), status)));
OCL_ASSERT(((gweight[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * fieldlen, field, &status), status)));
OCL_ASSERT(((gweight[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * fieldlen * 2, field, &status), status)));
OCL_ASSERT(((gdref[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * nflen, dref, &status), status)));
OCL_ASSERT(((gdetphoton[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * cfg->maxdetphoton * hostdetreclen, Pdet, &status), status)));

Expand Down Expand Up @@ -652,15 +653,15 @@ is more than what your have specified (%d), please use the -H option to specify

//handling the 2pt distributions
if (cfg->issave2pt) {
float* rawfield = (float*)malloc(sizeof(float) * fieldlen);
float* rawfield = (float*)malloc(sizeof(float) * fieldlen * 2);

OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gweight[devid], CL_TRUE, 0, sizeof(cl_float)*fieldlen,
OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gweight[devid], CL_TRUE, 0, sizeof(cl_float)*fieldlen * 2,
rawfield, 0, NULL, NULL)));
MMC_FPRINTF(cfg->flog, "transfer complete: %d ms\n", GetTimeMillis() - tic);
fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
field[(i >> cfg->nbuffer)] += rawfield[i]; //+rawfield[i+fieldlen];
field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
}

free(rawfield);
Expand Down
42 changes: 35 additions & 7 deletions src/mmc_core.cl
Original file line number Diff line number Diff line change
Expand Up @@ -450,11 +450,11 @@ __device__ float rand_next_scatlen(__private RandType t[RAND_BUF_LEN]) {
// https://devtalk.nvidia.com/default/topic/458062/atomicadd-float-float-atomicmul-float-float-/

__device__ inline float atomicadd(volatile __global float* address, const float value) {
float old = value;
float old = value, orig;

while ((old = atomic_xchg(address, atomic_xchg(address, 0.0f) + old)) != 0.0f);
while ((old = atomic_xchg(address, (orig=atomic_xchg(address, 0.0f)) + old)) != 0.0f);

return old;
return orig;
}

/*
Expand Down Expand Up @@ -680,7 +680,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __

if (r->oldweight > 0.f) {
#ifdef USE_ATOMIC
atomicadd(weight + r->oldidx, r->oldweight);
float oldval = atomicadd(weight + r->oldidx, r->oldweight);
if (oldval > MAX_ACCUM) {
if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
atomicadd(weight + r->oldidx, oldval);
} else {
atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
}
}
#else
weight[r->oldidx] += r->oldweight;
#endif
Expand All @@ -697,7 +704,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __

if (r->faceid == -2 || !r->isend) {
#ifdef USE_ATOMIC
atomicadd(weight + newidx, r->oldweight);
float oldval = atomicadd(weight + newidx, r->oldweight);
if (oldval > MAX_ACCUM) {
if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
atomicadd(weight + r->oldidx, oldval);
} else {
atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
}
}
#else
weight[newidx] += r->oldweight;
#endif
Expand Down Expand Up @@ -744,7 +758,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
if (newidx != r->oldidx) {
#ifndef DO_NOT_SAVE
#ifdef USE_ATOMIC
atomicadd(weight + r->oldidx, r->oldweight);
float oldval = atomicadd(weight + r->oldidx, r->oldweight);
if (oldval > MAX_ACCUM) {
if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
atomicadd(weight + r->oldidx, oldval);
} else {
atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
}
}
#else
weight[r->oldidx] += r->oldweight;
#endif
Expand All @@ -759,7 +780,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __

if (r->faceid == -2 || !r->isend) {
#ifdef USE_ATOMIC
atomicadd(weight + newidx, r->oldweight);
float oldval = atomicadd(weight + r->oldidx, r->oldweight);
if (oldval > MAX_ACCUM) {
if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
atomicadd(weight + r->oldidx, oldval);
} else {
atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
}
}
#else
weight[newidx] += r->oldweight;
#endif
Expand Down
13 changes: 7 additions & 6 deletions src/mmc_cu_host.cu
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
MCXReporter* greporter;
uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne)
<< cfg->nbuffer; // use 4 copies to reduce racing
cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer

float* field, *dref = NULL;

Expand Down Expand Up @@ -372,7 +373,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
oddphotons =
(int)(cfg->nphoton * cfg->workload[gpuid] / (fullload * cfg->respin) -
threadphoton * gpu[gpuid].autothread);
field = (float*)calloc(sizeof(float) * meshlen, cfg->maxgate);
field = (float*)calloc(sizeof(float) * meshlen * 2, cfg->maxgate);
dref = (float*)calloc(sizeof(float) * mesh->nf, cfg->maxgate);
Pdet = (float*)calloc(cfg->maxdetphoton * sizeof(float), hostdetreclen);

Expand Down Expand Up @@ -457,8 +458,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
gseed, Pseed, sizeof(uint) * gpu[gpuid].autothread * RAND_SEED_WORD_LEN,
cudaMemcpyHostToDevice));

CUDA_ASSERT(cudaMalloc((void**)&gweight, sizeof(float) * fieldlen));
CUDA_ASSERT(cudaMemcpy(gweight, field, sizeof(float) * fieldlen,
CUDA_ASSERT(cudaMalloc((void**)&gweight, sizeof(float) * fieldlen * 2));
CUDA_ASSERT(cudaMemcpy(gweight, field, sizeof(float) * fieldlen * 2,
cudaMemcpyHostToDevice));

CUDA_ASSERT(cudaMalloc((void**)&gdref, sizeof(float) * nflen));
Expand Down Expand Up @@ -683,16 +684,16 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo

// handling the 2pt distributions
if (cfg->issave2pt) {
float* rawfield = (float*)malloc(sizeof(float) * fieldlen);
float* rawfield = (float*)malloc(sizeof(float) * fieldlen * 2);

CUDA_ASSERT(cudaMemcpy(rawfield, gweight, sizeof(float) * fieldlen,
CUDA_ASSERT(cudaMemcpy(rawfield, gweight, sizeof(float) * fieldlen * 2,
cudaMemcpyDeviceToHost));
MMC_FPRINTF(cfg->flog, "transfer complete: %d ms\n",
GetTimeMillis() - tic);
fflush(cfg->flog);

for (i = 0; i < fieldlen; i++) { // accumulate field, can be done in the GPU
field[(i >> cfg->nbuffer)] += rawfield[i]; //+rawfield[i+fieldlen];
field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen]; //+rawfield[i+fieldlen];
}

free(rawfield);
Expand Down

0 comments on commit 4e1999e

Please sign in to comment.