Merge pull request #80 from ShijieYan/master

add double-buffer to solve fangq/mcx#41
fangq · Sep 21, 2022 · 4e1999e · 4e1999e
2 parents c3b15b2 + e661011
commit 4e1999e
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 19 deletions.
diff --git a/src/mmc_cl_host.c b/src/mmc_cl_host.c
@@ -93,6 +93,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
     cl_mem* gprogress = NULL, *gdetected = NULL, *gphotonseed = NULL; /*write-only buffers*/
 
     cl_uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne) << cfg->nbuffer; // use 4 copies to reduce racing
+    cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
 
     cl_float*  field, *dref = NULL;
 
@@ -119,7 +120,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
         0.f,
 #endif
         mesh->nn, mesh->ne, mesh->nf, {{mesh->nmin.x, mesh->nmin.y, mesh->nmin.z}}, cfg->nout,
-        cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z}},
+        cfg->roulettesize, cfg->srcnum, {{cfg->crop0.x, cfg->crop0.y, cfg->crop0.z, cfg->crop0.w}},
         mesh->srcelemlen, {{cfg->bary0.x, cfg->bary0.y, cfg->bary0.z, cfg->bary0.w}},
         cfg->e0, cfg->isextdet, meshlen, cfg->nbuffer, (mesh->prop + 1 + cfg->isextdet) + cfg->detnum,
         (MIN((MAX_PROP - param.maxpropdet), ((mesh->ne) << 2)) >> 2), /*max count of elem normal data in const mem*/
@@ -238,7 +239,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
         fullload = totalcucore;
     }
 
-    field = (cl_float*)calloc(sizeof(cl_float) * meshlen, cfg->maxgate);
+    field = (cl_float*)calloc(sizeof(cl_float) * meshlen * 2, cfg->maxgate);
     dref = (cl_float*)calloc(sizeof(cl_float) * mesh->nf, cfg->maxgate);
     Pdet = (float*)calloc(cfg->maxdetphoton * sizeof(float), hostdetreclen);
 
@@ -303,7 +304,7 @@ void mmc_run_cl(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, void (*progress
         }
 
         OCL_ASSERT(((gseed[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(cl_uint) * gpu[i].autothread * RAND_SEED_WORD_LEN, Pseed, &status), status)));
-        OCL_ASSERT(((gweight[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * fieldlen, field, &status), status)));
+        OCL_ASSERT(((gweight[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * fieldlen * 2, field, &status), status)));
         OCL_ASSERT(((gdref[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * nflen, dref, &status), status)));
         OCL_ASSERT(((gdetphoton[i] = clCreateBuffer(mcxcontext, RW_MEM, sizeof(float) * cfg->maxdetphoton * hostdetreclen, Pdet, &status), status)));
 
@@ -652,15 +653,15 @@ is more than what your have specified (%d), please use the -H option to specify
 
                 //handling the 2pt distributions
                 if (cfg->issave2pt) {
-                    float* rawfield = (float*)malloc(sizeof(float) * fieldlen);
+                    float* rawfield = (float*)malloc(sizeof(float) * fieldlen * 2);
 
-                    OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gweight[devid], CL_TRUE, 0, sizeof(cl_float)*fieldlen,
+                    OCL_ASSERT((clEnqueueReadBuffer(mcxqueue[devid], gweight[devid], CL_TRUE, 0, sizeof(cl_float)*fieldlen * 2,
                                                     rawfield, 0, NULL, NULL)));
                     MMC_FPRINTF(cfg->flog, "transfer complete:        %d ms\n", GetTimeMillis() - tic);
                     fflush(cfg->flog);
 
                     for (i = 0; i < fieldlen; i++) { //accumulate field, can be done in the GPU
-                        field[(i >> cfg->nbuffer)] += rawfield[i];    //+rawfield[i+fieldlen];
+                        field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
                     }
 
                     free(rawfield);

diff --git a/src/mmc_core.cl b/src/mmc_core.cl
@@ -450,11 +450,11 @@ __device__ float rand_next_scatlen(__private RandType t[RAND_BUF_LEN]) {
 // https://devtalk.nvidia.com/default/topic/458062/atomicadd-float-float-atomicmul-float-float-/
 
 __device__ inline float atomicadd(volatile __global float* address, const float value) {
-    float old = value;
+    float old = value, orig;
 
-    while ((old = atomic_xchg(address, atomic_xchg(address, 0.0f) + old)) != 0.0f);
+    while ((old = atomic_xchg(address, (orig=atomic_xchg(address, 0.0f)) + old)) != 0.0f);
 
-    return old;
+    return orig;
 }
 
 /*
@@ -680,7 +680,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 
                     if (r->oldweight > 0.f) {
 #ifdef USE_ATOMIC
-                        atomicadd(weight + r->oldidx, r->oldweight);
+                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
+                        if (oldval > MAX_ACCUM) {
+                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                                atomicadd(weight + r->oldidx, oldval);
+                            } else {
+                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                            }
+                        }
 #else
                         weight[r->oldidx] += r->oldweight;
 #endif
@@ -697,7 +704,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 
                 if (r->faceid == -2 || !r->isend) {
 #ifdef USE_ATOMIC
-                    atomicadd(weight + newidx, r->oldweight);
+                    float oldval = atomicadd(weight + newidx, r->oldweight);
+                    if (oldval > MAX_ACCUM) {
+                        if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                            atomicadd(weight + r->oldidx, oldval);
+                        } else {
+                            atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                        }
+                    }
 #else
                     weight[newidx] += r->oldweight;
 #endif
@@ -744,7 +758,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
                     if (newidx != r->oldidx) {
 #ifndef DO_NOT_SAVE
 #ifdef USE_ATOMIC
-                        atomicadd(weight + r->oldidx, r->oldweight);
+                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
+                        if (oldval > MAX_ACCUM) {
+                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                                atomicadd(weight + r->oldidx, oldval);
+                            } else {
+                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                            }
+                        }
 #else
                         weight[r->oldidx] += r->oldweight;
 #endif
@@ -759,7 +780,14 @@ __device__ float branchless_badouel_raytet(ray* r, __constant MCXParam* gcfg, __
 
                     if (r->faceid == -2 || !r->isend) {
 #ifdef USE_ATOMIC
-                        atomicadd(weight + newidx, r->oldweight);
+                        float oldval = atomicadd(weight + r->oldidx, r->oldweight);
+                        if (oldval > MAX_ACCUM) {
+                            if (atomicadd(weight + r->oldidx, -oldval) < 0.0f) {
+                                atomicadd(weight + r->oldidx, oldval);
+                            } else {
+                                atomicadd(weight + r->oldidx + gcfg->crop0.w, oldval);
+                            }
+                        }
 #else
                         weight[newidx] += r->oldweight;
 #endif

diff --git a/src/mmc_cu_host.cu b/src/mmc_cu_host.cu
@@ -210,6 +210,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     MCXReporter* greporter;
     uint meshlen = ((cfg->method == rtBLBadouelGrid) ? cfg->crop0.z : mesh->ne)
                    << cfg->nbuffer; // use 4 copies to reduce racing
+    cfg->crop0.w = meshlen * cfg->maxgate; // offset for the second buffer
 
     float* field, *dref = NULL;
 
@@ -372,7 +373,7 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
     oddphotons =
         (int)(cfg->nphoton * cfg->workload[gpuid] / (fullload * cfg->respin) -
               threadphoton * gpu[gpuid].autothread);
-    field = (float*)calloc(sizeof(float) * meshlen, cfg->maxgate);
+    field = (float*)calloc(sizeof(float) * meshlen * 2, cfg->maxgate);
     dref = (float*)calloc(sizeof(float) * mesh->nf, cfg->maxgate);
     Pdet = (float*)calloc(cfg->maxdetphoton * sizeof(float), hostdetreclen);
 
@@ -457,8 +458,8 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
                     gseed, Pseed, sizeof(uint) * gpu[gpuid].autothread * RAND_SEED_WORD_LEN,
                     cudaMemcpyHostToDevice));
 
-    CUDA_ASSERT(cudaMalloc((void**)&gweight, sizeof(float) * fieldlen));
-    CUDA_ASSERT(cudaMemcpy(gweight, field, sizeof(float) * fieldlen,
+    CUDA_ASSERT(cudaMalloc((void**)&gweight, sizeof(float) * fieldlen * 2));
+    CUDA_ASSERT(cudaMemcpy(gweight, field, sizeof(float) * fieldlen * 2,
                            cudaMemcpyHostToDevice));
 
     CUDA_ASSERT(cudaMalloc((void**)&gdref, sizeof(float) * nflen));
@@ -683,16 +684,16 @@ void mmc_run_simulation(mcconfig* cfg, tetmesh* mesh, raytracer* tracer, GPUInfo
 
             // handling the 2pt distributions
             if (cfg->issave2pt) {
-                float* rawfield = (float*)malloc(sizeof(float) * fieldlen);
+                float* rawfield = (float*)malloc(sizeof(float) * fieldlen * 2);
 
-                CUDA_ASSERT(cudaMemcpy(rawfield, gweight, sizeof(float) * fieldlen,
+                CUDA_ASSERT(cudaMemcpy(rawfield, gweight, sizeof(float) * fieldlen * 2,
                                        cudaMemcpyDeviceToHost));
                 MMC_FPRINTF(cfg->flog, "transfer complete:        %d ms\n",
                             GetTimeMillis() - tic);
                 fflush(cfg->flog);
 
                 for (i = 0; i < fieldlen; i++) { // accumulate field, can be done in the GPU
-                    field[(i >> cfg->nbuffer)] += rawfield[i];    //+rawfield[i+fieldlen];
+                    field[(i >> cfg->nbuffer)] += rawfield[i] + rawfield[i + fieldlen];    //+rawfield[i+fieldlen];
                 }
 
                 free(rawfield);