Optimize memory access in max reduction

parent 04f8ea3b
...@@ -176,22 +176,21 @@ void computeNetUpdatesKernel( ...@@ -176,22 +176,21 @@ void computeNetUpdatesKernel(
__syncthreads(); __syncthreads();
// initialize reduction block size with the original block size // initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.y; int reductionBlockDimY = blockDim.y;
int reductionBlockDimY = blockDim.x; int reductionBlockDimX = blockDim.x;
// do the reduction // do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done. while(reductionBlockDimY != 1 || reductionBlockDimX != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread //! reduction partner for a thread
int reductionPartner = 0; int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1) // split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) { if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise reductionBlockDimX >>= 1; //reduce row wise (divide by 2)
reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimX, threadIdx.x, blockDim.x); reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x + reductionBlockDimX, blockDim.x);
} } else if(reductionBlockDimY != 1) {
else if(reductionBlockDimY != 1) { reductionBlockDimY >>= 1; //reduce column wise (divide by 2)
reductionBlockDimY /= 2; //reduce row wise reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimY, threadIdx.x, blockDim.x);
reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x+reductionBlockDimY, blockDim.x);
} }
#ifndef NDEBUG #ifndef NDEBUG
#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200) #if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
...@@ -202,7 +201,7 @@ void computeNetUpdatesKernel( ...@@ -202,7 +201,7 @@ void computeNetUpdatesKernel(
} }
#endif #endif
#endif #endif
if(threadIdx.y < reductionBlockDimX && threadIdx.x < reductionBlockDimY) { // use only half the threads in each reduction if(threadIdx.y < reductionBlockDimY && threadIdx.x < reductionBlockDimX) { // use only half the threads in each reduction
//execute the reduction routine (maximum) //execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner] l_maxWaveSpeedShared[reductionPartner]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment