Use coalesced memory access in the first kernel

222121c4 · Sebastian Rettenberger · c880d637 · 222121c4 · 222121c4
Commit 222121c4 authored Oct 05, 2012 by Sebastian Rettenberger
Show whitespace changes
Inline Side-by-side

Showing with 26 additions and 12 deletions

SWE_WavePropagationBlockCuda.cu src/SWE_WavePropagationBlockCuda.cu +15 -3

SWE_WavePropagationBlockCuda_kernels.cu src/SWE_WavePropagationBlockCuda_kernels.cu +11 -9

No files found.
--- a/src/SWE_WavePropagationBlockCuda.cu
+++ b/src/SWE_WavePropagationBlockCuda.cu
@@ -3,6 +3,7 @@
 * This file is part of SWE.
 *
 * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
 *
 * @section LICENSE
 *
@@ -218,8 +219,19 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
  /*
   * Initialization.
   */
+
+  /**
+   * <b>Row-major vs column-major</b>
+   *
+   * C/C++ arrays are row-major whereas warps are constructed in column-major order from threads/blocks. To get coalesced
+   * memory access in CUDA, we can use a 2-dimensional CUDA structure but we have to switch x and y inside a block.
+   *
+   * This means, we have to switch threadIdx.x <-> threadIdx.y as well as blockDim.x <-> blockDim.y.
+   * Important: blockDim has to be switched for the kernel call as well!
+   */
+
  //! definition of one CUDA-block. Typical size are 8*8 or 16*16
-  dim3 dimBlock(TILE_SIZE,TILE_SIZE);
+  dim3 dimBlock(TILE_SIZE, TILE_SIZE);

  /**
   * Definition of the "main" CUDA-grid.
@@ -288,7 +300,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {

  // compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
  // edges between cell nx and ghost layer nx+1
-  dim3 dimRightBlock(1, TILE_SIZE);
+  dim3 dimRightBlock(TILE_SIZE, 1);
  dim3 dimRightGrid(1, ny/TILE_SIZE);
  computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
                                                            hNetUpdatesLeftD,  hNetUpdatesRightD,
@@ -301,7 +313,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
                                                            dimGrid.x, 0);

  // edges between cell ny and ghost layer ny+1
-  dim3 dimTopBlock(TILE_SIZE, 1);
+  dim3 dimTopBlock(1, TILE_SIZE);
  dim3 dimTopGrid(nx/TILE_SIZE, 1);
  computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
                                                        hNetUpdatesLeftD,  hNetUpdatesRightD,

--- a/src/SWE_WavePropagationBlockCuda_kernels.cu
+++ b/src/SWE_WavePropagationBlockCuda_kernels.cu
@@ -3,6 +3,7 @@
 * This file is part of SWE.
 *
 * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
 *
 * @section LICENSE
 *
@@ -49,6 +50,7 @@
 *                                                        );
 *         To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
 *
+ * {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
 *
 * @param i_h water heights (CUDA-array).
 * @param i_hu momentums in x-direction (CUDA-array).
@@ -87,7 +89,7 @@ void computeNetUpdatesKernel(
  __shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];

  //! thread local index in the shared maximum wave speed array
-  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
+  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.y, threadIdx.x, blockDim.x);

  // initialize shared maximum wave speed with zero
  l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
@@ -106,8 +108,8 @@ void computeNetUpdatesKernel(
  int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;

  // compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
-  l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: start at cell with index (1,0)
-  l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: start at cell with index (1,1)
+  l_cellIndexI += blockDim.y * blockIdx.x + threadIdx.y + 1; //+1: start at cell with index (1,0)
+  l_cellIndexJ += blockDim.x * blockIdx.y + threadIdx.x + 1; //+1: start at cell with index (1,1)

  /*
   * Computation of horizontal net-updates
@@ -174,8 +176,8 @@ void computeNetUpdatesKernel(
  __syncthreads();

  // initialize reduction block size with the original block size
-  int reductionBlockDimX = blockDim.x;
-  int reductionBlockDimY = blockDim.y;
+  int reductionBlockDimX = blockDim.y;
+  int reductionBlockDimY = blockDim.x;

  // do the reduction
  while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
@@ -185,11 +187,11 @@ void computeNetUpdatesKernel(
    // split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
    if(reductionBlockDimX != 1) {
      reductionBlockDimX /= 2; //reduce column wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
+      reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimX, threadIdx.x, blockDim.x);
    }
    else if(reductionBlockDimY != 1) {
      reductionBlockDimY /= 2; //reduce row wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
+      reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x+reductionBlockDimY, blockDim.x);
    }
 #ifndef NDEBUG
 #if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
@@ -200,7 +202,7 @@ void computeNetUpdatesKernel(
    }
 #endif
 #endif
-    if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
+    if(threadIdx.y < reductionBlockDimX && threadIdx.x < reductionBlockDimY) { // use only half the threads in each reduction
      //execute the reduction routine (maximum)
      l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
                                                           l_maxWaveSpeedShared[reductionPartner]
@@ -210,7 +212,7 @@ void computeNetUpdatesKernel(
    __syncthreads();
  }

-  if(threadIdx.x == 0 && threadIdx.y == 0) {
+  if(threadIdx.y == 0 && threadIdx.x == 0) {
    /**
     * Position of the maximum wave speed in the global device array.
     *