Merge pull request #1 from rettenbs/master

Use rlink to get rid of LD_LIBRARY_PATH / use coalesced memory access

Merge pull request #1 from rettenbs/master
Use rlink to get rid of LD_LIBRARY_PATH / use coalesced memory access
2395d611 · Chair of Scientific Computing · c880d637 · f6789853 · 2395d611 · 2395d611
Commit 2395d611 authored Oct 08, 2012 by Chair of Scientific Computing
Showing with 37 additions and 18 deletions

SConstruct SConstruct +4 -0

SWE_WavePropagationBlockCuda.cu src/SWE_WavePropagationBlockCuda.cu +15 -3

SWE_WavePropagationBlockCuda_kernels.cu src/SWE_WavePropagationBlockCuda_kernels.cu +18 -15

No files found.
--- a/SConstruct
+++ b/SConstruct
@@ -43,6 +43,8 @@ print 'under certain conditions.'
 print 'Details can be found in the file \'gpl.txt\'.'
 print ''

+import os
+
 #
 # set possible variables
 #
@@ -178,8 +180,10 @@ if env['parallelization'] in ['cuda', 'mpi_with_cuda']:
  # set the directories for the CudaTool
  if 'cudaToolkitDir' in env:
    env['CUDA_TOOLKIT_PATH'] = env['cudaToolkitDir']
+    env.Append(RPATH=[os.path.join(env['cudaToolkitDir'], 'lib64')])
  if 'cudaSDKDir' in env:
    env['CUDA_SDK_PATH'] = env['cudaSDKDir']
+    env.Append(RPATH=[os.path.join(env['cudaSDKDir'], 'lib64')])

  env.Tool('CudaTool', toolpath = ['.'])
  

--- a/src/SWE_WavePropagationBlockCuda.cu
+++ b/src/SWE_WavePropagationBlockCuda.cu
@@ -3,6 +3,7 @@
 * This file is part of SWE.
 *
 * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
 *
 * @section LICENSE
 *
@@ -218,8 +219,19 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
  /*
   * Initialization.
   */
+
+  /**
+   * <b>Row-major vs column-major</b>
+   *
+   * C/C++ arrays are row-major whereas warps are constructed in column-major order from threads/blocks. To get coalesced
+   * memory access in CUDA, we can use a 2-dimensional CUDA structure but we have to switch x and y inside a block.
+   *
+   * This means, we have to switch threadIdx.x <-> threadIdx.y as well as blockDim.x <-> blockDim.y.
+   * Important: blockDim has to be switched for the kernel call as well!
+   */
+
  //! definition of one CUDA-block. Typical size are 8*8 or 16*16
-  dim3 dimBlock(TILE_SIZE,TILE_SIZE);
+  dim3 dimBlock(TILE_SIZE, TILE_SIZE);

  /**
   * Definition of the "main" CUDA-grid.
@@ -288,7 +300,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {

  // compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
  // edges between cell nx and ghost layer nx+1
-  dim3 dimRightBlock(1, TILE_SIZE);
+  dim3 dimRightBlock(TILE_SIZE, 1);
  dim3 dimRightGrid(1, ny/TILE_SIZE);
  computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
                                                            hNetUpdatesLeftD,  hNetUpdatesRightD,
@@ -301,7 +313,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
                                                            dimGrid.x, 0);

  // edges between cell ny and ghost layer ny+1
-  dim3 dimTopBlock(TILE_SIZE, 1);
+  dim3 dimTopBlock(1, TILE_SIZE);
  dim3 dimTopGrid(nx/TILE_SIZE, 1);
  computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
                                                        hNetUpdatesLeftD,  hNetUpdatesRightD,

--- a/src/SWE_WavePropagationBlockCuda_kernels.cu
+++ b/src/SWE_WavePropagationBlockCuda_kernels.cu
@@ -3,6 +3,7 @@
 * This file is part of SWE.
 *
 * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
 *
 * @section LICENSE
 *
@@ -49,6 +50,7 @@
 *                                                        );
 *         To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
 *
+ * {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
 *
 * @param i_h water heights (CUDA-array).
 * @param i_hu momentums in x-direction (CUDA-array).
@@ -87,7 +89,7 @@ void computeNetUpdatesKernel(
  __shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];

  //! thread local index in the shared maximum wave speed array
-  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
+  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.y, threadIdx.x, blockDim.x);

  // initialize shared maximum wave speed with zero
  l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
@@ -106,8 +108,8 @@ void computeNetUpdatesKernel(
  int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;

  // compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
-  l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: start at cell with index (1,0)
-  l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: start at cell with index (1,1)
+  l_cellIndexI += blockDim.y * blockIdx.x + threadIdx.y + 1; //+1: start at cell with index (1,0)
+  l_cellIndexJ += blockDim.x * blockIdx.y + threadIdx.x + 1; //+1: start at cell with index (1,1)

  /*
   * Computation of horizontal net-updates
@@ -174,22 +176,21 @@ void computeNetUpdatesKernel(
  __syncthreads();

  // initialize reduction block size with the original block size
-  int reductionBlockDimX = blockDim.x;
  int reductionBlockDimY = blockDim.y;
+  int reductionBlockDimX = blockDim.x;

  // do the reduction
-  while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
+  while(reductionBlockDimY != 1 || reductionBlockDimX != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
    //! reduction partner for a thread
    int reductionPartner = 0;

    // split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
    if(reductionBlockDimX != 1) {
-      reductionBlockDimX /= 2; //reduce column wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
-    }
-    else if(reductionBlockDimY != 1) {
-      reductionBlockDimY /= 2; //reduce row wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
+      reductionBlockDimX >>= 1; //reduce row wise (divide by 2)
+      reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x + reductionBlockDimX, blockDim.x);
+    } else if(reductionBlockDimY != 1) {
+      reductionBlockDimY >>= 1; //reduce column wise (divide by 2)
+      reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimY, threadIdx.x, blockDim.x);
    }
 #ifndef NDEBUG
 #if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
@@ -200,7 +201,7 @@ void computeNetUpdatesKernel(
    }
 #endif
 #endif
-    if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
+    if(threadIdx.y < reductionBlockDimY && threadIdx.x < reductionBlockDimX) { // use only half the threads in each reduction
      //execute the reduction routine (maximum)
      l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
                                                           l_maxWaveSpeedShared[reductionPartner]
@@ -210,7 +211,7 @@ void computeNetUpdatesKernel(
    __syncthreads();
  }

-  if(threadIdx.x == 0 && threadIdx.y == 0) {
+  if(threadIdx.y == 0 && threadIdx.x == 0) {
    /**
     * Position of the maximum wave speed in the global device array.
     *
@@ -269,6 +270,8 @@ void computeNetUpdatesKernel(
 /**
 * The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
 *
+ * {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
+ *
 * @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
 * @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
 * @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
@@ -304,8 +307,8 @@ void updateUnknownsKernel(
  int l_cellPosition;

  // compute the thread local cell indices (start at cell (1,1))
-  l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
-  l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
+  l_cellIndexI = blockDim.y * blockIdx.x + threadIdx.y + 1;
+  l_cellIndexJ = blockDim.x * blockIdx.y + threadIdx.x + 1;

  // compute the global cell position
  l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);