Commit 2395d611 authored by Chair of Scientific Computing

Merge pull request #1 from rettenbs/master

Use rlink to get rid of LD_LIBRARY_PATH / use coalesced memory access
parents c880d637 f6789853
@@ -43,6 +43,8 @@ print 'under certain conditions.'
 print 'Details can be found in the file \'gpl.txt\'.'
 print ''

+import os
+
 #
 # set possible variables
 #
@@ -178,8 +180,10 @@ if env['parallelization'] in ['cuda', 'mpi_with_cuda']:
   # set the directories for the CudaTool
   if 'cudaToolkitDir' in env:
     env['CUDA_TOOLKIT_PATH'] = env['cudaToolkitDir']
+    env.Append(RPATH=[os.path.join(env['cudaToolkitDir'], 'lib64')])

   if 'cudaSDKDir' in env:
     env['CUDA_SDK_PATH'] = env['cudaSDKDir']
+    env.Append(RPATH=[os.path.join(env['cudaSDKDir'], 'lib64')])

   env.Tool('CudaTool', toolpath = ['.'])
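These RPATH entries are what lets the commit drop LD_LIBRARY_PATH: SCons hands each directory to the linker (with GCC-style toolchains as -Wl,-rpath,<dir>), so the binary itself records where the CUDA runtime libraries such as libcudart live, and the dynamic loader finds them without any environment setup. This is also why the hunk above adds import os: the paths are assembled with os.path.join.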
...
@@ -3,6 +3,7 @@
  * This file is part of SWE.
  *
  * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
  *
  * @section LICENSE
  *
@@ -218,8 +219,19 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
   /*
    * Initialization.
    */
+
+  /**
+   * <b>Row-major vs. column-major</b>
+   *
+   * C/C++ arrays are row-major, whereas warps are constructed in column-major order from threads/blocks. To get coalesced
+   * memory access in CUDA, we can use a 2-dimensional CUDA structure, but we have to switch x and y inside a block.
+   *
+   * This means we have to swap threadIdx.x <-> threadIdx.y as well as blockDim.x <-> blockDim.y.
+   * Important: blockDim has to be switched for the kernel call as well!
+   */
+
   //! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
-  dim3 dimBlock(TILE_SIZE,TILE_SIZE);
+  dim3 dimBlock(TILE_SIZE, TILE_SIZE);

   /**
    * Definition of the "main" CUDA-grid.
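To see concretely what the new comment block means, here is a hypothetical toy kernel (not part of this commit) using the same swapped mapping: threadIdx.x varies fastest within a warp, so after the swap it indexes j, the fastest-varying dimension of the row-major arrays, and consecutive threads access consecutive addresses.

// Hypothetical toy kernel, for illustration only: copies one inner cell per
// thread out of a row-major array whose rows hold i_nY+2 elements.
__global__ void copyCellsKernel(const float* i_data, float* o_data, int i_nY) {
  // swapped mapping, as in this commit: the y-components of the thread/block
  // geometry drive the row index i, the x-components the column index j
  int i = blockDim.y * blockIdx.x + threadIdx.y + 1;
  int j = blockDim.x * blockIdx.y + threadIdx.x + 1;

  // row-major: threads with consecutive threadIdx.x get consecutive positions,
  // so each warp touches one contiguous segment of memory (coalesced)
  int pos = i * (i_nY + 2) + j;
  o_data[pos] = i_data[pos];
}

For the square TILE_SIZE*TILE_SIZE block the launch configuration is unaffected, but for the non-square boundary blocks it is not; that is why dimRightBlock and dimTopBlock swap their arguments in the hunks below.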
@@ -288,7 +300,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
   // compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")

   // edges between cell nx and ghost layer nx+1
-  dim3 dimRightBlock(1, TILE_SIZE);
+  dim3 dimRightBlock(TILE_SIZE, 1);
   dim3 dimRightGrid(1, ny/TILE_SIZE);
   computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
                                                             hNetUpdatesLeftD, hNetUpdatesRightD,
@@ -301,7 +313,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
                                                             dimGrid.x, 0);

   // edges between cell ny and ghost layer ny+1
-  dim3 dimTopBlock(TILE_SIZE, 1);
+  dim3 dimTopBlock(1, TILE_SIZE);
   dim3 dimTopGrid(nx/TILE_SIZE, 1);
   computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
                                                         hNetUpdatesLeftD, hNetUpdatesRightD,
...
@@ -3,6 +3,7 @@
  * This file is part of SWE.
  *
  * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ * @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
  *
  * @section LICENSE
  *
@@ -49,6 +50,7 @@
  * );
  * To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
  *
+ * {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
  *
  * @param i_h water heights (CUDA-array).
  * @param i_hu momentums in x-direction (CUDA-array).
@@ -87,7 +89,7 @@ void computeNetUpdatesKernel(
   __shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];

   //! thread local index in the shared maximum wave speed array
-  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
+  int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.y, threadIdx.x, blockDim.x);

   // initialize shared maximum wave speed with zero
   l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
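The helper computeOneDPositionKernel itself is not shown in this diff. A definition consistent with all call sites (an assumption; check the actual source) maps a 2-D index into a row-major 1-D array:

// Sketch (assumed, not part of this diff): 1-D offset of element (i_row, i_col)
// in a row-major array whose rows hold i_stride elements each.
__device__ inline int computeOneDPositionKernel(int i_row, int i_col, int i_stride) {
  return i_row * i_stride + i_col;
}

Under this definition the new call computeOneDPositionKernel(threadIdx.y, threadIdx.x, blockDim.x) assigns threads with consecutive threadIdx.x to consecutive shared-memory slots.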
@@ -106,8 +108,8 @@ void computeNetUpdatesKernel(
   int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;

   // compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
-  l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: start at cell with index (1,0)
-  l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: start at cell with index (1,1)
+  l_cellIndexI += blockDim.y * blockIdx.x + threadIdx.y + 1; //+1: start at cell with index (1,0)
+  l_cellIndexJ += blockDim.x * blockIdx.y + threadIdx.x + 1; //+1: start at cell with index (1,1)

   /*
    * Computation of horizontal net-updates
@@ -174,22 +176,21 @@ void computeNetUpdatesKernel(
   __syncthreads();

   // initialize reduction block size with the original block size
-  int reductionBlockDimX = blockDim.x;
   int reductionBlockDimY = blockDim.y;
+  int reductionBlockDimX = blockDim.x;

   // do the reduction
-  while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
+  while(reductionBlockDimY != 1 || reductionBlockDimX != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
     //! reduction partner for a thread
     int reductionPartner = 0;

     // split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
     if(reductionBlockDimX != 1) {
-      reductionBlockDimX /= 2; //reduce column wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
-    }
-    else if(reductionBlockDimY != 1) {
-      reductionBlockDimY /= 2; //reduce row wise
-      reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
+      reductionBlockDimX >>= 1; //reduce row wise (divide by 2)
+      reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x + reductionBlockDimX, blockDim.x);
+    } else if(reductionBlockDimY != 1) {
+      reductionBlockDimY >>= 1; //reduce column wise (divide by 2)
+      reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimY, threadIdx.x, blockDim.x);
     }

 #ifndef NDEBUG
 #if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
@@ -200,7 +201,7 @@ void computeNetUpdatesKernel(
     }
 #endif
 #endif
-    if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
+    if(threadIdx.y < reductionBlockDimY && threadIdx.x < reductionBlockDimX) { // use only half the threads in each reduction
       //execute the reduction routine (maximum)
       l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
                                                            l_maxWaveSpeedShared[reductionPartner]
@@ -210,7 +211,7 @@ void computeNetUpdatesKernel(
     __syncthreads();
   }

-  if(threadIdx.x == 0 && threadIdx.y == 0) {
+  if(threadIdx.y == 0 && threadIdx.x == 0) {
    /**
     * Position of the maximum wave speed in the global device array.
     *
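The loop changed above is the standard shared-memory tree reduction, specialized to 2-D blocks (halve the x-extent first, then the y-extent). For reference, the same idea in its minimal 1-D form, a sketch independent of this code that assumes a power-of-two blockDim.x of at most 256:

// Minimal 1-D shared-memory maximum reduction (sketch, not part of this diff).
__global__ void maxReduceKernel(const float* i_values, float* o_blockMax) {
  __shared__ float s_max[256]; // assumes blockDim.x <= 256

  int tid = threadIdx.x;
  s_max[tid] = i_values[blockDim.x * blockIdx.x + tid];
  __syncthreads();

  // halve the number of active threads each step; every surviving thread keeps
  // the maximum of its own value and its partner's value in the dropped half
  for (int active = blockDim.x / 2; active > 0; active /= 2) {
    if (tid < active)
      s_max[tid] = fmax(s_max[tid], s_max[tid + active]);
    __syncthreads();
  }

  // thread 0 now holds the block-wide maximum
  if (tid == 0)
    o_blockMax[blockIdx.x] = s_max[0];
}

Note that __syncthreads() sits outside the conditional, exactly as in the kernel above, so all threads of the block reach the barrier together.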
@@ -269,6 +270,8 @@ void computeNetUpdatesKernel(
 /**
  * The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
  *
+ * {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
+ *
  * @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
  * @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
  * @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
@@ -304,8 +307,8 @@ void updateUnknownsKernel(
   int l_cellPosition;

   // compute the thread local cell indices (start at cell (1,1))
-  l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
-  l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
+  l_cellIndexI = blockDim.y * blockIdx.x + threadIdx.y + 1;
+  l_cellIndexJ = blockDim.x * blockIdx.y + threadIdx.x + 1;

   // compute the global cell position
   l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
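A quick arithmetic check of why the new mapping coalesces, assuming the row-major index helper sketched earlier: two neighboring threads of a warp share threadIdx.y and differ by 1 in threadIdx.x, so they compute the same l_cellIndexI and l_cellIndexJ values differing by 1; with l_cellPosition = l_cellIndexI*(i_nY+2) + l_cellIndexJ their positions differ by 1 as well, and the warp reads and writes one contiguous block of memory. Under the old mapping neighboring threads differed in l_cellIndexI instead, producing a stride of i_nY+2 elements between them and far more memory transactions per warp.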
...