Use coalesced memory access in the first kernel

parent c880d637
......@@ -3,6 +3,7 @@
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
* @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
*
* @section LICENSE
*
......@@ -218,8 +219,19 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
/**
* <b>Row-major vs column-major</b>
*
* C/C++ arrays are row-major whereas warps are constructed in column-major order from threads/blocks. To get coalesced
* memory access in CUDA, we can use a 2-dimensional CUDA structure but we have to switch x and y inside a block.
*
* This means, we have to switch threadIdx.x <-> threadIdx.y as well as blockDim.x <-> blockDim.y.
* Important: blockDim has to be switched for the kernel call as well!
*/
//! definition of one CUDA-block. Typical size are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimBlock(TILE_SIZE, TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
......@@ -288,7 +300,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightBlock(TILE_SIZE, 1);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
......@@ -301,7 +313,7 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopBlock(1, TILE_SIZE);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
......
......@@ -3,6 +3,7 @@
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
* @author Sebastian Rettenberger (rettenbs AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.)
*
* @section LICENSE
*
......@@ -49,6 +50,7 @@
* );
* To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
*
* {@link SWE_WavePropagationBlockCuda::computeNumericalFluxes()} explains the coalesced memory access.
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
......@@ -87,7 +89,7 @@ void computeNetUpdatesKernel(
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.y, threadIdx.x, blockDim.x);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
......@@ -106,8 +108,8 @@ void computeNetUpdatesKernel(
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: start at cell with index (1,0)
l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: start at cell with index (1,1)
l_cellIndexI += blockDim.y * blockIdx.x + threadIdx.y + 1; //+1: start at cell with index (1,0)
l_cellIndexJ += blockDim.x * blockIdx.y + threadIdx.x + 1; //+1: start at cell with index (1,1)
/*
* Computation of horizontal net-updates
......@@ -174,8 +176,8 @@ void computeNetUpdatesKernel(
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
int reductionBlockDimX = blockDim.y;
int reductionBlockDimY = blockDim.x;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
......@@ -185,11 +187,11 @@ void computeNetUpdatesKernel(
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
reductionPartner = computeOneDPositionKernel(threadIdx.y + reductionBlockDimX, threadIdx.x, blockDim.x);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
reductionPartner = computeOneDPositionKernel(threadIdx.y, threadIdx.x+reductionBlockDimY, blockDim.x);
}
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
......@@ -200,7 +202,7 @@ void computeNetUpdatesKernel(
}
#endif
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
if(threadIdx.y < reductionBlockDimX && threadIdx.x < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
......@@ -210,7 +212,7 @@ void computeNetUpdatesKernel(
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
if(threadIdx.y == 0 && threadIdx.x == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment