Commit dc73e40e
Authored Jan 30, 2013 by Sebastian Rettenberger
Overwrite source files with skeleton files
Parent: c880d637
Showing 5 changed files with 11 additions and 757 deletions
src/Readme.md  +0 -0  (moved from src_skeleton/Readme.md)
src/SWE_WavePropagationBlockCuda.cu  +9 -113
src/SWE_WavePropagationBlockCuda_kernels.cu  +2 -241
src_skeleton/SWE_WavePropagationBlockCuda.cu  +0 -270
src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu  +0 -133
src_skeleton/Readme.md → src/Readme.md (file moved)
src/SWE_WavePropagationBlockCuda.cu
...
...
@@ -190,7 +190,6 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
updateUnknowns(maxTimestep);
t += maxTimestep;
// cout << "Simulation at time " << t << endl << flush;
} while(t < tEnd);
return t;
...
...
@@ -203,67 +202,15 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*
* TODO: A splitting or direct computation of the time step width might increase the total time step size.
* Example:
* dx = 11, dy = 6;
* max wave speed in x-direction: 10
* max wave speed in y-direction: 5.5
* max wave speed in both direction: 10
*
* => maximum time step (current implementation): min(11/10, 6/10) = 0.6
* => maximum time step (splitting the dimensions): min(11/10, 6/5.5) = 1.09..
*/
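As a quick check of the example above, a minimal host-side sketch (editorial, not part of the commit; the values are taken from the comment) comparing the two time-step computations:

#include <algorithm>
#include <cstdio>

int main() {
  float dx = 11.0f, dy = 6.0f;
  float maxSpeedX = 10.0f, maxSpeedY = 5.5f;        // per-direction maxima
  float maxSpeed  = std::max(maxSpeedX, maxSpeedY); // single global maximum
  // current implementation: one wave speed bounds both directions
  float dtCurrent = std::min(dx / maxSpeed, dy / maxSpeed);  // = 0.6
  // dimensional splitting: each direction uses its own maximum
  float dtSplit = std::min(dx / maxSpeedX, dy / maxSpeedY);  // = 1.0909...
  std::printf("current: %f  split: %f\n", dtCurrent, dtSplit);
  return 0;
}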
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
//! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
* This grid covers only edges 0..#(edges in x-direction)-2 and 0..#(edges in y-direction)-2.
*
* An example with a computational domain of size
* nx = 24, ny = 16
* with a 1 cell ghost layer would result in a grid with
* (nx+2)*(ny+2) = (26*18)
* cells and
* (nx+1)*(ny+1) = (25*17)
* edges.
*
* The CUDA-blocks (here 8*8) mentioned above would cover all edges except
* the ones lying between the computational domain and the right/top ghost layer:
* <pre>
*                                                  *
*                                                 **   top ghost layer,
*                                           ********   cell ids
*      *******************************     **          = (*, ny+1)
*      *         *         *         *
*      *   8*8   *   8*8   *   8*8   *
*      *  block  *  block  *  block  *
*      *         *         *         *
*      *******************************
*      *         *         *         *
*      *   8*8   *   8*8   *   8*8   *
*      *  block  *  block  *  block  *
*      *         *         *         *
*   ** *******************************
*  bottom ghost layer, cell ids = (*,0)
*
*      left ghost layer,           right ghost layer,
*      cell ids = (0,*)            cell ids = (nx+1, *)
* </pre>
/*
* TODO: This part needs to be implemented.
*/
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
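To make the grid layout above concrete, a small host-side sketch (not part of the commit) using the example's assumed values nx = 24, ny = 16 and TILE_SIZE = 8:

#include <cassert>
#include <cstdio>

int main() {
  const int TILE_SIZE = 8;
  const int nx = 24, ny = 16;
  assert(nx % TILE_SIZE == 0 && ny % TILE_SIZE == 0);
  int cells = (nx + 2) * (ny + 2); // 26*18 = 468 cells incl. ghost layers
  int edges = (nx + 1) * (ny + 1); // 25*17 = 425 edges per direction
  int gridX = nx / TILE_SIZE;      // 3 blocks in x-direction
  int gridY = ny / TILE_SIZE;      // 2 blocks in y-direction
  // the gridX*gridY 'main' launch leaves the edges next to the right/top
  // ghost layers to the two extra 'boundary' launches
  std::printf("cells=%d edges=%d grid=%dx%d\n", cells, edges, gridX, gridY);
  return 0;
}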
...
...
@@ -276,42 +223,10 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Compute the net updates for the 'main' part and the two 'boundary' parts.
*/
// compute the net-updates for the "main" part.
computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx,ny
);
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
nx, 0,
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
0, ny,
0, dimGrid.y);
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
...
...
@@ -344,28 +259,9 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
//! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
//! definition of the CUDA-grid.
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// compute the update width.
float l_updateWidthX = i_deltaT / dx;
float l_updateWidthY = i_deltaT / dy;
// update the unknowns (global time step)
updateUnknownsKernel<<<dimGrid,dimBlock>>>( hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
hd, hud, hvd,
l_updateWidthX, l_updateWidthY,
nx, ny);
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
...
...
src/SWE_WavePropagationBlockCuda_kernels.cu
...
...
@@ -81,189 +81,8 @@ void computeNetUpdatesKernel(
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
 * Initialization
 * TODO: This kernel needs to be implemented.
 */
//! array maximum wave speed within this CUDA-block
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
//! index (l_cellIndexI,l_cellIndexJ) of the cell lying on the right side of the edge/above the edge where the thread works on.
int l_cellIndexI, l_cellIndexJ;
// initialize the indices l_cellIndexI and l_cellIndexJ with the given offset
l_cellIndexI = i_offsetX;
l_cellIndexJ = i_offsetY;
//! array which holds the thread local net-updates.
float l_netUpdates[5];
//! location of the thread local cells in the global CUDA-arrays.
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: skip the left ghost layer (start at i = 1)
l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: skip the bottom ghost layer (start at j = 1)
/*
* Computation of horizontal net-updates
*/
if(i_offsetY == 0) { //horizontal edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hu[l_leftCellPosition], i_hu[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
// store the horizontal net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[1];
o_huNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[2]; o_huNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = l_netUpdates[4];
}
// synchronize the threads before the vertical edges (optimization)
__syncthreads();
/*
* Computation of vertical net-updates
*/
if(i_offsetX == 0) { //vertical edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hv[l_leftCellPosition], i_hv[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
// store the vertical net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[1];
o_hvNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[2]; o_hvNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax(l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_netUpdates[4]);
}
/*
* Compute the maximum observed wave speed
*/
// wait for all threads to finish
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread
int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
}
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
else {
printf("computeNetUpdatesKernel(...): There was an error in the reducing procedure!\n");
}
#endif
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
);
}
// wait for all threads to finish
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
* In the 'main' part (e.g. gridDim = (nx/TILE_SIZE, ny/TILE_SIZE)) the position
* is simply given by the blockId in x- and y-direction with a stride of gridDim.y + 1. The +1 results from the
* speeds in the 'boundary' case, see below.
*
* In the 'boundary' case, where the edges lie between the computational domain and the right/top ghost layer,
* this is more complicated. In this case block offsets in x- and y-direction are used. The offsets define how
* many blocks in the resp. direction have to be added to get a valid result.
* Computational domain - right ghost layer: In this case the dimension of the grid in x-direction is 1.
* Computational domain - top ghost layer: In this case the dimension of the grid in y-direction is 1.
*
* Same Example as in SWE_WavePropagationBlockCuda::computeNumericalFluxes(), assume the CUDA-grid/-blocks has
* the following layout:
* <pre>
*                                                   *
*                                                  **   top ghost layer,
*      * block 8 * block 9 * block 10*       ********   cell ids = (*, ny+1)
*      ********************************      **
*      *         *         *         *
*      *  block  *  block  *  block  *  b
*      *    4    *    5    *    6    *  7
*      *         *         *         *
*      ********************************
*      *         *         *         *
*      *  block  *  block  *  block  *  b
*      *    0    *    1    *    2    *  3
*      *         *         *         *
*   ** ********************************
*  bottom ghost layer, cell ids = (*,0)
*
*      left ghost layer                right ghost layer
* </pre>
* This results in a 'main' part consisting of (3*2) blocks and two 'boundary' parts consisting
* of (1*2) blocks and (3*1) blocks.
*
* The maximum wave speed array on the device therefore logically represents a (4*3)-1 2D-array (-1: no block at the top right).
* The 'main' part writes into cells 0, 1, 2, 4, 5 and 6.
* The 'computational domain - right ghost layer' part writes into 3 and 7 with an offset in x-direction of 3.
* The 'computational domain - top ghost layer' part writes into 8, 9 and 10 with an offset in y-direction of 2.
*
*/
int l_maxWaveSpeedDevicePosition = computeOneDPositionKernel( i_blockOffSetX + blockIdx.x,
i_blockOffSetY + blockIdx.y,
max(i_blockOffSetY+1, gridDim.y+1) );
// write the block local maximum wave speed to the device array
o_maximumWaveSpeeds[ l_maxWaveSpeedDevicePosition ] = l_maxWaveSpeedShared[0];
}
}
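To sanity-check the indexing formula above, a small host-side enumeration (a sketch under the example layout, not part of the commit); the slot numbers follow the code's linearization, while the cell ids in the comment above are illustrative:

#include <algorithm>
#include <cstdio>

// host copy of computeOneDPositionKernel
static int pos(int i, int j, int ny) { return i * ny + j; }

// print the device slot each block of one launch writes to; the stride
// mirrors the kernel's max(i_blockOffSetY+1, gridDim.y+1)
static void enumerateLaunch(const char* part, int gridX, int gridY,
                            int offX, int offY) {
  for (int by = 0; by < gridY; by++)
    for (int bx = 0; bx < gridX; bx++)
      std::printf("%s block(%d,%d) -> slot %d\n", part, bx, by,
                  pos(offX + bx, offY + by, std::max(offY + 1, gridY + 1)));
}

int main() {
  enumerateLaunch("main ", 3, 2, 0, 0);  // slots 0,1,3,4,6,7
  enumerateLaunch("right", 1, 2, 3, 0);  // slots 9,10
  enumerateLaunch("top  ", 3, 1, 0, 2);  // slots 2,5,8
  // together: all (4*3)-1 = 11 slots, each written exactly once
  return 0;
}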
/**
...
...
@@ -295,66 +114,8 @@ void updateUnknownsKernel(
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
 * Initialization
 * TODO: This kernel needs to be implemented.
 */
//! cell indices (l_cellIndexI,l_cellIndexJ) of the cell which the thread updates.
int l_cellIndexI, l_cellIndexJ;
//! location of the thread local cell in the global CUDA-arrays.
int l_cellPosition;
// compute the thread local cell indices (start at cell (1,1))
l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
// compute the global cell position
l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
if(l_cellPosition > (i_nX+2)*(i_nY+2))
printf("Warning: cellPosition(%i) > (i_nx+2)*(i_ny+2)\n", l_cellPosition);
#endif
#endif
//! positions of the net-updates in the global CUDA-arrays.
int l_netUpdatesLeftPosition, l_netUpdatesRightPosition, l_netUpdatesBelowPosition, l_netUpdatesAbovePosition;
/**
* Compute the positions of the net updates relative to a given cell
*
* <pre>
*                          netUpdateRight(i-1, j)
*
*                          |           |
*                          |           |
*                          |           |
*  netUpdateBelow(i,j) -----*************-----
*                          *           *
*                          * cell(i,j) *
*                          *           *
*                     -----*************----- netUpdateAbove(i, j-1)
*                          |           |
*                          |           |
*                          |           |
*                          netUpdatesLeft(i,j)
* </pre>
*/
l_netUpdatesRightPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
l_netUpdatesLeftPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
l_netUpdatesAbovePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
l_netUpdatesBelowPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
//update the cell values
io_h[l_cellPosition] -= i_updateWidthX * ( i_hNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_hNetUpdatesLeftD[ l_netUpdatesLeftPosition ] )
+ i_updateWidthY * ( i_hNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
io_hu[l_cellPosition] -= i_updateWidthX * ( i_huNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_huNetUpdatesLeftD[ l_netUpdatesLeftPosition ] );
io_hv[l_cellPosition] -= i_updateWidthY * ( i_hvNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hvNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
}
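For reference, the cell update implemented above corresponds to the usual wave-propagation form (a sketch in LeVeque-style fluctuation notation, not text from the source), with i_updateWidthX = \Delta t/\Delta x and i_updateWidthY = \Delta t/\Delta y:

h_{i,j} \leftarrow h_{i,j} - \frac{\Delta t}{\Delta x}\left(A^{+}\Delta h_{i-1/2,j} + A^{-}\Delta h_{i+1/2,j}\right) - \frac{\Delta t}{\Delta y}\left(B^{+}\Delta h_{i,j-1/2} + B^{-}\Delta h_{i,j+1/2}\right)
(hu)_{i,j} \leftarrow (hu)_{i,j} - \frac{\Delta t}{\Delta x}\left(A^{+}\Delta(hu)_{i-1/2,j} + A^{-}\Delta(hu)_{i+1/2,j}\right)
(hv)_{i,j} \leftarrow (hv)_{i,j} - \frac{\Delta t}{\Delta y}\left(B^{+}\Delta(hv)_{i,j-1/2} + B^{-}\Delta(hv)_{i,j+1/2}\right)

Here the right-going fluctuation A^{+}\Delta at edge i-1/2 is i_hNetUpdatesRightD at edge index (i-1,j), and the left-going A^{-}\Delta at edge i+1/2 is i_hNetUpdatesLeftD at edge index (i,j), matching the position computations above.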
/**
...
...
src_skeleton/SWE_WavePropagationBlockCuda.cu deleted (file mode 100644 → 0)
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#include "SWE_WavePropagationBlockCuda.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cassert>
#ifndef STATICLOGGER
#define STATICLOGGER
#include "tools/Logger.hpp"
static tools::Logger s_sweLogger;
#endif
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
#include <thrust/device_vector.h>
/**
* Constructor of a SWE_WavePropagationBlockCuda.
*
* Allocates the variables for the simulation:
* Please note: the definition of indices differs from the CPU implementation.
*
* unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
*
* A left/right net update with index (i-1,j) is located on the edge between
* cells with index (i-1,j) and (i,j):
* <pre>
*   *********************
*   *         *         *
*   * (i-1,j) *  (i,j)  *
*   *         *         *
*   *********************
*
*             *
*            ***
*           *****
*             *
*             *
*   NetUpdatesLeft(i-1,j)
*             or
*   NetUpdatesRight(i-1,j)
* </pre>
*
* A below/above net update with index (i, j-1) is located on the edge between
* cells with index (i, j-1) and (i,j):
* <pre>
*   ***********
*   *         *
*   * (i, j)  *    *
*   *         *   **  NetUpdatesBelow(i,j-1)
*   *********** *****      or
*   *         *   **  NetUpdatesAbove(i,j-1)
*   * (i,j-1) *    *
*   *         *
*   ***********
* </pre>
* @param i_offsetX spatial offset of the block in x-direction.
* @param i_offsetY spatial offset of the block in y-direction.
* @param i_cudaDevice ID of the CUDA-device, which should be used.
*/
SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
const float i_offsetY,
const int i_cudaDevice ): SWE_BlockCUDA(i_offsetX, i_offsetY, i_cudaDevice) {
// compute the size of one 1D net-update array.
int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the net-updates
cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesLeftD");
cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesRightD");
cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesLeftD");
cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesRightD");
cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesBelowD");
cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesAboveD");
}
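A rough sizing sketch (assumed example values, not from the source) for the eight device allocations above:

#include <cstdio>

int main() {
  int nx = 24, ny = 16;                                          // assumed domain size
  size_t perArray = (size_t)(nx + 1) * (ny + 1) * sizeof(float); // 425 floats = 1700 bytes
  std::printf("8 arrays * %zu bytes = %zu bytes\n", perArray, 8 * perArray); // 13600 bytes
  return 0;
}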
/**
* Destructor of a SWE_WavePropagationBlockCuda.
*
* Frees all of the memory, which was allocated within the constructor.
* Resets the CUDA device: useful if an error occurred and printf is used on the device (buffer).
*/
SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
// free the net-updates memory
cudaFree(hNetUpdatesLeftD);
cudaFree(hNetUpdatesRightD);
cudaFree(huNetUpdatesLeftD);
cudaFree(huNetUpdatesRightD);
cudaFree(hNetUpdatesBelowD);
cudaFree(hNetUpdatesAboveD);
cudaFree(hvNetUpdatesBelowD);
cudaFree(hvNetUpdatesAboveD);
// reset the cuda device
s_sweLogger.printString("Resetting the CUDA devices");
cudaDeviceReset();
}
/**
* Compute a single global time step of a given time step width.
* Remark: The user has to take care of the time step width; no additional check is done. The time step width is typically available
* after the computation of the numerical fluxes (hidden in this method).
*
* First the net-updates are computed.
* Then the cells are updated with the net-updates and the given time step width.
*
* @param i_dT time step width in seconds.
*/
__host__
void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(i_dT);
}
/**
* Perform forward-Euler time steps, starting at simulation time tStart,
* until simulation time tEnd is reached;
* device-global variables hd, hud, hvd are updated;
* unknowns h, hu, hv in main memory are not updated.
* Ghost layers and bathymetry sources are updated between time steps.
* Intended as the main simulation loop between two checkpoints.
*/
__host__
float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
float t = tStart;
do {
// set values in ghost cells:
setGhostLayer();
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(maxTimestep);
t += maxTimestep;
} while(t < tEnd);
return t;
}
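Since simulate() is meant as the main loop between two checkpoints, a hypothetical driver (the checkpoint interval and the output handling are assumptions, not part of this file) could look like:

#include <algorithm>

void runSimulation(SWE_WavePropagationBlockCuda& block, float tEnd, float checkpointInterval) {
  float t = 0.0f;
  while (t < tEnd) {
    // advance to the next checkpoint; simulate() sets the ghost layers itself
    t = block.simulate(t, std::min(t + checkpointInterval, tEnd));
    // a real driver would copy h, hu, hv back to the host and write output here
  }
}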
/**
* Compute the numerical fluxes (net-update formulation here) on all edges.
*
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
/*
* TODO: This part needs to be implemented.
*/
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
// size of the maximum wave speed array (dimension of the grid + ghost layers, without the top right block), sizeof(float) not included
int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
/*
* Compute the net updates for the 'main' part and the two 'boundary' parts.
*/
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
*
* The Thrust library is used in this step.
* An optional kernel could be written for the maximum reduction.
*/
// Thrust pointer to the device array
thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
// use Thrusts max_element-function for the maximum reduction
thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
// get the result from the device
float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
// free the max wave speeds array on the device
cudaFree(l_maximumWaveSpeedsD);
// set the maximum time step for this SWE_WavePropagationBlockCuda
maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
// scale by the CFL number (0.4)
maxTimestep *= (float)0.4;
}
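Written out (a restatement of the two lines above, not additional source text):

\Delta t = 0.4 \cdot \min\!\left(\frac{\Delta x}{s_{\max}},\, \frac{\Delta y}{s_{\max}}\right) = 0.4\,\frac{\min(\Delta x, \Delta y)}{s_{\max}}

where s_{\max} is the Thrust-reduced maximum wave speed.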
/**
* Update the cells with a given global time step.
*
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
synchCopyLayerBeforeRead();
#endif
}
src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu deleted (file mode 100644 → 0)
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cmath>
#include <cstdio>
#include "solvers/FWaveCuda.h"
/**
* The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
*
* Remark: Overall there are nx+1 edges in x-direction and ny+1 edges in y-direction.
* Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
* will not be computed in a typical call of the function:
* computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
* hNetUpdatesLeftD, hNetUpdatesRightD,
* huNetUpdatesLeftD, huNetUpdatesRightD,
* hNetUpdatesBelowD, hNetUpdatesAboveD,
* hvNetUpdatesBelowD, hvNetUpdatesAboveD,
* l_maximumWaveSpeedsD,
* i_nx, i_ny
* );
* To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
*
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
* @param i_hv momentums in y-direction (CUDA-array).
* @param i_b bathymetry values (CUDA-array).
* @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
* @param i_offsetX cell/edge offset in x-direction.
* @param i_offsetY cell/edge offset in y-direction.
*/
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nX, const int i_nY,
const int i_offsetX, const int i_offsetY,
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
*
* @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param io_h water heights (CUDA-array).
* @param io_hu momentums in x-direction (CUDA-array).
* @param io_hv momentums in y-direction (CUDA-array).
* @param i_updateWidthX update width in x-direction.
* @param i_updateWidthY update width in y-direction.
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
*/
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* Compute the position of 2D coordinates in a 1D array.
* array[i][j] -> i * ny + j
*
* @param i_i row index.
* @param i_j column index.
* @param i_ny #(cells in y-direction).
* @return 1D index.
*/
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
return i_i*i_ny + i_j;
}
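One detail worth keeping in mind when reading the kernels above: this helper is called with two different strides, because the cell arrays and the net-update arrays have different row lengths. A small host-side sketch (the domain size is an assumption):

#include <cstdio>

static int pos(int i, int j, int ny) { return i * ny + j; } // host copy of the helper

int main() {
  const int ny = 16;                                    // assumed; nx = 24
  // cell (1,1) in hd, which stores (nx+2)*(ny+2) values:
  std::printf("cell (1,1) -> %d\n", pos(1, 1, ny + 2)); // stride ny+2 -> 19
  // edge (1,1) in a net-update array of (nx+1)*(ny+1) values:
  std::printf("edge (1,1) -> %d\n", pos(1, 1, ny + 1)); // stride ny+1 -> 18
  return 0;
}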