Commit 92ed477e authored by breuera

Added the CUDA-files. Changed a few routines accordingly.

parent f935a3b2
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include <math.h>
#include "tools/help.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_BlockCUDA_kernels.hh"
//const int TILE_SIZE=16;
//const int TILE_SIZE=8;
// #include "SWE_BlockCUDA_kernels.cu"
/**
 * helper function to check the latest CUDA error code
 * and abort if an error occurred
 */
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "\nCuda error (%s): %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
/**
 * helper function to check a given CUDA error code
 * and abort if it indicates an error
 */
void tryCUDA(cudaError_t err, const char *msg)
{
if( cudaSuccess != err)
{
fprintf(stderr, "\nCuda error (%s): %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
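// usage sketch for the two helpers above (illustrative only):
//   cudaMalloc((void**)&hd, size);
//   checkCUDAError("allocate device memory for h"); // checks cudaGetLastError()
// or, equivalently, checking the returned error code directly:
//   tryCUDA( cudaMalloc((void**)&hd, size), "allocate device memory for h" );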
/**
* Constructor: allocate variables for simulation
*
* unknowns h,hu,hv,b are defined on grid indices [0,..,nx+1]*[0,..,ny+1]
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* flux terms are defined for edges with indices [0,..,nx]*[1,..,ny]
* or [1,..,nx]*[0,..,ny] (for horizontal/vertical edges)
* Flux term with index (i,j) is located on the edge between
* cells with index (i,j) and (i+1,j) or (i,j+1)
*
* bathymetry source terms are defined for cells with indices [1,..,nx]*[1,..,ny]
*/
SWE_BlockCUDA::SWE_BlockCUDA(float _offsetX, float _offsetY)
: SWE_Block(_offsetX,_offsetY)
{
if (nx % TILE_SIZE != 0) {
cout << "WARNING: nx not a multiple of TILE_SIZE -> will lead to crashes!"
<< endl << flush;
};
if (ny % TILE_SIZE != 0) {
cout << "WARNING: ny not a multiple of TILE_SIZE -> will lead to crashes!"
<< endl << flush;
};
int size = (nx+2)*(ny+2)*sizeof(float);
// allocate CUDA memory for unknowns h,hu,hv and bathymetry b
cudaMalloc((void**)&hd, size);
checkCUDAError("allocate device memory for h");
cudaMalloc((void**)&hud, size);
checkCUDAError("allocate device memory for hu");
cudaMalloc((void**)&hvd, size);
checkCUDAError("allocate device memory for hv");
cudaMalloc((void**)&bd, size);
checkCUDAError("allocate device memory for bd");
//----> COULD BE IMPLEMENTED AS (INEFFICIENT) REFERENCE IMPLEMENTATION
// // allocate CUDA unknowns: maximum height and velocity
// size = (nx/TILE_SIZE)*(ny/TILE_SIZE)*sizeof(float);
// cudaMalloc((void**)&maxhd, size);
// checkCUDAError("allocate device memory for maxhd");
// cudaMalloc((void**)&maxvd, size);
// checkCUDAError("allocate device memory for maxvd");
// allocate consecutive memory for 2 rows with three unknowns each
// (h, hu, hv, excluding b) for copy/ghost layer at bottom/top boundary
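// memory layout of bottomLayer/topLayer (each segment holds nx+2 floats):
// [ h(ghost) | hu(ghost) | hv(ghost) | h(copy) | hu(copy) | hv(copy) ]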
size = nx+2;
bottomLayer = new float[6*size];
bottomGhostLayer = new SWE_Block1D( bottomLayer, bottomLayer+size, bottomLayer+(2*size), size );
bottomCopyLayer = new SWE_Block1D( bottomLayer+(3*size), bottomLayer+(4*size), bottomLayer+(5*size), size );
// same for top boundary:
topLayer = new float[6*size];
topGhostLayer = new SWE_Block1D( topLayer, topLayer+size, topLayer+(2*size), size );
topCopyLayer = new SWE_Block1D( topLayer+(3*size), topLayer+(4*size), topLayer+(5*size), size );
// allocate resp. memory on the CUDA device
cudaMalloc((void**)&bottomLayerDevice, 6*size*sizeof(float));
checkCUDAError("allocate device memory for bottom copy/ghost layer");
cudaMalloc((void**)&topLayerDevice, 6*size*sizeof(float));
checkCUDAError("allocate device memory for top copy/ghost layer");
}
/**
* Destructor: de-allocate all variables
*/
SWE_BlockCUDA::~SWE_BlockCUDA() {
cudaFree(hd); cudaFree(hud); cudaFree(hvd); cudaFree(bd);
// cudaFree(maxhd); cudaFree(maxvd);
// free memory for copy/ghost layers
// (bottomLayer/topLayer were allocated with new[])
delete[] bottomLayer;
delete[] topLayer;
delete bottomGhostLayer; delete bottomCopyLayer;
delete topGhostLayer; delete topCopyLayer;
cudaFree(bottomLayerDevice);
cudaFree(topLayerDevice);
}
//==================================================================
// methods for simulation
//==================================================================
/**
* set the values of all ghost cells depending on the specified
* boundary conditions
*/
void SWE_BlockCUDA::setBoundaryConditions() {
#ifdef DBG
cout << "Call kernel to compute h in ghost layer corner (for visualisation only) "
<< flush << endl;
#endif
// Fill ghost layer corner cells
kernelHdBufferEdges<<<1,1>>>(hd, nx, ny);
#ifdef DBG
cout << "Call kernel to compute left/right boundaries " << flush << endl;
#endif
// synchWaterHeightAfterWrite();
// synchDischargeAfterWrite();
if (boundary[BND_LEFT] == PASSIVE || boundary[BND_LEFT] == CONNECT) {
// nothing to be done:
// ghost values are copied by SWE_BlockCUDA::synchGhostLayerAfterWrite(...)
}
else {
dim3 dimBlock(1,TILE_SIZE);
dim3 dimGrid(1,ny/TILE_SIZE);
kernelLeftBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_LEFT]);
};
if (boundary[BND_RIGHT] == PASSIVE || boundary[BND_RIGHT] == CONNECT) {
// nothing to be done:
// ghost values are copied by SWE_BlockCUDA::synchGhostLayerAfterWrite(...)
}
else {
dim3 dimBlock(1,TILE_SIZE);
dim3 dimGrid(1,ny/TILE_SIZE);
kernelRightBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_RIGHT]);
};
#ifdef DBG
cout << "Call kernel to compute bottom/top boundaries " << flush << endl;
#endif
switch (boundary[BND_BOTTOM]) {
case CONNECT:
{
// set ghost layer data in auxiliary data structure for ghost layer:
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = neighbour[BND_BOTTOM]->h[i];
bottomGhostLayer->hu[i] = neighbour[BND_BOTTOM]->hu[i];
bottomGhostLayer->hv[i] = neighbour[BND_BOTTOM]->hv[i];
};
}
case PASSIVE: /* also executed for case CONNECT */
{
// copy ghost layer data from buffer bottomLayerDevice
// into bottom ghost layer of unknowns
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomGhostBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,bottomLayerDevice,nx,ny);
break;
}
default:
{
// set simple boundary conditions (OUTFLOW, WALL) by resp. kernel:
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_BOTTOM]);
}
};
switch (boundary[BND_TOP]) {
case CONNECT:
{
// set ghost layer data in auxiliary data structure for ghost layer:
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = neighbour[BND_TOP]->h[i];
topGhostLayer->hu[i] = neighbour[BND_TOP]->hu[i];
topGhostLayer->hv[i] = neighbour[BND_TOP]->hv[i];
};
}
case PASSIVE: /* also executed for case CONNECT */
{
// copy ghost layer data from buffer topLayerDevice
// into top ghost layer of unknowns
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopGhostBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,topLayerDevice,nx,ny);
break;
}
default:
{
// set simple boundary conditions (OUTFLOW, WALL) by resp. kernel:
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_TOP]);
}
};
}
/**
* synchronise the ghost layer content of h, hu, and hv in main memory
* with device memory and auxiliary data structures, i.e. transfer
* memory from main/auxiliary memory into device memory
*/
void SWE_BlockCUDA::synchGhostLayerAfterWrite() {
if (boundary[BND_LEFT] == PASSIVE || boundary[BND_LEFT] == CONNECT) {
#ifdef DBG
cout << "Set left passive boundary" << endl;
#endif
// transfer h, hu, and hv from left ghost layer to resp. device memory
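// note: h, hu, and hv are stored column-wise (cf. getCellCoord), so h[0],
// the left ghost column of ny+2 floats, is contiguous in memory and can be
// transferred with a single cudaMemcpy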
cudaMemcpy(hd, h[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
cudaMemcpy(hud, hu[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
cudaMemcpy(hvd, hv[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
};
if (boundary[BND_RIGHT] == PASSIVE || boundary[BND_RIGHT] == CONNECT) {
#ifdef DBG
cout << "Set right passive boundary" << endl;
#endif
// transfer h, hu, and hv from right ghost layer to resp. device memory
cudaMemcpy(hd+((nx+1)*(ny+2)), h[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
cudaMemcpy(hud+((nx+1)*(ny+2)), hu[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
cudaMemcpy(hvd+((nx+1)*(ny+2)), hv[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
};
// bottom and top boundary ghost layers are replicated (for efficiency reasons)
// in the memory regions starting from bottomLayer and topLayer
// -> these need to be transferred to device memory
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
#ifdef DBG
cout << "Set bottom passive boundary" << endl;
cout << " * transfer " << 3*(nx+2)*sizeof(float) << " bytes." << endl;
#endif
// transfer bottom ghost layer
// (3 arrays - h, hu, hv - of nx+2 floats, consecutive in memory)
cudaMemcpy(bottomLayerDevice, bottomLayer, 3*(nx+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("bottom ghost layer not transferred to device");
};
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
#ifdef DBG
cout << "Set top passive boundary" << endl;
cout << " * transfer " << 3*(nx+2)*sizeof(float) << " bytes." << endl;
#endif
// transfer top ghost layer
// (3 arrays - h, hu, hv - of nx+2 floats, consecutive in memory)
cudaMemcpy(topLayerDevice, topLayer, 3*(nx+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("top ghost layer not transferred to device");
};
}
/**
* Update (for heterogeneous computing) variables h, hu, hv in copy layers
* before an external access to the unknowns
* (only required for PASSIVE and CONNECT boundaries)
* - copy (up-to-date) content from device memory into main memory
*/
void SWE_BlockCUDA::synchCopyLayerBeforeRead() {
#ifdef DBG
cout << "Transfer copy layers from device to main memory" << flush << endl;
#endif
// copy values in copy layers to main memory
// left copy layer:
int offset = ny+2;
cudaMemcpy(h[1], hd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
cudaMemcpy(hu[1], hud+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
cudaMemcpy(hv[1], hvd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
// right copy layer
offset = nx*(ny+2);
cudaMemcpy(h[nx], hd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
cudaMemcpy(hu[nx], hud+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
cudaMemcpy(hv[nx], hvd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
int size = 3*(nx+2);
// bottom copy layer
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomCopyLayer<<<dimGrid,dimBlock>>>(
hd,hud,hvd,bottomLayerDevice+size,nx,ny);
cudaMemcpy(bottomLayer+size, bottomLayerDevice+size, size*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("bottom copy layer not transferred from device");
};
// top copy layer
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopCopyLayer<<<dimGrid,dimBlock>>>(
hd,hud,hvd,topLayerDevice+size,nx,ny);
cudaMemcpy(topLayer+size, topLayerDevice+size, size*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("top copy layer not transferred from device");
};
}
/**
* register the row or column layer next to a boundary as a "copy layer",
* from which values will be copied into the ghost layer of a neighbour;
* @return a SWE_Block1D object that contains row variables h, hu, and hv
*/
SWE_Block1D* SWE_BlockCUDA::registerCopyLayer(BoundaryEdge edge){
// for TOP and BOTTOM layer, the implementation is identical to that in SWE_Block
// for LEFT and RIGHT layer, separate layers are used that avoid strided copies
// when transferring memory between host and device memory
switch (edge) {
case BND_LEFT:
return new SWE_Block1D( h.getColProxy(1), hu.getColProxy(1), hv.getColProxy(1));
case BND_RIGHT:
return new SWE_Block1D( h.getColProxy(nx), hu.getColProxy(nx), hv.getColProxy(nx));
case BND_BOTTOM:
// transfer bottom ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = h[i][0];
bottomGhostLayer->hu[i] = hu[i][0];
bottomGhostLayer->hv[i] = hv[i][0];
bottomCopyLayer->h[i] = h[i][1];
bottomCopyLayer->hu[i] = hu[i][1];
bottomCopyLayer->hv[i] = hv[i][1];
};
return bottomCopyLayer;
case BND_TOP:
// transfer top ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = h[i][ny+1];
topGhostLayer->hu[i] = hu[i][ny+1];
topGhostLayer->hv[i] = hv[i][ny+1];
topCopyLayer->h[i] = h[i][ny];
topCopyLayer->hu[i] = hu[i][ny];
topCopyLayer->hv[i] = hv[i][ny];
};
return topCopyLayer;
};
return NULL;
}
/**
* "grab" the ghost layer at the specific boundary in order to set boundary values
* in this ghost layer externally.
* The boundary condition at the respective ghost layer is set to PASSIVE,
* such that the grabbing program component is responsible for providing
* correct values in the ghost layer, for example by receiving data from
* a remote copy layer via MPI communication.
* @param edge the boundary edge at which the ghost layer is grabbed
* @return a SWE_Block1D object that contains row variables h, hu, and hv
*/
SWE_Block1D* SWE_BlockCUDA::grabGhostLayer(BoundaryEdge edge){
// for TOP and BOTTOM layer, the implementation is identical to that in SWE_Block
// for LEFT and RIGHT layer, separate layers are used that avoid strided copies
// when transferring memory between host and device memory
boundary[edge] = PASSIVE;
switch (edge) {
case BND_LEFT:
return new SWE_Block1D( h.getColProxy(0), hu.getColProxy(0), hv.getColProxy(0));
case BND_RIGHT:
return new SWE_Block1D( h.getColProxy(nx+1), hu.getColProxy(nx+1), hv.getColProxy(nx+1));
case BND_BOTTOM:
return bottomGhostLayer;
case BND_TOP:
return topGhostLayer;
};
return NULL;
}
//==================================================================
// protected member functions for memory model:
// in case of temporary variables (especially in non-local memory, for
// example on accelerators), the main variables h, hu, hv, and b
// are not necessarily updated after each time step.
// The following methods are called to synchronise before or after
// external read or write to the variables.
//==================================================================
/**
* Update all temporary and non-local (for heterogeneous computing) variables
* after an external update of the main variables h, hu, hv, and b.
*/
void SWE_BlockCUDA::synchAfterWrite() {
// update h, hu, hv, b in device memory
synchWaterHeightAfterWrite();
synchDischargeAfterWrite();
synchBathymetryAfterWrite();
// update the auxiliary data structures for copy and ghost layers
// at bottom (and top, see below) boundaries
// -> only required for PASSIVE and CONNECT boundaries
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
// transfer bottom ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = h[i][0];
bottomGhostLayer->hu[i] = hu[i][0];
bottomGhostLayer->hv[i] = hv[i][0];
bottomCopyLayer->h[i] = h[i][1];
bottomCopyLayer->hu[i] = hu[i][1];
bottomCopyLayer->hv[i] = hv[i][1];
};
};
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
// transfer top ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = h[i][ny+1];
topGhostLayer->hu[i] = hu[i][ny+1];
topGhostLayer->hv[i] = hv[i][ny+1];
topCopyLayer->h[i] = h[i][ny];
topCopyLayer->hu[i] = hu[i][ny];
topCopyLayer->hv[i] = hv[i][ny];
};
};
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the water height h
*/
void SWE_BlockCUDA::synchWaterHeightAfterWrite() {
#ifdef DBG
cout << "Load water height h into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(hd,h.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of h not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the discharge variables hu and hv
*/
void SWE_BlockCUDA::synchDischargeAfterWrite() {
#ifdef DBG
cout << "Load discharge hu and hv into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(hud,hu.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of hu not transferred");
cudaMemcpy(hvd,hv.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of hv not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the bathymetry b
*/
void SWE_BlockCUDA::synchBathymetryAfterWrite() {
#ifdef DBG
cout << "Load bathymetry unknowns into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(bd,b.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of b not transferred");
// computeBathymetrySources();
}
/**
* Update the main variables h, hu, hv, and b before an external read access:
* copies current content of the respective device variables hd, hud, hvd, bd
*/
void SWE_BlockCUDA::synchBeforeRead() {
synchWaterHeightBeforeRead();
synchDischargeBeforeRead();
synchBathymetryBeforeRead();
/* --- the following part is excluded:
--- it should not be necessary to update the auxiliary data structures
--- for top/bottom copy/ghost layers in main memory: these need to be
--- kept consistent by class SWE_BlockCUDA (in particular, they cannot be
--- changed externally).
--- */
// if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
// // transfer bottom ghost and copy layer to extra SWE_Block1D
// for(int i=0; i<=nx+1; i++) {
// h[i][0] = bottomGhostLayer->h[i];
// hu[i][0]= bottomGhostLayer->hu[i];
// hv[i][0]= bottomGhostLayer->hv[i];
// h[i][1] = bottomCopyLayer->h[i];
// hu[i][1]= bottomCopyLayer->hu[i];
// hv[i][1]= bottomCopyLayer->hv[i];
// };
// };
//
// if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
// // transfer bottom ghost and copy layer to extra SWE_Block1D
// for(int i=0; i<=nx+1; i++) {
// h[i][ny+1] = topGhostLayer->h[i];
// hu[i][ny+1] = topGhostLayer->hu[i];
// hv[i][ny+1] = topGhostLayer->hv[i];
// h[i][ny] = topCopyLayer->h[i];
// hu[i][ny] = topCopyLayer->hu[i];
// hv[i][ny] = topCopyLayer->hv[i];
// };
// };
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the water height h
*/
void SWE_BlockCUDA::synchWaterHeightBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy water height h from device" << flush << endl;
#endif
cudaMemcpy(h.elemVector(),hd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of h not transferred");
#ifdef DBG
cout << "Set water height in ghost-layer corner cells" << flush << endl;
#endif
// only required for visualisation: set values in corner ghost cells
h[0][0] = h[1][1];
h[0][ny+1] = h[1][ny];
h[nx+1][0] = h[nx][1];
h[nx+1][ny+1] = h[nx][ny];
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the discharge variables hu and hv
*/
void SWE_BlockCUDA::synchDischargeBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy discharge hu and hv from device" << flush << endl;
#endif
cudaMemcpy(hu.elemVector(),hud, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of hu not transferred");
cudaMemcpy(hv.elemVector(),hvd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of hv not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the bathymetry b
*/
void SWE_BlockCUDA::synchBathymetryBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy water bathymetry b from device" << flush << endl;
#endif
cudaMemcpy(b.elemVector(),bd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of b not transferred");
}
//==================================================================
/**
* overload operator<< such that data can be written via cout <<
* -> needs to be declared as friend to be allowed to access private data
*/
ostream& operator<<(ostream& os, const SWE_BlockCUDA& swe) {
os << "Grid dimensions: " << swe.nx << "x" << swe.ny << endl;
cout << "Water height:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.h[i][j] << " ";
};
os << endl;
};
cout << "Momentum in x-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hu[i][j] << " ";
};
os << endl;
};
cout << "Momentum in y-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hv[i][j] << " ";
};
os << endl;
};
#ifdef DBG
cout << "Ghost/Copy Layer bottom:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.bottomGhostLayer->h[i] << " ";
os << swe.bottomGhostLayer->hu[i] << " ";
os << swe.bottomGhostLayer->hv[i] << " ";
os << swe.bottomCopyLayer->h[i] << " ";
os << swe.bottomCopyLayer->hu[i] << " ";
os << swe.bottomCopyLayer->hv[i] << " ";
os << endl;
};
cout << "Ghost/Copy Layer top:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.topGhostLayer->h[i] << " ";
os << swe.topGhostLayer->hu[i] << " ";
os << swe.topGhostLayer->hv[i] << " ";
os << swe.topCopyLayer->h[i] << " ";
os << swe.topCopyLayer->hu[i] << " ";
os << swe.topCopyLayer->hv[i] << " ";
os << endl;
};
#endif
os << flush;
return os;
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_BLOCKCUDA_HH
#define __SWE_BLOCKCUDA_HH
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <cuda_runtime.h>
#include "tools/help.hh"
#include "SWE_Block.hh"
using namespace std;
void checkCUDAError(const char *msg);
void tryCUDA(cudaError_t err, const char *msg);
const int TILE_SIZE=16;
//const int TILE_SIZE=8;
/**
* SWE_BlockCUDA extends the base class SWE_Block towards
* a base class for a CUDA implementation of the shallow water equations.
* It adds the respective variables in GPU memory, and provides
* methods for data transfer between main and GPU memory.
*/
class SWE_BlockCUDA : public SWE_Block {
public:
// Constructor and Destructor
SWE_BlockCUDA(float _offsetX = 0, float _offsetY = 0);
virtual ~SWE_BlockCUDA();
// object methods
// ---> COULD BE IMPLEMENTED TO PROVIDE A DEFAULT IMPLEMENTATION
// // determine maximum possible time step
// virtual float getMaxTimestep();
// deliver a pointer to proxy class that represents
// the layer that is copied to an external ghost layer
virtual SWE_Block1D* registerCopyLayer(BoundaryEdge edge);
// "grab" the ghost layer in order to set these values externally
virtual SWE_Block1D* grabGhostLayer(BoundaryEdge edge);
// access to CUDA variables
/**
* @return pointer to the array #hd (water height) in device memory
*/
const float* getCUDA_waterHeight() { return hd; };
/**
* @return pointer to the array #bd (bathymetry) in device memory
*/
const float* getCUDA_bathymetry() { return bd; };
protected:
// synchronisation Methods
virtual void synchAfterWrite();
virtual void synchWaterHeightAfterWrite();
virtual void synchDischargeAfterWrite();
virtual void synchBathymetryAfterWrite();
virtual void synchGhostLayerAfterWrite();
virtual void synchBeforeRead();
virtual void synchWaterHeightBeforeRead();
virtual void synchDischargeBeforeRead();
virtual void synchBathymetryBeforeRead();
virtual void synchCopyLayerBeforeRead();
// set boundary conditions in ghost layers
virtual void setBoundaryConditions();
// define arrays for main unknowns in CUDA global memory:
// hd, hud, hvd, and bd are CUDA arrays corresp. to h, hu, hv, and b
float* hd;
float* hud;
float* hvd;
float* bd;
private:
// separate memory to hold bottom and top ghost and copy layer
// in main memory allowing non-strided access
float* bottomLayer;
float* topLayer;
SWE_Block1D* bottomGhostLayer;
SWE_Block1D* bottomCopyLayer;
SWE_Block1D* topGhostLayer;
SWE_Block1D* topCopyLayer;
// and resp. memory on the CUDA device:
float* bottomLayerDevice;
float* topLayerDevice;
// helper arrays: store maximum height and velocities to determine time step
float* maxhd;
float* maxvd;
// overload operator<< such that data can be written via cout <<
// -> needs to be declared as friend to be allowed to access private data
friend ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
};
ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
/**
Return index of hd[x][y] in linearised array
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getCellCoord(int x, int y, int ny) {
return x*(ny+2) + y;
}
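// e.g., for a block with ny=4, cell (1,1) is stored at index 1*(4+2)+1 = 7;
// each column x occupies ny+2 consecutive floats, so entire columns can be
// copied between host and device without strided access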
/**
Return index of edge-data Fhd[x][y] or Ghd[x][y] in linearised array
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getEdgeCoord(int x, int y, int ny) {
return x*(ny+1) + y;
}
/**
Return index of a specific element in the arrays of bathymetry source terms
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getBathyCoord(int x, int y, int ny) {
return x*ny + y;
}
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_BlockCUDA_kernels.hh"
/**
Sets corner values of hd (only needed for visualization)
@param hd h-values on device
*/
__global__
void kernelHdBufferEdges(float* hd, int nx, int ny)
{
hd[getCellCoord(0 ,0 ,ny)] = hd[getCellCoord(1 ,1 ,ny)];
hd[getCellCoord(0 ,ny+1,ny)] = hd[getCellCoord(1 ,ny,ny)];
hd[getCellCoord(nx+1,0 ,ny)] = hd[getCellCoord(nx,1 ,ny)];
hd[getCellCoord(nx+1,ny+1,ny)] = hd[getCellCoord(nx,ny,ny)];
//Corresponding C-Code:
//h[0][0] = h[1][1];
//h[0][ny+1] = h[1][ny];
//h[nx+1][0] = h[nx][1];
//h[nx+1][ny+1] = h[nx][ny];
}
//******************************************************************
// kernels to implement boundary conditions
//******************************************************************
/**
* CUDA kernel to set left boundary layer for conditions WALL & OUTFLOW
* blockIdx.y and threadIdx.y loop over the boundary elements
* SWE_Block size ny is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelLeftBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int ghost = getCellCoord(0,j,ny);
int inner = getCellCoord(1,j,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
hvd[ghost] = hvd[inner];
}
/**
* CUDA kernel to set right boundary layer for conditions WALL & OUTFLOW
* blockIdx.y and threadIdx.y loop over the boundary elements
* SWE_Block size ny is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelRightBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int ghost = getCellCoord(nx+1,j,ny);
int inner = getCellCoord(nx ,j,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
hvd[ghost] = hvd[inner];
}
/**
* CUDA kernel to set bottom boundary layer for conditions WALL & OUTFLOW
* blockIdx.x and threadIdx.x loop over the boundary elements
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,0,ny);
int inner = getCellCoord(i,1,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = hud[inner];
hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner];
}
/**
* CUDA kernel to set top boundary layer for conditions WALL & OUTFLOW
* blockIdx.x and threadIdx.x loop over the boundary elements
*/
__global__
void kernelTopBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,ny+1,ny);
int inner = getCellCoord(i,ny ,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = hud[inner];
hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner];
}
/**
* CUDA kernel to set bottom boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements.
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
float* bottomGhostLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,0,ny);
hd[ghost] = bottomGhostLayer[i];
hud[ghost] = bottomGhostLayer[(nx+2)+i];
hvd[ghost] = bottomGhostLayer[2*(nx+2)+i];
}
/**
* CUDA kernel to set top boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
float* topGhostLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,ny+1,ny);
hd[ghost] = topGhostLayer[i];
hud[ghost] = topGhostLayer[(nx+2)+i];
hvd[ghost] = topGhostLayer[2*(nx+2)+i];
}
/**
* CUDA kernel to update bottom copy layer according
* (for boundary conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements.
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
float* bottomCopyLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int copy = getCellCoord(i,1,ny);
bottomCopyLayer[i] = hd[copy];
bottomCopyLayer[(nx+2)+i] = hud[copy];
bottomCopyLayer[2*(nx+2)+i] = hvd[copy];
}
/**
* CUDA kernel to set top boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
float* topCopyLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int copy = getCellCoord(i,ny,ny);
topCopyLayer[i] = hd[copy];
topCopyLayer[(nx+2)+i] = hud[copy];
topCopyLayer[2*(nx+2)+i] = hvd[copy];
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_BLOCKCUDAKERNELS_HH
#define __SWE_BLOCKCUDAKERNELS_HH
// declaration of CUDA kernels
__global__
void kernelHdBufferEdges(float* hd, int nx, int ny);
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
__global__
void kernelLeftBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelRightBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelBottomBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelTopBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
float* bottomGhostLayer, int nx, int ny);
__global__
void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
float* topGhostLayer, int nx, int ny);
__global__
void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
float* bottomCopyLayer, int nx, int ny);
__global__
void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
float* topCopyLayer, int nx, int ny);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include <math.h>
#include "tools/help.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_RusanovBlockCUDA.hh"
//const int TILE_SIZE=16;
//const int TILE_SIZE=8;
// #include "SWE_BlockCUDA_kernels.cu"
#include "SWE_RusanovBlockCUDA_kernels.hh"
/**
* Constructor: allocate variables for simulation
*
* unknowns h,hu,hv,b are defined on grid indices [0,..,nx+1]*[0,..,ny+1]
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* flux terms are defined for edges with indices [0,..,nx]*[1,..,ny]
* or [1,..,nx]*[0,..,ny] (for horizontal/vertical edges)
* Flux term with index (i,j) is located on the edge between
* cells with index (i,j) and (i+1,j) or (i,j+1)
*
* bathymetry source terms are defined for cells with indices [1,..,nx]*[1,..,ny]
*/
SWE_RusanovBlockCUDA::SWE_RusanovBlockCUDA(float _offsetX, float _offsetY)
: SWE_BlockCUDA(_offsetX,_offsetY)
#ifdef DBG
, Fh(nx+1,ny+1), Fhu(nx+1,ny+1), Fhv(nx+1,ny+1),
Gh(nx+1,ny+1), Ghu(nx+1,ny+1), Ghv(nx+1,ny+1)
#endif
{
int size = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the flux terms in x-direction
cudaMalloc((void**)&Fhd, size);
checkCUDAError("allocate device memory for Fh");
cudaMalloc((void**)&Fhud, size);
checkCUDAError("allocate device memory for Fhu");
cudaMalloc((void**)&Fhvd, size);
checkCUDAError("allocate device memory for Fhv");
// allocate CUDA memory for the flux terms in y-direction
cudaMalloc((void**)&Ghd, size);
checkCUDAError("allocate device memory for Gh");
cudaMalloc((void**)&Ghud, size);
checkCUDAError("allocate device memory for Ghu");
cudaMalloc((void**)&Ghvd, size);
checkCUDAError("allocate device memory for Ghv");
// allocate CUDA unknowns: bathymetry source terms
size = nx*ny*sizeof(float);
cudaMalloc((void**)&Bxd, size);
checkCUDAError("allocate device memory for Bxd");
cudaMalloc((void**)&Byd, size);
checkCUDAError("allocate device memory for Byd");
// allocate CUDA unknowns: maximum height and velocity
size = (nx/TILE_SIZE)*(ny/TILE_SIZE)*sizeof(float);
cudaMalloc((void**)&maxhd, size);
checkCUDAError("allocate device memory for maxhd");
cudaMalloc((void**)&maxvd, size);
checkCUDAError("allocate device memory for maxvd");
}
/**
* Destructor: de-allocate all variables
*/
SWE_RusanovBlockCUDA::~SWE_RusanovBlockCUDA() {
cudaFree(Fhd); cudaFree(Fhud); cudaFree(Fhvd);
cudaFree(Ghd); cudaFree(Ghud); cudaFree(Ghvd);
cudaFree(Bxd); cudaFree(Byd);
cudaFree(maxhd); cudaFree(maxvd);
}
//==================================================================
// methods for simulation
//==================================================================
/**
* compute the largest allowed time step for the current grid block;
* will be stored in the member variable SWE_Block::maxTimestep.
* This specific CUDA implementation requires velocity data precomputed
* by the kernel kernelEulerTimestep, and therefore needs to be called
* directly after calling this kernel.
*/
void SWE_RusanovBlockCUDA::computeMaxTimestepCUDA() {
float hmax = 0.0;
float vmax = 0.0;
float meshSize = (dx<dy) ? dx : dy;
int numTiles = (nx/TILE_SIZE)*(ny/TILE_SIZE);
#ifdef DBG
cout << "Number of tiles: " << numTiles << " -> ";
#endif
// use 512 threads for the maximum computation ...
int threads = 512;
// ... or the largest power of two not exceeding numTiles:
while (threads > numTiles) threads >>=1;
#ifdef DBG
cout << "use " << threads << " threads per tile" << endl << flush;
#endif
// loop over arrays of maxhd and maxvd to determine maximum
// GPU maximum kernel called for 512 (or threads) elements at a time
// CPU maintains maximum of all groups-of-512
for(int tile=numTiles; tile>0; tile -= threads) {
float newmax = 0;
dim3 dimBlock(threads,1,1);
dim3 dimGrid(1,1);
// Compute maximum of arrays maxhd and maxvd
// (for elements in range [tile-threads .. tile-1])
int start = tile-threads;
// use element range [0 .. threads-1], if tile-threads < 0
if (start<0) start=0;
// cout << "Call kernel to compute time step " << start << flush << endl;
kernelMaximum<<<dimGrid,dimBlock>>>(maxhd,maxvd,start,threads);
checkCUDAError("compute maximum height and velocity");
// the reduction leaves its result at index start (cf. kernelMaximum)
cudaMemcpy(&newmax,maxhd+start, sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("memory of hmax not transferred");
if (newmax>hmax) hmax = newmax;
cudaMemcpy(&newmax,maxvd+start, sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("memory of vmax not transferred");
if (newmax>vmax) vmax = newmax;
#ifdef DBG
cout << "Current maxmimum of tile " << start << ":"
<< hmax << ',' << vmax << endl << flush;
#endif
};
#ifdef DBG
cout << "Computed Time step (CUDA): "
<< " from " << hmax << " and " << vmax << " -> "
<< meshSize/(sqrt(g*hmax) + vmax) << endl << flush;
#endif
// sqrt(g*hmax) + vmax is an upper bound for the propagation speed of the
// characteristic shallow-water waves; a wave must not propagate farther
// than one mesh cell in a single time step
// (uses 0.5 as a pessimistic safety factor to reduce the time step size)
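// illustrative example (assuming g=9.81): for hmax=10m, vmax=1m/s and
// meshSize=1m this yields maxTimestep = 0.5*1/(sqrt(98.1)+1) ~ 0.046s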
maxTimestep = 0.5*meshSize/(sqrt(g*hmax) + vmax);
}
/**
* Depending on the current values of h, hu, hv (incl. ghost layers)
* update these unknowns in each grid cell
* (ghost layers and bathymetry are not updated).
* The Rusanov CUDA-implementation of simulateTimestep
* subsequently calls the functions computeNumericalFluxes (to compute
* all fluxes on grid edges), and updateUnknowns (to update the variables
* according to flux values, typically according to an Euler time step).
* @param dt size of the time step
*/
void SWE_RusanovBlockCUDA::simulateTimestep(float dt) {
// compute the numerical fluxes on all edges
computeNumericalFluxes();
// update the unknowns according to an Euler timestep
updateUnknowns(dt);
}
/**
* perform forward-Euler time steps, starting at simulation time tStart,
* until simulation time tEnd is reached;
* device-global variables hd, hud, hvd are updated;
* unknowns h, hu, hv in main memory are not updated.
* Ghost layers and bathymetry sources are updated between timesteps.
* intended as main simulation loop between two checkpoints
*/
__host__
float SWE_RusanovBlockCUDA::simulate(float tStart, float tEnd) {
float t = tStart;
do {
// set values in ghost cells:
setGhostLayer();
// execute Euler time step:
simulateTimestep(maxTimestep);
t += maxTimestep;
#ifdef DBG
cout << "Simulation at time " << t << endl << flush;
#endif
} while(t < tEnd);
return t;
}
/**
* implements interface function updateUnknowns:
* based on the (Rusanov) fluxes computed on each edge
* (and stored in the variables Fh, Gh, etc.);
* compute the balance terms for each cell, and update the
* unknowns according to an Euler time step.
* It will force an update of the copy layer in the main
* memory by calling synchCopyLayerBeforeRead(),
* and compute the maximum allowed time step size
* by calling computeMaxTimestepCUDA().
* @param dt size of the time step.
*/
__host__
void SWE_RusanovBlockCUDA::updateUnknowns(float dt) {
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel for Euler timestep " << flush << endl;
#endif
kernelEulerTimestep<<<dimGrid,dimBlock>>>(hd,hud,hvd,
Fhd,Fhud,Fhvd,Ghd,Ghud,Ghvd,
Bxd,Byd,
maxhd,maxvd,
nx,ny,dt,1.0f/dx,1.0f/dy);
computeMaxTimestepCUDA();
synchCopyLayerBeforeRead();
}
/**
* compute the flux terms on all edges
*/
void SWE_RusanovBlockCUDA::computeNumericalFluxes() {
// time step required to compute the Lax-Friedrichs constant dx/dt
float dt = 2*maxTimestep;
// compute bathymetry source terms (depend on h)
computeBathymetrySources();
// compute F-fluxes, i.e. all fluxes in x-direction (on left/right edges)
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel to compute F-fluxes " << flush << endl;
#endif
kernelComputeFluxesF<<<dimGrid,dimBlock>>>(hd,hud,hvd, Fhd,Fhud,Fhvd,
ny,g,dx/dt,1);
dim3 dimLeftBlock(1,TILE_SIZE);
dim3 dimLeftGrid(1,ny/TILE_SIZE);
kernelComputeFluxesF<<<dimLeftGrid,dimLeftBlock>>>(hd,hud,hvd, Fhd,Fhud,Fhvd,
ny,g,dx/dt,0);
// compute G-fluxes, i.e. all fluxes in y-direction (on top/bottom edges)
#ifdef DBG
cout << "Call kernel to compute G-fluxes " << flush << endl;
#endif
kernelComputeFluxesG<<<dimGrid,dimBlock>>>(hd,hud,hvd, Ghd,Ghud,Ghvd,
ny,g,dy/dt,1);
dim3 dimTopBlock(TILE_SIZE,1);
dim3 dimTopGrid(nx/TILE_SIZE,1);
kernelComputeFluxesG<<<dimTopGrid,dimTopBlock>>>(hd,hud,hvd, Ghd,Ghud,Ghvd,
ny,g,dy/dt,0);
// cout << (*this) << flush;
}
/**
* compute the bathymetry source terms in all cells
*/
void SWE_RusanovBlockCUDA::computeBathymetrySources() {
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel to compute bathymetry sources" << flush << endl;
#endif
kernelComputeBathymetrySources<<<dimGrid,dimBlock>>>(hd,bd,Bxd,Byd,ny,g);
// cout << (*this) << flush;
}
//==================================================================
/**
* overload operator<< such that data can be written via cout <<
* -> needs to be declared as friend to be allowed to access private data
*/
ostream& operator<<(ostream& os, const SWE_RusanovBlockCUDA& swe) {
os << "Grid dimensions: " << swe.nx << "x" << swe.ny << endl;
cout << "Water height:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.h[i][j] << " ";
};
os << endl;
};
cout << "Momentum in x-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hu[i][j] << " ";
};
os << endl;
};
cout << "Momentum in y-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hv[i][j] << " ";
};
os << endl;
};
#ifdef DBG
cout << "Ghost/Copy Layer bottom:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.bottomGhostLayer->h[i] << " ";
os << swe.bottomGhostLayer->hu[i] << " ";
os << swe.bottomGhostLayer->hv[i] << " ";
os << swe.bottomCopyLayer->h[i] << " ";
os << swe.bottomCopyLayer->hu[i] << " ";
os << swe.bottomCopyLayer->hv[i] << " ";
os << endl;
};
cout << "Ghost/Copy Layer top:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.topGhostLayer->h[i] << " ";
os << swe.topGhostLayer->hu[i] << " ";
os << swe.topGhostLayer->hv[i] << " ";
os << swe.topCopyLayer->h[i] << " ";
os << swe.topCopyLayer->hu[i] << " ";
os << swe.topCopyLayer->hv[i] << " ";
os << endl;
};
#endif
// #ifdef DBG
// cout << "Fluss - Wellenhoehe:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fh[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in x-Richtung:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fhu[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in y-Richtung:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fhv[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Wellenhoehe:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Gh[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in x-Richtung:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Ghu[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in y-Richtung:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Ghv[i][j] << " ";
// };
// os << endl;
// };
// #endif
os << flush;
return os;
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_RUSANOVBLOCKCUDA_HH
#define __SWE_RUSANOVBLOCKCUDA_HH
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <cuda_runtime.h>
#include "tools/help.hh"
#include "SWE_Block.hh"
#include "SWE_BlockCUDA.hh"
using namespace std;
/**
* SWE_RusanovBlockCUDA extends the base class SWE_BlockCUDA,
* and provides a concrete CUDA implementation of a simple
* shallow water model based on Rusanov Flux computation on the
* edges and explicit time stepping.
*/
class SWE_RusanovBlockCUDA : public SWE_BlockCUDA {
public:
// Constructor and Destructor
SWE_RusanovBlockCUDA(float _offsetX = 0, float _offsetY = 0);
virtual ~SWE_RusanovBlockCUDA();
// object methods
// compute flux terms on edges
virtual void computeNumericalFluxes();
// execute Euler time step
virtual void updateUnknowns(float dt);
// execute a single time step of the simulation
virtual void simulateTimestep(float dt);
// simulate for specified time range
virtual float simulate(float tStart, float tEnd);
private:
// compute bathymetry source terms
void computeBathymetrySources();
// determine maximum possible time step
void computeMaxTimestepCUDA();
// arrays to hold the values of the flux terms at cell edges
float* Fhd;
float* Fhud;
float* Fhvd;
float* Ghd;
float* Ghud;
float* Ghvd;
// arrays to hold the bathymetry source terms for the hu and hv equations
float* Bxd;
float* Byd;
// helper arrays: store maximum height and velocities to determine time step
float* maxhd;
float* maxvd;
// overload operator<< such that data can be written via cout <<
// -> needs to be declared as friend to be allowed to access private data
friend ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
#ifdef DBG
// --- only required for debugging purposes ---
// arrays for fluxes for h,hu,hv in main memory
Float2D Fh;
Float2D Fhu;
Float2D Fhv;
Float2D Gh;
Float2D Ghu;
Float2D Ghv;
// dump fluxes for h,hu,hv from CUDA device memory into main memory
void cudaDumpFlux();
#endif
};
ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_RusanovBlockCUDA_kernels.hh"
//******************************************************************
// kernels to implement Euler time-stepping
//******************************************************************
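/**
 * local Lax-Friedrichs (Rusanov-type) flux:
 * computeFlux returns 0.5*(fLow + fHigh) - 0.5*llf*(xiHigh - xiLow),
 * i.e. the average of the physical fluxes of the two adjacent cells plus
 * a diffusive term scaled by the wave speed estimate llf
 */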
inline __device__
float computeFlux(float fLow, float fHigh, float xiLow, float xiHigh, float llf) {
// local Lax-Friedrich
return 0.5f*(fLow+fHigh) - 0.5f*llf*(xiHigh-xiLow);
}
/**
* computes the flux vector components Fhd, Fhud and Fhvd for a single
* edge by calling the function computeFlux
*/
__global__
void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
int ny, float g, float llf, int istart)
{
int i = istart + TILE_SIZE*blockIdx.x + threadIdx.x;
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int iL = getCellCoord(i,j,ny); // index of left cell
int iR = getCellCoord(i+1,j,ny); // index of right cell
int iEdge = getEdgeCoord(i,j,ny); // index of current edge
float upwind = max( fabs(hud[iL]/hd[iL]), fabs(hud[iR]/hd[iR]) );
Fhd[iEdge] = computeFlux( hud[iL], hud[iR], hd[iL], hd[iR], upwind );
Fhud[iEdge] = computeFlux( hud[iL]*hud[iL]/hd[iL] + 0.5f*g*hd[iL]*hd[iL],
hud[iR]*hud[iR]/hd[iR] + 0.5f*g*hd[iR]*hd[iR],
hud[iL],
hud[iR],
llf );
Fhvd[iEdge] = computeFlux( hud[iL]*hvd[iL]/hd[iL],hud[iR]*hvd[iR]/hd[iR],
hvd[iL], hvd[iR],
llf );
}
/**
* computes the flux vector components Ghd, Ghud and Ghvd for a single
* edge by calling the function computeFlux
*/
__global__
void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
float* Ghd, float* Ghud, float* Ghvd,
int ny, float g, float llf, int jstart)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int j = jstart + TILE_SIZE*blockIdx.y + threadIdx.y;
int iB = getCellCoord(i,j ,ny);
int iT = getCellCoord(i,j+1,ny);
int iEdge = getEdgeCoord(i,j,ny);
float upwind = max( fabs(hvd[iB]/hd[iB]), fabs(hvd[iT]/hd[iT]) );
Ghd[iEdge] = computeFlux( hvd[iB], hvd[iT], hd[iB], hd[iT], upwind );
Ghud[iEdge] = computeFlux( hud[iB]*hvd[iB]/hd[iB],hud[iT]*hvd[iT]/hd[iT],
hud[iB], hud[iT],
llf );
Ghvd[iEdge] = computeFlux( hvd[iB]*hvd[iB]/hd[iB] + 0.5f*g*hd[iB]*hd[iB],
hvd[iT]*hvd[iT]/hd[iT] + 0.5f*g*hd[iT]*hd[iT],
hvd[iB], hvd[iT],
llf );
}
/**
* computes the bathymetry source terms for the hu and hv equation for
* a given cell in the resp. array elements Bxd and Byd
*/
__global__
void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd,
int ny, float g)
{
// Note: different index ranges for h and b vs. Bxd, Byd:
// [0..nx+1]x[0..ny+1] vs. [1..nx]x[1..ny]
// Note: indices for Bxd, Byd shifted to start with 0
int i = TILE_SIZE*blockIdx.x + threadIdx.x;
int j = TILE_SIZE*blockIdx.y + threadIdx.y;
// compute indices of involved array elements
int ij = getBathyCoord(i,j,ny);
int left = getCellCoord(i ,j+1,ny); // index of left cell (arrays hd,bd)
int right = getCellCoord(i+2,j+1,ny); // index of right cell (arrays hd,bd)
Bxd[ij] = g * 0.5f*(hd[right] + hd[left]) * 0.5f*(bd[right] - bd[left]);
int bot = getCellCoord(i+1,j,ny); // index of bottom cell (arrays hd,bd)
int top = getCellCoord(i+1,j+2,ny); // index of top cell (arrays hd,bd)
Byd[ij] = g * 0.5f*(hd[top] + hd[bot]) * 0.5f*(bd[top] - bd[bot]);
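// Corresponding C-code (with ci=i+1, cj=j+1 as cell indices):
// Bx[ci][cj] = g * 0.5*(h[ci+1][cj]+h[ci-1][cj]) * 0.5*(b[ci+1][cj]-b[ci-1][cj]);
// By[ci][cj] = g * 0.5*(h[ci][cj+1]+h[ci][cj-1]) * 0.5*(b[ci][cj+1]-b[ci][cj-1]);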
}
/**
 * CUDA kernel for the Euler time step: updates each cell via
 * h -= dt*( (Fh_right - Fh_left)*dxi + (Gh_top - Gh_bottom)*dyi )
 * (analogously for hu and hv, including the bathymetry source terms),
 * and precomputes the per-block maxima of water height and velocity
 * required to determine the next admissible time step
 */
__global__
void kernelEulerTimestep(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
float* Ghd, float* Ghud, float* Ghvd,
float* Bxd, float* Byd,
float* maxhd, float* maxvd,
int nx, int ny, float dt, float dxi, float dyi)
{
__shared__ float Fds[TILE_SIZE+1][TILE_SIZE+1];
__shared__ float Gds[TILE_SIZE+1][TILE_SIZE+1];
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = 1 + TILE_SIZE*blockIdx.x + tx;
int j = 1 + TILE_SIZE*blockIdx.y + ty;
int iElem = getCellCoord(i,j,ny); // index of current cell
int iEdge = getEdgeCoord(i,j,ny); // index of right/top Edge
int iLeft = getEdgeCoord(i-1,j,ny); // index of left Edge
int iBot = getEdgeCoord(i,j-1,ny); // index of bottom Edge
float h;
float hu;
float hv;
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable h
Fds[tx+1][ty] = Fhd[iEdge];
Gds[tx][ty+1] = Ghd[iEdge];
if (tx==0) Fds[tx][ty] = Fhd[iLeft];
if (ty==0) Gds[tx][ty] = Ghd[iBot];
__syncthreads();
// compute new value of h from fluxes
h = hd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi );
__syncthreads();
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable hu
Fds[tx+1][ty] = Fhud[iEdge];
Gds[tx][ty+1] = Ghud[iEdge];
if (tx==0) Fds[tx][ty] = Fhud[iLeft];
if (ty==0) Gds[tx][ty] = Ghud[iBot];
__syncthreads();
// compute new value of hu from fluxes
hu = hud[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi
+ Bxd[getBathyCoord(i-1,j-1,ny)]*dxi );
__syncthreads();
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable hv
Fds[tx+1][ty] = Fhvd[iEdge];
Gds[tx][ty+1] = Ghvd[iEdge];
if (tx==0) Fds[tx][ty] = Fhvd[iLeft];
if (ty==0) Gds[tx][ty] = Ghvd[iBot];
__syncthreads();
// compute new value of hv from fluxes
hv = hvd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi
+ Byd[getBathyCoord(i-1,j-1,ny)]*dyi );
__syncthreads();
/* precompute maximal height and velocity per thread block
* (for computation of allowed time step size)
*/
// compute absolute values of h and absolute velocity
hd[iElem] = h; Fds[tx][ty] = h;
hud[iElem] = hu; hu = (h>0.0) ? fabs(hu/h) : 0.0;
hvd[iElem] = hv; hv = (h>0.0) ? fabs(hv/h) : 0.0;
Gds[tx][ty] = (hu>hv) ? hu : hv;
// parallel reduction on thread block:
// determine maximum wave height and velocity
// step 1: reduction in ty-direction
for (i=TILE_SIZE>>1; i>0; i>>=1) {
__syncthreads();
if (ty < i) {
if( Fds[tx][ty] < Fds[tx][ty+i]) Fds[tx][ty] = Fds[tx][ty+i];
if( Gds[tx][ty] < Gds[tx][ty+i]) Gds[tx][ty] = Gds[tx][ty+i];
};
};
  // step 2: reduction in tx-direction
for (i=TILE_SIZE>>1; i>0; i>>=1) {
__syncthreads();
if ((tx < i) && (ty==0)) {
if( Fds[tx][ty] < Fds[tx+i][ty]) Fds[tx][ty] = Fds[tx+i][ty];
if( Gds[tx][ty] < Gds[tx+i][ty]) Gds[tx][ty] = Gds[tx+i][ty];
};
};
  // save maxima in arrays maxhd and maxvd (one entry per thread block)
  if ((tx == 0) && (ty==0)) {
     j = blockIdx.x*gridDim.y + blockIdx.y; // stride gridDim.y (= ny/TILE_SIZE); the former stride nx/TILE_SIZE was only correct for nx==ny
maxhd[j] = Fds[0][0];
maxvd[j] = Gds[0][0];
};
}
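/*
 * Hedged usage sketch (not part of this commit): how the Euler time-step kernel
 * above would typically be launched from the host. The variables mirror the
 * members of SWE_BlockCUDA (hd, hud, ..., maxhd, maxvd) and are assumptions
 * here; dxi and dyi are the inverse mesh widths, so the kernel can multiply
 * instead of dividing.
 */
#if 0 // illustration only
void launchEulerTimestep(float dt) {
   dim3 dimBlock(TILE_SIZE, TILE_SIZE);      // one thread per cell of a tile
   dim3 dimGrid(nx/TILE_SIZE, ny/TILE_SIZE); // nx, ny must be multiples of TILE_SIZE
   kernelEulerTimestep<<<dimGrid,dimBlock>>>(hd, hud, hvd,
                                             Fhd, Fhud, Fhvd,
                                             Ghd, Ghud, Ghvd,
                                             Bxd, Byd,
                                             maxhd, maxvd,
                                             nx, ny, dt, 1.0f/dx, 1.0f/dy);
   checkCUDAError("kernelEulerTimestep");
}
#endif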
//******************************************************************
// kernels to implement boundary conditions
//******************************************************************
/**
* CUDA kernel for maximum reduction
 * required to compute the maximum water height and velocity to determine the
 * allowed time step
*/
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size) {
   // note: assumes a single thread block and a power-of-two size
   int tx = start+threadIdx.x;
for (int i=size>>1; i>0; i>>=1) {
__syncthreads();
if (tx < i) {
if( maxhd[tx] < maxhd[tx+i] ) maxhd[tx] = maxhd[tx+i];
if( maxvd[tx] < maxvd[tx+i] ) maxvd[tx] = maxvd[tx+i];
};
};
}
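/*
 * Hedged usage sketch (not part of this commit): finishing the block-wise
 * reduction. kernelEulerTimestep leaves one maximum per thread block in
 * maxhd/maxvd; a single-block launch of kernelMaximum folds these into the
 * first element. The number of blocks is an assumption matching the
 * (commented-out) allocation in the constructor; it must be a power of two
 * and at most the maximum thread block size.
 */
#if 0 // illustration only
int numBlocks = (nx/TILE_SIZE)*(ny/TILE_SIZE);
kernelMaximum<<<1,numBlocks>>>(maxhd, maxvd, 0, numBlocks);
checkCUDAError("kernelMaximum");
// copy the global maxima back to the host
float maxh, maxv;
cudaMemcpy(&maxh, maxhd, sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(&maxv, maxvd, sizeof(float), cudaMemcpyDeviceToHost);
#endif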
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_RUSANOVBLOCKCUDAKERNELS_HH
#define __SWE_RUSANOVBLOCKCUDAKERNELS_HH
//******************************************************************
// kernels to implement Euler time-stepping
//******************************************************************
__global__
void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
int ny, float g, float llf, int istart);
__global__
void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
float* Ghd, float* Ghud, float* Ghvd,
int ny, float g, float llf, int jstart);
__global__
void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd,
int ny, float g);
__global__
void kernelEulerTimestep(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
float* Ghd, float* Ghud, float* Ghvd,
float* Bxd, float* Byd,
float* maxhd, float* maxvd,
int nx, int ny, float dt, float dxi, float dyi);
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#include "SWE_WavePropagationBlockCuda.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cassert>
#ifndef STATICLOGGER
#define STATICLOGGER
#include "tools/Logger.hpp"
static tools::Logger s_sweLogger;
#endif
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
#include <thrust/device_vector.h>
/**
* Constructor of a SWE_WavePropagationBlockCuda.
*
* Allocates the variables for the simulation:
 * Please note: The definition of the indices differs from the CPU implementation.
*
* unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
*
* A left/right net update with index (i-1,j) is located on the edge between
* cells with index (i-1,j) and (i,j):
*
* *********************
* * * *
* * (i-1,j) * (i,j) *
* * * *
* *********************
*
* *
* ***
* *****
* *
* *
* NetUpdatesLeft(i-1,j)
* or
* NetUpdatesRight(i-1,j)
*
*
* A below/above net update with index (i, j-1) is located on the edge between
* cells with index (i, j-1) and (i,j):
*
* ***********
* * *
* * (i, j) * *
* * * ** NetUpdatesBelow(i,j-1)
* *********** ***** or
* * * ** NetUpdatesAbove(i,j-1)
* * (i,j-1) * *
* * *
* ***********
*
* @param i_offsetX spatial offset of the block in x-direction.
 * @param i_offsetY spatial offset of the block in y-direction.
*/
SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
const float i_offsetY ): SWE_BlockCUDA(i_offsetX,i_offsetY) {
// compute the size of one 1D net-update array.
int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the net-updates
cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesLeftD");
cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesRightD");
cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesLeftD");
cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesRightD");
cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesBelowD");
cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
}
/**
* Destructor of a SWE_WavePropagationBlockCuda.
*
* Frees all of the memory, which was allocated within the constructor.
 * Resets the CUDA device: useful if an error occurred and printf is used on the device (flushes the buffer).
*/
SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
// free the net-updates memory
cudaFree(hNetUpdatesLeftD);
cudaFree(hNetUpdatesRightD);
cudaFree(huNetUpdatesLeftD);
cudaFree(huNetUpdatesRightD);
cudaFree(hNetUpdatesBelowD);
cudaFree(hNetUpdatesAboveD);
cudaFree(hvNetUpdatesBelowD);
cudaFree(hvNetUpdatesAboveD);
// reset the cuda device
s_sweLogger.printString("Resetting the CUDA devices");
cudaDeviceReset();
}
/**
* Compute a single global time step of a given time step width.
 * Remark: The user is responsible for the time step width; no additional check is done. The time step width is typically
 * available after the computation of the numerical fluxes (hidden in this method).
*
* First the net-updates are computed.
* Then the cells are updated with the net-updates and the given time step width.
*
* @param i_dT time step width in seconds.
*/
__host__
void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(i_dT);
}
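/*
 * Hedged driver sketch (not part of this commit): how a host time loop can use
 * the block. Instead of prescribing i_dT, the loop lets the block derive the
 * admissible step from the flux computation; getMaxTimestep() returning the
 * member maxTimestep set in computeNumericalFluxes(), setGhostLayer(), and the
 * names l_block/l_endTime are assumptions about the surrounding interface.
 */
#if 0 // illustration only
float l_time = 0.0f;
while (l_time < l_endTime) {
   l_block.setGhostLayer();               // apply boundary conditions (assumed helper)
   l_block.computeNumericalFluxes();      // also reduces the maximum wave speed
   float l_dT = l_block.getMaxTimestep(); // CFL-limited step width
   l_block.updateUnknowns(l_dT);
   l_time += l_dT;
}
#endif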
/**
* Compute the numerical fluxes (net-update formulation here) on all edges.
*
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*
* TODO: A splitting or direct computation of the time step width might increase the total time step size.
* Example:
* dx = 11, dy = 6;
* max wave speed in x-direction: 10
* max wave speed in y-direction: 5.5
* max wave speed in both direction: 10
*
* => maximum time step (current implementation): min(11/10, 6/10) = 0.6
* => maximum time step (splitting the dimensions): min(11/10, 6/5.5) = 1.09..
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
  //! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
* This grid covers only edges 0..#(edges in x-direction)-2 and 0..#(edges in y-direction)-2.
*
* An example with a computational domain of size
* nx = 24, ny = 16
* with a 1 cell ghost layer would result in a grid with
* (nx+2)*(ny+2) = (26*18)
* cells and
* (nx+1)*(ny+1) = (25*17)
* edges.
*
* The CUDA-blocks (here 8*8) mentioned above would cover all edges except
* the ones lying between the computational domain and the right/top ghost layer:
*
* *
* ** top ghost layer,
* ******** cell ids
* ******************************* ** = (*, ny+1)
* * * * * *
* * 8*8 * 8*8 * 8*8 *
* * block * block * block *
* * * * *
* *******************************
* * * * *
* * 8*8 * 8*8 * 8*8 *
* * * block * block * block *
* bottom ** * * * *
* ghost ******** *******************************
* layer, **
* cell ids * * *
* =(*,0) *** ***
* * *
* * *
* left ghost layer, right ghost layer,
* cell ids = (0,*) cell ids = (nx+1, *)
*/
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
  // number of elements of the maximum wave speed array (grid blocks + 'boundary' blocks, without the top-right block); sizeof(float) not included
int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
/*
   * Compute the net-updates for the 'main' part and the two 'boundary' parts.
*/
// compute the net-updates for the "main" part.
computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx,ny
);
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
nx, 0,
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
0, ny,
0, dimGrid.y);
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
*
* The Thrust library is used in this step.
* An optional kernel could be written for the maximum reduction.
*/
// Thrust pointer to the device array
thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
// use Thrusts max_element-function for the maximum reduction
thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
// get the result from the device
float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
// free the max wave speeds array on the device
cudaFree(l_maximumWaveSpeedsD);
// set the maximum time step for this SWE_WavePropagationBlockCuda
maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
  // apply a CFL number of 0.4 (slightly conservative)
maxTimestep *= (float)0.4;
}
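/*
 * Hedged sketch (not part of this commit) of the dimensional splitting proposed
 * in the TODO above: keeping one wave speed maximum per direction admits a
 * larger stable time step. l_maxWaveSpeedX/l_maxWaveSpeedY are hypothetical
 * results of two separate (per-direction) reductions.
 */
#if 0 // illustration only
float l_maxWaveSpeedX; // maximum over the horizontal-edge wave speeds
float l_maxWaveSpeedY; // maximum over the vertical-edge wave speeds
maxTimestep = std::min( dx/l_maxWaveSpeedX, dy/l_maxWaveSpeedY );
maxTimestep *= (float)0.4; // same CFL factor as above
// with dx=11, dy=6 and speeds 10 / 5.5 this yields min(1.1, 1.09..) instead of 0.6
#endif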
/**
* Update the cells with a given global time step.
*
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
  //! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
//! definition of the CUDA-grid.
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// compute the update width.
float l_updateWidthX = i_deltaT / dx;
float l_updateWidthY = i_deltaT / dy;
// update the unknowns (global time step)
updateUnknownsKernel<<<dimGrid,dimBlock>>>( hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
hd, hud, hvd,
l_updateWidthX, l_updateWidthY,
nx, ny);
}
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#ifndef SWEWAVEPROPAGATIONBLOCKCUDA_HH_
#define SWEWAVEPROPAGATIONBLOCKCUDA_HH_
#include <cassert>
#include "SWE_BlockCUDA.hh"
/**
 * SWE_WavePropagationBlockCuda is an implementation of the abstract class SWE_BlockCUDA.
 * It uses a wave propagation solver, which is selected with the preprocessor flag WAVE_PROPAGATION_SOLVER.
*
* Possible wave propagation solvers are:
* F-Wave, <strike>Approximate Augmented Riemann, Hybrid (f-wave + augmented).</strike>
* (details can be found in the corresponding source files)
*/
class SWE_WavePropagationBlockCuda: public SWE_BlockCUDA {
//private:
//! "2D array" which holds the net-updates for the water height (wave propagating to the left).
float* hNetUpdatesLeftD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the right).
float* hNetUpdatesRightD;
//! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the left).
float* huNetUpdatesLeftD;
//! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the right).
float* huNetUpdatesRightD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the top).
float* hNetUpdatesBelowD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the bottom).
float* hNetUpdatesAboveD;
//! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the top).
float* hvNetUpdatesBelowD;
//! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the bottom).
float* hvNetUpdatesAboveD;
public:
// constructor of SWE_WavePropagationBlockCuda
SWE_WavePropagationBlockCuda( const float i_offsetX = 0,
const float i_offsetY = 0 );
// destructor of SWE_WavePropagationBlockCuda
~SWE_WavePropagationBlockCuda();
// compute a single time step (net-updates + update of the cells).
void simulateTimestep( float i_dT );
// TODO: Not implemented
float simulate(float, float) {
assert(false);
return 0;
};
// TODO: not implemented, max time step reduction is done in each call of computeNumericalFluxes(...)
void computeMaxTimestep() {
assert(false);
};
// compute the numerical fluxes (net-update formulation here).
void computeNumericalFluxes();
// compute the new cell values.
void updateUnknowns(const float i_deltaT);
};
#endif /* SWEWAVEPROPAGATIONBLOCKCUDA_HH_ */
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cmath>
#include <cstdio>
#include "solvers/FWaveCuda.h"
/**
* The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
*
 * Remark: In total there are nx+1 edges in x-direction and ny+1 edges in y-direction.
* Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
* will not be computed in a typical call of the function:
* computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
* hNetUpdatesLeftD, hNetUpdatesRightD,
* huNetUpdatesLeftD, huNetUpdatesRightD,
* hNetUpdatesBelowD, hNetUpdatesAboveD,
* hvNetUpdatesBelowD, hvNetUpdatesAboveD,
* l_maximumWaveSpeedsD,
* i_nx, i_ny
* );
 * To reduce the effect of branch divergence, the kernel provides optional offsets, which can be used to compute the missing edges.
*
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
* @param i_hv momentums in y-direction (CUDA-array).
* @param i_b bathymetry values (CUDA-array).
* @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
 * @param i_offsetX cell/edge offset in x-direction.
 * @param i_offsetY cell/edge offset in y-direction.
 * @param i_blockOffSetX block offset in x-direction within the maximum wave speed array (see remarks inside the kernel).
 * @param i_blockOffSetY block offset in y-direction within the maximum wave speed array (see remarks inside the kernel).
*/
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nX, const int i_nY,
const int i_offsetX, const int i_offsetY,
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* Initialization
*/
  //! shared array holding the maximum wave speeds within this CUDA-block
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
//! index (l_cellIndexI,l_cellIndexJ) of the cell lying on the right side of the edge/above the edge where the thread works on.
int l_cellIndexI, l_cellIndexJ;
// initialize the indices l_cellIndexI and l_cellIndexJ with the given offset
l_cellIndexI = i_offsetX;
l_cellIndexJ = i_offsetY;
//! array which holds the thread local net-updates.
float l_netUpdates[5];
//! location of the thread local cells in the global CUDA-arrays.
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
  l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: the first thread works on the cell with index i=1
  l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: the first thread works on the cell with index j=1
/*
* Computation of horizontal net-updates
*/
if(i_offsetY == 0) { //horizontal edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hu[l_leftCellPosition], i_hu[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
// store the horizontal net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[1];
o_huNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[2]; o_huNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = l_netUpdates[4];
}
// synchronize the threads before the vertical edges (optimization)
__syncthreads();
/*
* Computation of vertical net-updates
*/
if(i_offsetX == 0) { //vertical edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hv[l_leftCellPosition], i_hv[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
// store the vertical net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[1];
o_hvNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[2]; o_hvNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax(l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_netUpdates[4]);
}
/*
* Compute the maximum observed wave speed
*/
// wait for all threads to finish
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread
int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
}
#ifndef NDEBUG
else {
printf("computeNetUpdatesKernel(...): There was an error in the reducing procedure!\n");
}
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
);
}
// wait for all threads to finish
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
     * In the 'main' part (e.g. gridDim = (nx/TILE_SIZE, ny/TILE_SIZE)) the position
     * is simply given by the block indices in x- and y-direction with a stride of gridDim.y + 1. The +1 leaves room
     * for the wave speeds of the 'boundary' parts, see below.
*
* In the 'boundary' case, where the edges lie between the computational domain and the right/top ghost layer,
* this is more complicated. In this case block offsets in x- and y-direction are used. The offsets define how
* many blocks in the resp. direction have to be added to get a valid result.
* Computational domain - right ghost layer: In this case the dimension of the grid in x-direction is 1.
* Computational domain - top ghost layer: In this case the dimension of the grid in y-direction is 1.
*
* Same Example as in SWE_WavePropagationBlockCuda::computeNumericalFluxes(), assume the CUDA-grid/-blocks has
* the following layout:
*
* *
* ** top ghost layer,
* * block 8 * block 9 * block 10* ******** cell ids
* ******************************** **
* * * * * *
* * block * block * block * b
* * 4 * 5 * 6 * 7
* * * * *
* ********************************
* * * * *
* * block * block * block * b
* * * 0 * 1 * 2 * 3
* bottom ** * * * *
* ghost ******** ********************************
* layer **
* * * *
* *** ***
* * *
* * *
* left ghost layer right ghost layer
*
     * This results in a 'main' part consisting of (3*2) blocks and two 'boundary' parts consisting
     * of (1*2) blocks and (3*1) blocks.
*
     * The maximum wave speed array on the device therefore logically represents a (4 * 3)-1 2D-array (-1: no block on the top right).
     * With the stride gridDim.y + 1 = 3, the 'main' part writes into positions 0, 1, 3, 4, 6 and 7.
     * The 'computational domain - right ghost layer' part writes into 9 and 10 with block offset in x-direction = 3.
     * The 'computational domain - top ghost layer' part writes into 2, 5 and 8 with block offset in y-direction = 2.
*
*/
int l_maxWaveSpeedDevicePosition = computeOneDPositionKernel( i_blockOffSetX + blockIdx.x,
i_blockOffSetY + blockIdx.y,
max(i_blockOffSetY+1, gridDim.y+1) );
// write the block local maximum wave speed to the device array
o_maximumWaveSpeeds[ l_maxWaveSpeedDevicePosition ] = l_maxWaveSpeedShared[0];
}
}
/**
* The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
*
* @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param io_h water heights (CUDA-array).
* @param io_hu momentums in x-direction (CUDA-array).
* @param io_hv momentums in y-direction (CUDA-array).
* @param i_updateWidthX update width in x-direction.
* @param i_updateWidthY update width in y-direction.
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
*/
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* Initialization
*/
//! cell indices (l_cellIndexI,l_cellIndexJ) of the cell which the thread updates.
int l_cellIndexI, l_cellIndexJ;
//! location of the thread local cell in the global CUDA-arrays.
int l_cellPosition;
// compute the thread local cell indices (start at cell (1,1))
l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
// compute the global cell position
l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
#ifndef NDEBUG
  if(l_cellPosition >= (i_nX+2)*(i_nY+2))
printf("Warning: cellPosition(%i) > (i_nx+2)*(i_ny+2)\n", l_cellPosition);
#endif
//! positions of the net-updates in the global CUDA-arrays.
int l_netUpdatesLeftPosition, l_netUpdatesRightPosition, l_netUpdatesBelowPosition, l_netUpdatesAbovePosition;
/**
* Compute the positions of the net updates relative to a given cell
*
*
* netUpdateRight(i-1, j)
*
* | |
* | |
* | |
* netUpdateBelow(i,j) -----*************-----
* * *
* * cell(i,j) *
* * *
* -----*************------ netUpdateAbove(i, j-1)
* | |
* | |
* | |
* netUpdatesLeft(i,j)
*
*/
l_netUpdatesRightPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
l_netUpdatesLeftPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
l_netUpdatesAbovePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
l_netUpdatesBelowPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
//update the cell values
io_h[l_cellPosition] -= i_updateWidthX * ( i_hNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_hNetUpdatesLeftD[ l_netUpdatesLeftPosition ] )
+ i_updateWidthY * ( i_hNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
io_hu[l_cellPosition] -= i_updateWidthX * ( i_huNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_huNetUpdatesLeftD[ l_netUpdatesLeftPosition ] );
io_hv[l_cellPosition] -= i_updateWidthY * ( i_hvNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hvNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
}
/**
* Compute the position of 2D coordinates in a 1D array.
* array[i][j] -> i * ny + j
*
* @param i_i row index.
* @param i_j column index.
 * @param i_ny width of the array in y-direction (e.g. ny+1 or ny+2, depending on the array).
* @return 1D index.
*/
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
return i_i*i_ny + i_j;
}
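// Worked example (hedged, for illustration): reusing the nx=24, ny=16 example
// above, the cell arrays have ny+2 = 18 entries per i-row, so cell (i,j) = (3,5)
// maps to 3*18 + 5 = 59; the net-update arrays use the stride ny+1 = 17 instead,
// hence the different third argument in the calls above.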
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#ifndef SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
#define SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
// CUDA-kernel which computes the net-updates
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nx, const int i_ny,
const int i_offsetX = 0, const int i_offsetY = 0,
const int i_blockOffSetX = 0, const int i_blockOffSetY = 0
);
// CUDA-kernel which updates the unknowns
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nx, const int i_ny
);
// CUDA device function which computes the 1D position in an array from a given 2D index
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny);
#endif /* SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_ */
@@ -31,7 +31,11 @@
#include <string>
#include "../SWE_Block.hh"
#ifndef CUDA
#include "../SWE_WavePropagationBlock.hh"
#else
#include "../SWE_WavePropagationBlockCuda.hh"
#endif
#include "../scenarios/SWE_simple_scenarios.h"
#include "../tools/Logger.hpp"
@@ -91,7 +95,11 @@ int main( int argc, char** argv ) {
l_originY = l_scenario.getBoundaryPos(BND_BOTTOM);
// create a single wave propagation block
#ifndef CUDA
SWE_WavePropagationBlock l_wavePropgationBlock(l_originX, l_originY);
#else
SWE_WavePropagationBlockCuda l_wavePropgationBlock(l_originX, l_originY);
#endif
// initialize the wave propgation block
l_wavePropgationBlock.initScenario(l_scenario);
@@ -115,7 +123,7 @@ int main( int argc, char** argv ) {
/**
* Simulation.
*/
-  // print a start message and reset the wall clock time
+  // print the start message and reset the wall clock time
s_sweLogger.printStartMessage();
s_sweLogger.initWallClockTime(time(NULL));
@@ -165,7 +173,7 @@ int main( int argc, char** argv ) {
s_sweLogger.printStatisticsMessage();
// print the cpu time
s_sweLogger.printCpuTime("CPU time");
s_sweLogger.printCpuTime("CPU/GPU time");
// print the wall clock time (includes plotting)
s_sweLogger.printWallClockTime(time(NULL));
......
@@ -28,7 +28,7 @@
#ifndef __SWE_SIMPLE_SCENARIOS_H
#define __SWE_SIMPLE_SCENARIOS_H
-#include <math.h>
+#include <cmath>
#include "SWE_Scenario.h"
@@ -56,12 +56,39 @@ class SWE_BathymetryDamBreakScenario : public SWE_Scenario {
public:
float getBathymetry(float x, float y) {
// return ( sqrt( (x-0.3f)*(x-0.3f) + (y-0.8f)*(y-0.8f) ) < 0.1f ) ? 0.1f: 0.0f;
-      return ( sqrt( (x-0.5f)*(x-0.5f) + (y-0.5f)*(y-0.5f) ) < 0.1f ) ? 0.1f: 0.0f;
+      return ( std::sqrt( (x-500.f)*(x-500.f) + (y-500.f)*(y-500.f) ) < 50.f ) ? -250.f: -260.f;
};
-    virtual float endSimulation() { return 0.2f; };
+    virtual float endSimulation() { return (float) 15; };
virtual BoundaryType getBoundaryType(BoundaryEdge edge) { return OUTFLOW; };
/** Get the boundary positions
*
* @param i_edge which edge
* @return value in the corresponding dimension
*/
float getBoundaryPos(BoundaryEdge i_edge) {
if ( i_edge == BND_LEFT )
return (float)0;
else if ( i_edge == BND_RIGHT)
return (float)1000;
else if ( i_edge == BND_BOTTOM )
return (float)0;
else
return (float)1000;
};
/**
* Get the water height at a specific location.
*
* @param i_positionX position relative to the origin of the bathymetry grid in x-direction
* @param i_positionY position relative to the origin of the bathymetry grid in y-direction
* @return water height (before the initial displacement)
*/
float getWaterHeight( float i_positionX,
float i_positionY ) {
return (float) 270;
}
};
/**
......