Overwrite source files with skeleton files

parent c880d637
......@@ -190,7 +190,6 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
updateUnknowns(maxTimestep);
t += maxTimestep;
// cout << "Simulation at time " << t << endl << flush;
} while(t < tEnd);
return t;
......@@ -203,67 +202,15 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*
* TODO: A splitting or direct computation of the time step width might increase the total time step size.
* Example:
* dx = 11, dy = 6;
* max wave speed in x-direction: 10
* max wave speed in y-direction: 5.5
* max wave speed in both direction: 10
*
* => maximum time step (current implementation): min(11/10, 6/10) = 0.6
* => maximum time step (splitting the dimensions): min(11/10, 6/5.5) = 1.09..
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
//! definition of one CUDA-block. Typical size are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
* This grid covers only edges 0..#(edges in x-direction)-2 and 0..#(edges in y-direction)-2.
*
* An example with a computational domain of size
* nx = 24, ny = 16
* with a 1 cell ghost layer would result in a grid with
* (nx+2)*(ny+2) = (26*18)
* cells and
* (nx+1)*(ny+1) = (25*17)
* edges.
*
* The CUDA-blocks (here 8*8) mentioned above would cover all edges except
* the ones lying between the computational domain and the right/top ghost layer:
* <pre>
* *
* ** top ghost layer,
* ******** cell ids
* ******************************* ** = (*, ny+1)
* * * * * *
* * 8*8 * 8*8 * 8*8 *
* * block * block * block *
* * * * *
* *******************************
* * * * *
* * 8*8 * 8*8 * 8*8 *
* * * block * block * block *
* bottom ** * * * *
* ghost ******** *******************************
* layer, **
* cell ids * * *
* =(*,0) *** ***
* * *
* * *
* left ghost layer, right ghost layer,
* cell ids = (0,*) cell ids = (nx+1, *)
* </pre>
/*
* TODO: This part needs to be implemented.
*/
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
......@@ -276,42 +223,10 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Compute the net updates for the 'main part and the two 'boundary' parts.
*/
// compute the net-updates for the "main" part.
computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx,ny
);
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
nx, 0,
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
0, ny,
0, dimGrid.y);
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
......@@ -344,28 +259,9 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
//! definition of one CUDA-block. Typical size are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
//! definition of the CUDA-grid.
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// compute the update width.
float l_updateWidthX = i_deltaT / dx;
float l_updateWidthY = i_deltaT / dy;
// update the unknowns (global time step)
updateUnknownsKernel<<<dimGrid,dimBlock>>>( hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
hd, hud, hvd,
l_updateWidthX, l_updateWidthY,
nx, ny);
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
......
......@@ -81,189 +81,8 @@ void computeNetUpdatesKernel(
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* Initialization
* TODO: This kernel needs to be implemented.
*/
//! array maximum wave speed within this CUDA-block
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
//! index (l_cellIndexI,l_cellIndexJ) of the cell lying on the right side of the edge/above the edge where the thread works on.
int l_cellIndexI, l_cellIndexJ;
// initialize the indices l_cellIndexI and l_cellIndexJ with the given offset
l_cellIndexI = i_offsetX;
l_cellIndexJ = i_offsetY;
//! array which holds the thread local net-updates.
float l_netUpdates[5];
//! location of the thread local cells in the global CUDA-arrays.
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: start at cell with index (1,0)
l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: start at cell with index (1,1)
/*
* Computation of horizontal net-updates
*/
if(i_offsetY == 0) { //horizontal edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hu[l_leftCellPosition], i_hu[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
// store the horizontal net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[1];
o_huNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[2]; o_huNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = l_netUpdates[4];
}
// synchronize the threads before the vertical edges (optimization)
__syncthreads();
/*
* Computation of vertical net-updates
*/
if(i_offsetX == 0) { //vertical edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hv[l_leftCellPosition], i_hv[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
// store the vertical net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[1];
o_hvNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[2]; o_hvNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax(l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_netUpdates[4]);
}
/*
* Compute the maximum observed wave speed
*/
// wait for all threads to finish
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread
int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
}
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
else {
printf("computeNetUpdatesKernel(...): There was an error in the reducing procedure!\n");
}
#endif
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
);
}
// wait for all threads to finish
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
* In the 'main' part (e.g. gridDim = nx/TILE_SIZEm ny/TILE_SIZE) the position
* is simply given by the blockId in x- and y-direction with a stride of gridDim.x + 1. The +1 results from the
* speeds in the 'boundary' case, see below.
*
* In the 'boundary' case, where the edges lie between the computational domain and the right/top ghost layer,
* this is more complicated. In this case block offsets in x- and y-direction are used. The offsets define how
* many blocks in the resp. direction have to be added to get a valid result.
* Computational domain - right ghost layer: In this case the dimension of the grid in x-direction is 1.
* Computational domain - top ghost layer: In this case the dimension of the grid in y-direction is 1.
*
* Same Example as in SWE_WavePropagationBlockCuda::computeNumericalFluxes(), assume the CUDA-grid/-blocks has
* the following layout:
* <pre>
* *
* ** top ghost layer,
* * block 8 * block 9 * block 10* ******** cell ids
* ******************************** **
* * * * * *
* * block * block * block * b
* * 4 * 5 * 6 * 7
* * * * *
* ********************************
* * * * *
* * block * block * block * b
* * * 0 * 1 * 2 * 3
* bottom ** * * * *
* ghost ******** ********************************
* layer **
* * * *
* *** ***
* * *
* * *
* left ghost layer right ghost layer
* </pre>
* This results in a 'main' part containing of (3*2) blocks and two 'boundary' parts containing
* of (1*2) blocks and (3*1) blocks.
*
* The maximum wave speed array on the device represents therefore logically a (4 * 3)-1 2D-array (-1: no block on the top right).
* The 'main' part writes into cells 0, 1, 2, 4, 5 and 6.
* The 'computational domain - right ghost layer' part writes into 3 and 7 with offset in x-direction = 3
* The 'computational domain - top ghost layer' part writes into 8, 9, 10 with offset in y-direction = 2
*
*/
int l_maxWaveSpeedDevicePosition = computeOneDPositionKernel( i_blockOffSetX + blockIdx.x,
i_blockOffSetY + blockIdx.y,
max(i_blockOffSetY+1, gridDim.y+1) );
// write the block local maximum wave speed to the device array
o_maximumWaveSpeeds[ l_maxWaveSpeedDevicePosition ] = l_maxWaveSpeedShared[0];
}
}
/**
......@@ -295,66 +114,8 @@ void updateUnknownsKernel(
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* Initialization
* TODO: This kernel needs to be implemented.
*/
//! cell indices (l_cellIndexI,l_cellIndexJ) of the cell which the thread updates.
int l_cellIndexI, l_cellIndexJ;
//! location of the thread local cell in the global CUDA-arrays.
int l_cellPosition;
// compute the thread local cell indices (start at cell (1,1))
l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
// compute the global cell position
l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) & (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
if(l_cellPosition > (i_nX+2)*(i_nY+2))
printf("Warning: cellPosition(%i) > (i_nx+2)*(i_ny+2)\n", l_cellPosition);
#endif
#endif
//! positions of the net-updates in the global CUDA-arrays.
int l_netUpdatesLeftPosition, l_netUpdatesRightPosition, l_netUpdatesBelowPosition, l_netUpdatesAbovePosition;
/**
* Compute the positions of the net updates relative to a given cell
*
* <pre>
* netUpdateRight(i-1, j)
*
* | |
* | |
* | |
* netUpdateBelow(i,j) -----*************-----
* * *
* * cell(i,j) *
* * *
* -----*************------ netUpdateAbove(i, j-1)
* | |
* | |
* | |
* netUpdatesLeft(i,j)
* </pre>
*/
l_netUpdatesRightPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
l_netUpdatesLeftPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
l_netUpdatesAbovePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
l_netUpdatesBelowPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
//update the cell values
io_h[l_cellPosition] -= i_updateWidthX * ( i_hNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_hNetUpdatesLeftD[ l_netUpdatesLeftPosition ] )
+ i_updateWidthY * ( i_hNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
io_hu[l_cellPosition] -= i_updateWidthX * ( i_huNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_huNetUpdatesLeftD[ l_netUpdatesLeftPosition ] );
io_hv[l_cellPosition] -= i_updateWidthY * ( i_hvNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hvNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
}
/**
......
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#include "SWE_WavePropagationBlockCuda.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cassert>
#ifndef STATICLOGGER
#define STATICLOGGER
#include "tools/Logger.hpp"
static tools::Logger s_sweLogger;
#endif
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
#include <thrust/device_vector.h>
/**
* Constructor of a SWE_WavePropagationBlockCuda.
*
* Allocates the variables for the simulation:
* Please note: The definition of indices changed in contrast to the CPU-Implementation.
*
* unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
*
* A left/right net update with index (i-1,j) is located on the edge between
* cells with index (i-1,j) and (i,j):
* <pre>
* *********************
* * * *
* * (i-1,j) * (i,j) *
* * * *
* *********************
*
* *
* ***
* *****
* *
* *
* NetUpdatesLeft(i-1,j)
* or
* NetUpdatesRight(i-1,j)
* </pre>
*
* A below/above net update with index (i, j-1) is located on the edge between
* cells with index (i, j-1) and (i,j):
* <pre>
* ***********
* * *
* * (i, j) * *
* * * ** NetUpdatesBelow(i,j-1)
* *********** ***** or
* * * ** NetUpdatesAbove(i,j-1)
* * (i,j-1) * *
* * *
* ***********
* </pre>
* @param i_offsetX spatial offset of the block in x-direction.
* @param i_offsetY spatial offset of the offset in y-direction.
* @param i_cudaDevice ID of the CUDA-device, which should be used.
*/
SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
const float i_offsetY,
const int i_cudaDevice ): SWE_BlockCUDA(i_offsetX, i_offsetY, i_cudaDevice) {
// compute the size of one 1D net-update array.
int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the net-updates
cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesLeftD");
cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesRightD");
cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesLeftD");
cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesRightD");
cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesBelowD");
cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
}
/**
* Destructor of a SWE_WavePropagationBlockCuda.
*
* Frees all of the memory, which was allocated within the constructor.
* Resets the CUDA device: Useful if error occured and printf is used on the device (buffer).
*/
SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
// free the net-updates memory
cudaFree(hNetUpdatesLeftD);
cudaFree(hNetUpdatesRightD);
cudaFree(huNetUpdatesLeftD);
cudaFree(huNetUpdatesRightD);
cudaFree(hNetUpdatesBelowD);
cudaFree(hNetUpdatesAboveD);
cudaFree(hvNetUpdatesBelowD);
cudaFree(hvNetUpdatesAboveD);
// reset the cuda device
s_sweLogger.printString("Resetting the CUDA devices");
cudaDeviceReset();
}
/**
* Compute a single global time step of a given time step width.
* Remark: The user has to take care about the time step width. No additional check is done. The time step width typically available
* after the computation of the numerical fluxes (hidden in this method).
*
* First the net-updates are computed.
* Then the cells are updated with the net-updates and the given time step width.
*
* @param i_dT time step width in seconds.
*/
__host__
void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(i_dT);
}
/**
* perform forward-Euler time steps, starting with simulation time tStart,:
* until simulation time tEnd is reached;
* device-global variables hd, hud, hvd are updated;
* unknowns h, hu, hv in main memory are not updated.
* Ghost layers and bathymetry sources are updated between timesteps.
* intended as main simulation loop between two checkpoints
*/
__host__
float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
float t = tStart;
do {
// set values in ghost cells:
setGhostLayer();
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(maxTimestep);
t += maxTimestep;
} while(t < tEnd);
return t;
}
/**
* Compute the numerical fluxes (net-update formulation here) on all edges.
*
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
/*
* TODO: This part needs to be implemented.
*/
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
// size of the maximum wave speed array (dimension of the grid + ghost layers, without the top right block), sizeof(float) not included
int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
/*
* Compute the net updates for the 'main part and the two 'boundary' parts.
*/
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
*
* The Thrust library is used in this step.
* An optional kernel could be written for the maximum reduction.
*/
// Thrust pointer to the device array
thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
// use Thrusts max_element-function for the maximum reduction
thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
// get the result from the device
float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
// free the max wave speeds array on the device
cudaFree(l_maximumWaveSpeedsD);
// set the maximum time step for this SWE_WavePropagationBlockCuda
maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
// CFL = 0.5
maxTimestep *= (float)0.4;
}
/**
* Update the cells with a given global time step.
*
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
synchCopyLayerBeforeRead();
#endif
}
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cmath>
#include <cstdio>
#include "solvers/FWaveCuda.h"
/**
* The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
*
* Remark: In overall we have nx+1 / ny+1 edges.
* Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
* will not be computed in a typical call of the function:
* computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
* hNetUpdatesLeftD, hNetUpdatesRightD,
* huNetUpdatesLeftD, huNetUpdatesRightD,
* hNetUpdatesBelowD, hNetUpdatesAboveD,
* hvNetUpdatesBelowD, hvNetUpdatesAboveD,
* l_maximumWaveSpeedsD,
* i_nx, i_ny
* );
* To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
*
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
* @param i_hv momentums in y-direction (CUDA-array).
* @param i_b bathymetry values (CUDA-array).
* @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
* @param i_offsetX cell/edge offset in x-direction.
* @param i_offsetY cell/edge offset in y-direction.
*/
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nX, const int i_nY,
const int i_offsetX, const int i_offsetY,
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
*
* @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param io_h water heights (CUDA-array).
* @param io_hu momentums in x-direction (CUDA-array).
* @param io_hv momentums in y-direction (CUDA-array).
* @param i_updateWidthX update width in x-direction.
* @param i_updateWidthY update width in y-direction.
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
*/
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* Compute the position of 2D coordinates in a 1D array.
* array[i][j] -> i * ny + j
*
* @param i_i row index.
* @param i_j column index.
* @param i_ny #(cells in y-direction).
* @return 1D index.
*/
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
return i_i*i_ny + i_j;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment