Commit dc73e40e
Authored Jan 30, 2013 by Sebastian Rettenberger
Overwrite source files with skeleton files
Parent: c880d637
Showing 5 changed files with 11 additions and 757 deletions
src/Readme.md  +0 -0  (moved from src_skeleton/Readme.md)
src/SWE_WavePropagationBlockCuda.cu  +9 -113
src/SWE_WavePropagationBlockCuda_kernels.cu  +2 -241
src_skeleton/SWE_WavePropagationBlockCuda.cu  +0 -270
src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu  +0 -133
src_skeleton/Readme.md → src/Readme.md (file moved)
src/SWE_WavePropagationBlockCuda.cu
...
...
@@ -190,7 +190,6 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
updateUnknowns(maxTimestep);
t += maxTimestep;
// cout << "Simulation at time " << t << endl << flush;
} while(t < tEnd);
return t;
...
...
@@ -203,67 +202,15 @@ float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*
* TODO: A splitting or direct computation of the time step width might increase the total time step size.
* Example:
* dx = 11, dy = 6;
* max wave speed in x-direction: 10
* max wave speed in y-direction: 5.5
* max wave speed in both direction: 10
*
* => maximum time step (current implementation): min(11/10, 6/10) = 0.6
* => maximum time step (splitting the dimensions): min(11/10, 6/5.5) = 1.09..
*/
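As a quick check of the example above, a minimal host-side sketch (editorial, not part of the commit; the values are taken from the comment) comparing the two time-step computations:

#include <algorithm>
#include <cstdio>

int main() {
  float dx = 11.0f, dy = 6.0f;
  float maxSpeedX = 10.0f, maxSpeedY = 5.5f;        // per-direction maxima
  float maxSpeed  = std::max(maxSpeedX, maxSpeedY); // single global maximum
  // current implementation: one wave speed bounds both directions
  float dtCurrent = std::min(dx / maxSpeed, dy / maxSpeed);  // = 0.6
  // dimensional splitting: each direction uses its own maximum
  float dtSplit = std::min(dx / maxSpeedX, dy / maxSpeedY);  // = 1.0909...
  std::printf("current: %f  split: %f\n", dtCurrent, dtSplit);
  return 0;
}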
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
//! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
* This grid covers only edges 0..#(edges in x-direction)-2 and 0..#(edges in y-direction)-2.
*
* An example with a computational domain of size
* nx = 24, ny = 16
* with a 1 cell ghost layer would result in a grid with
* (nx+2)*(ny+2) = (26*18)
* cells and
* (nx+1)*(ny+1) = (25*17)
* edges.
*
* The CUDA-blocks (here 8*8) mentioned above would cover all edges except
* the ones lying between the computational domain and the right/top ghost layer:
* <pre>
*                                                  *
*                                                 **   top ghost layer,
*                                           ********   cell ids
*      *******************************     **          = (*, ny+1)
*      *         *         *         *
*      *   8*8   *   8*8   *   8*8   *
*      *  block  *  block  *  block  *
*      *         *         *         *
*      *******************************
*      *         *         *         *
*      *   8*8   *   8*8   *   8*8   *
*      *  block  *  block  *  block  *
*      *         *         *         *
*   ** *******************************
*  bottom ghost layer, cell ids = (*,0)
*
*      left ghost layer,           right ghost layer,
*      cell ids = (0,*)            cell ids = (nx+1, *)
* </pre>
/*
* TODO: This part needs to be implemented.
*/
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
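To make the grid layout above concrete, a small host-side sketch (not part of the commit) using the example's assumed values nx = 24, ny = 16 and TILE_SIZE = 8:

#include <cassert>
#include <cstdio>

int main() {
  const int TILE_SIZE = 8;
  const int nx = 24, ny = 16;
  assert(nx % TILE_SIZE == 0 && ny % TILE_SIZE == 0);
  int cells = (nx + 2) * (ny + 2); // 26*18 = 468 cells incl. ghost layers
  int edges = (nx + 1) * (ny + 1); // 25*17 = 425 edges per direction
  int gridX = nx / TILE_SIZE;      // 3 blocks in x-direction
  int gridY = ny / TILE_SIZE;      // 2 blocks in y-direction
  // the gridX*gridY 'main' launch leaves the edges next to the right/top
  // ghost layers to the two extra 'boundary' launches
  std::printf("cells=%d edges=%d grid=%dx%d\n", cells, edges, gridX, gridY);
  return 0;
}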
...
...
@@ -276,42 +223,10 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Compute the net updates for the 'main' part and the two 'boundary' parts.
*/
// compute the net-updates for the "main" part.
computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx,ny
);
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
nx, 0,
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
0, ny,
0, dimGrid.y);
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
...
...
@@ -344,28 +259,9 @@ void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
//! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
//! definition of the CUDA-grid.
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// compute the update width.
float l_updateWidthX = i_deltaT / dx;
float l_updateWidthY = i_deltaT / dy;
// update the unknowns (global time step)
updateUnknownsKernel<<<dimGrid,dimBlock>>>( hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
hd, hud, hvd,
l_updateWidthX, l_updateWidthY,
nx, ny);
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
...
...
src/SWE_WavePropagationBlockCuda_kernels.cu
...
...
@@ -81,189 +81,8 @@ void computeNetUpdatesKernel(
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
 * Initialization
 * TODO: This kernel needs to be implemented.
 */
//! array maximum wave speed within this CUDA-block
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
//! index (l_cellIndexI,l_cellIndexJ) of the cell lying on the right side of the edge/above the edge where the thread works on.
int l_cellIndexI, l_cellIndexJ;
// initialize the indices l_cellIndexI and l_cellIndexJ with the given offset
l_cellIndexI = i_offsetX;
l_cellIndexJ = i_offsetY;
//! array which holds the thread local net-updates.
float l_netUpdates[5];
//! location of the thread local cells in the global CUDA-arrays.
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: skip the left ghost layer (start at i = 1)
l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: skip the bottom ghost layer (start at j = 1)
/*
* Computation of horizontal net-updates
*/
if(i_offsetY == 0) { //horizontal edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hu[l_leftCellPosition], i_hu[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
// store the horizontal net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[1];
o_huNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[2]; o_huNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = l_netUpdates[4];
}
// synchronize the threads before the vertical edges (optimization)
__syncthreads();
/*
* Computation of vertical net-updates
*/
if(i_offsetX == 0) { //vertical edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hv[l_leftCellPosition], i_hv[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
// store the vertical net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[1];
o_hvNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[2]; o_hvNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax(l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_netUpdates[4]);
}
/*
* Compute the maximum observed wave speed
*/
// wait for all threads to finish
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread
int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
}
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
else {
printf("computeNetUpdatesKernel(...): There was an error in the reducing procedure!\n");
}
#endif
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
);
}
// wait for all threads to finish
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
* In the 'main' part (e.g. gridDim = (nx/TILE_SIZE, ny/TILE_SIZE)) the position
* is simply given by the blockId in x- and y-direction with a stride of gridDim.y + 1. The +1 results from the
* speeds in the 'boundary' case, see below.
*
* In the 'boundary' case, where the edges lie between the computational domain and the right/top ghost layer,
* this is more complicated. In this case block offsets in x- and y-direction are used. The offsets define how
* many blocks in the resp. direction have to be added to get a valid result.
* Computational domain - right ghost layer: In this case the dimension of the grid in x-direction is 1.
* Computational domain - top ghost layer: In this case the dimension of the grid in y-direction is 1.
*
* Same Example as in SWE_WavePropagationBlockCuda::computeNumericalFluxes(), assume the CUDA-grid/-blocks has
* the following layout:
* <pre>
*                                                   *
*                                                  **   top ghost layer,
*      * block 8 * block 9 * block 10*       ********   cell ids = (*, ny+1)
*      ********************************      **
*      *         *         *         *
*      *  block  *  block  *  block  *  b
*      *    4    *    5    *    6    *  7
*      *         *         *         *
*      ********************************
*      *         *         *         *
*      *  block  *  block  *  block  *  b
*      *    0    *    1    *    2    *  3
*      *         *         *         *
*   ** ********************************
*  bottom ghost layer, cell ids = (*,0)
*
*      left ghost layer                right ghost layer
* </pre>
* This results in a 'main' part consisting of (3*2) blocks and two 'boundary' parts consisting
* of (1*2) blocks and (3*1) blocks.
*
* The maximum wave speed array on the device therefore logically represents a (4*3)-1 2D-array (-1: no block at the top right).
* The 'main' part writes into cells 0, 1, 2, 4, 5 and 6.
* The 'computational domain - right ghost layer' part writes into 3 and 7 with an offset in x-direction of 3.
* The 'computational domain - top ghost layer' part writes into 8, 9 and 10 with an offset in y-direction of 2.
*
*/
int l_maxWaveSpeedDevicePosition = computeOneDPositionKernel( i_blockOffSetX + blockIdx.x,
i_blockOffSetY + blockIdx.y,
max(i_blockOffSetY+1, gridDim.y+1) );
// write the block local maximum wave speed to the device array
o_maximumWaveSpeeds[ l_maxWaveSpeedDevicePosition ] = l_maxWaveSpeedShared[0];
}
}
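To sanity-check the indexing formula above, a small host-side enumeration (a sketch under the example layout, not part of the commit); the slot numbers follow the code's linearization, while the cell ids in the comment above are illustrative:

#include <algorithm>
#include <cstdio>

// host copy of computeOneDPositionKernel
static int pos(int i, int j, int ny) { return i * ny + j; }

// print the device slot each block of one launch writes to; the stride
// mirrors the kernel's max(i_blockOffSetY+1, gridDim.y+1)
static void enumerateLaunch(const char* part, int gridX, int gridY,
                            int offX, int offY) {
  for (int by = 0; by < gridY; by++)
    for (int bx = 0; bx < gridX; bx++)
      std::printf("%s block(%d,%d) -> slot %d\n", part, bx, by,
                  pos(offX + bx, offY + by, std::max(offY + 1, gridY + 1)));
}

int main() {
  enumerateLaunch("main ", 3, 2, 0, 0);  // slots 0,1,3,4,6,7
  enumerateLaunch("right", 1, 2, 3, 0);  // slots 9,10
  enumerateLaunch("top  ", 3, 1, 0, 2);  // slots 2,5,8
  // together: all (4*3)-1 = 11 slots, each written exactly once
  return 0;
}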
/**
...
...
@@ -295,66 +114,8 @@ void updateUnknownsKernel(
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
 * Initialization
 * TODO: This kernel needs to be implemented.
 */
//! cell indices (l_cellIndexI,l_cellIndexJ) of the cell which the thread updates.
int l_cellIndexI, l_cellIndexJ;
//! location of the thread local cell in the global CUDA-arrays.
int l_cellPosition;
// compute the thread local cell indices (start at cell (1,1))
l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
// compute the global cell position
l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
#ifndef NDEBUG
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#warning skipping printf command, which was introduced for compute capability >= 2.0
#else
if(l_cellPosition > (i_nX+2)*(i_nY+2))
printf("Warning: cellPosition(%i) > (i_nx+2)*(i_ny+2)\n", l_cellPosition);
#endif
#endif
//! positions of the net-updates in the global CUDA-arrays.
int l_netUpdatesLeftPosition, l_netUpdatesRightPosition, l_netUpdatesBelowPosition, l_netUpdatesAbovePosition;
/**
* Compute the positions of the net updates relative to a given cell
*
* <pre>
*                          netUpdateRight(i-1, j)
*
*                          |           |
*                          |           |
*                          |           |
*  netUpdateBelow(i,j) -----*************-----
*                          *           *
*                          * cell(i,j) *
*                          *           *
*                     -----*************----- netUpdateAbove(i, j-1)
*                          |           |
*                          |           |
*                          |           |
*                          netUpdatesLeft(i,j)
* </pre>
*/
l_netUpdatesRightPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
l_netUpdatesLeftPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
l_netUpdatesAbovePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
l_netUpdatesBelowPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
//update the cell values
io_h[l_cellPosition] -= i_updateWidthX * ( i_hNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_hNetUpdatesLeftD[ l_netUpdatesLeftPosition ] )
+ i_updateWidthY * ( i_hNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
io_hu[l_cellPosition] -= i_updateWidthX * ( i_huNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_huNetUpdatesLeftD[ l_netUpdatesLeftPosition ] );
io_hv[l_cellPosition] -= i_updateWidthY * ( i_hvNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hvNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
}
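For reference, the cell update implemented above corresponds to the usual wave-propagation form (a sketch in LeVeque-style fluctuation notation, not text from the source), with i_updateWidthX = \Delta t/\Delta x and i_updateWidthY = \Delta t/\Delta y:

h_{i,j} \leftarrow h_{i,j} - \frac{\Delta t}{\Delta x}\left(A^{+}\Delta h_{i-1/2,j} + A^{-}\Delta h_{i+1/2,j}\right) - \frac{\Delta t}{\Delta y}\left(B^{+}\Delta h_{i,j-1/2} + B^{-}\Delta h_{i,j+1/2}\right)
(hu)_{i,j} \leftarrow (hu)_{i,j} - \frac{\Delta t}{\Delta x}\left(A^{+}\Delta(hu)_{i-1/2,j} + A^{-}\Delta(hu)_{i+1/2,j}\right)
(hv)_{i,j} \leftarrow (hv)_{i,j} - \frac{\Delta t}{\Delta y}\left(B^{+}\Delta(hv)_{i,j-1/2} + B^{-}\Delta(hv)_{i,j+1/2}\right)

Here the right-going fluctuation A^{+}\Delta at edge i-1/2 is i_hNetUpdatesRightD at edge index (i-1,j), and the left-going A^{-}\Delta at edge i+1/2 is i_hNetUpdatesLeftD at edge index (i,j), matching the position computations above.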
/**
...
...
src_skeleton/SWE_WavePropagationBlockCuda.cu deleted (file mode 100644 → 0)
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#include "SWE_WavePropagationBlockCuda.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cassert>
#ifndef STATICLOGGER
#define STATICLOGGER
#include "tools/Logger.hpp"
static tools::Logger s_sweLogger;
#endif
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
#include <thrust/device_vector.h>
/**
* Constructor of a SWE_WavePropagationBlockCuda.
*
* Allocates the variables for the simulation:
* Please note: the definition of indices differs from the CPU implementation.
*
* unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
*
* A left/right net update with index (i-1,j) is located on the edge between
* cells with index (i-1,j) and (i,j):
* <pre>
*   *********************
*   *         *         *
*   * (i-1,j) *  (i,j)  *
*   *         *         *
*   *********************
*
*             *
*            ***
*           *****
*             *
*             *
*   NetUpdatesLeft(i-1,j)
*             or
*   NetUpdatesRight(i-1,j)
* </pre>
*
* A below/above net update with index (i, j-1) is located on the edge between
* cells with index (i, j-1) and (i,j):
* <pre>
*   ***********
*   *         *
*   * (i, j)  *    *
*   *         *   **  NetUpdatesBelow(i,j-1)
*   *********** *****      or
*   *         *   **  NetUpdatesAbove(i,j-1)
*   * (i,j-1) *    *
*   *         *
*   ***********
* </pre>
* @param i_offsetX spatial offset of the block in x-direction.
* @param i_offsetY spatial offset of the block in y-direction.
* @param i_cudaDevice ID of the CUDA-device, which should be used.
*/
SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
const float i_offsetY,
const int i_cudaDevice ): SWE_BlockCUDA(i_offsetX, i_offsetY, i_cudaDevice) {
// compute the size of one 1D net-update array.
int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the net-updates
cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesLeftD");
cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesRightD");
cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesLeftD");
cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesRightD");
cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesBelowD");
cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesAboveD");
}
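A rough sizing sketch (assumed example values, not from the source) for the eight device allocations above:

#include <cstdio>

int main() {
  int nx = 24, ny = 16;                                          // assumed domain size
  size_t perArray = (size_t)(nx + 1) * (ny + 1) * sizeof(float); // 425 floats = 1700 bytes
  std::printf("8 arrays * %zu bytes = %zu bytes\n", perArray, 8 * perArray); // 13600 bytes
  return 0;
}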
/**
* Destructor of a SWE_WavePropagationBlockCuda.
*
* Frees all of the memory, which was allocated within the constructor.
* Resets the CUDA device: useful if an error occurred and printf is used on the device (buffer).
*/
SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
// free the net-updates memory
cudaFree(hNetUpdatesLeftD);
cudaFree(hNetUpdatesRightD);
cudaFree(huNetUpdatesLeftD);
cudaFree(huNetUpdatesRightD);
cudaFree(hNetUpdatesBelowD);
cudaFree(hNetUpdatesAboveD);
cudaFree(hvNetUpdatesBelowD);
cudaFree(hvNetUpdatesAboveD);
// reset the cuda device
s_sweLogger.printString("Resetting the CUDA devices");
cudaDeviceReset();
}
/**
* Compute a single global time step of a given time step width.
* Remark: The user has to take care of the time step width; no additional check is done. The time step width is typically available
* after the computation of the numerical fluxes (hidden in this method).
*
* First the net-updates are computed.
* Then the cells are updated with the net-updates and the given time step width.
*
* @param i_dT time step width in seconds.
*/
__host__
void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(i_dT);
}
/**
* Perform forward-Euler time steps, starting at simulation time tStart,
* until simulation time tEnd is reached;
* device-global variables hd, hud, hvd are updated;
* unknowns h, hu, hv in main memory are not updated.
* Ghost layers and bathymetry sources are updated between time steps.
* Intended as the main simulation loop between two checkpoints.
*/
__host__
float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
float t = tStart;
do {
// set values in ghost cells:
setGhostLayer();
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(maxTimestep);
t += maxTimestep;
} while(t < tEnd);
return t;
}
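Since simulate() is meant as the main loop between two checkpoints, a hypothetical driver (the checkpoint interval and the output handling are assumptions, not part of this file) could look like:

#include <algorithm>

void runSimulation(SWE_WavePropagationBlockCuda& block, float tEnd, float checkpointInterval) {
  float t = 0.0f;
  while (t < tEnd) {
    // advance to the next checkpoint; simulate() sets the ghost layers itself
    t = block.simulate(t, std::min(t + checkpointInterval, tEnd));
    // a real driver would copy h, hu, hv back to the host and write output here
  }
}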
/**
* Compute the numerical fluxes (net-update formulation here) on all edges.
*
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
/*
* TODO: This part needs to be implemented.
*/
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
// size of the maximum wave speed array (dimension of the grid + ghost layers, without the top right block), sizeof(float) not included
int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
/*
* Compute the net updates for the 'main' part and the two 'boundary' parts.
*/
/*
* TODO: This part needs to be implemented.
*/
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
*
* The Thrust library is used in this step.
* An optional kernel could be written for the maximum reduction.
*/
// Thrust pointer to the device array
thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
// use Thrusts max_element-function for the maximum reduction
thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
// get the result from the device
float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
// free the max wave speeds array on the device
cudaFree(l_maximumWaveSpeedsD);
// set the maximum time step for this SWE_WavePropagationBlockCuda
maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
// scale by the CFL number (0.4)
maxTimestep *= (float)0.4;
}
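Written out (a restatement of the two lines above, not additional source text):

\Delta t = 0.4 \cdot \min\!\left(\frac{\Delta x}{s_{\max}},\, \frac{\Delta y}{s_{\max}}\right) = 0.4\,\frac{\min(\Delta x, \Delta y)}{s_{\max}}

where s_{\max} is the Thrust-reduced maximum wave speed.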
/**
* Update the cells with a given global time step.
*
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
/*
* TODO: This part needs to be implemented.
*/
// synchronize the copy layer for MPI communication
#ifdef USEMPI
synchCopyLayerBeforeRead();
#endif
}
src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu deleted (file mode 100644 → 0)
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cmath>
#include <cstdio>
#include "solvers/FWaveCuda.h"
/**
* The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
*
* Remark: Overall there are nx+1 edges in x-direction and ny+1 edges in y-direction.
* Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
* will not be computed in a typical call of the function:
* computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
* hNetUpdatesLeftD, hNetUpdatesRightD,
* huNetUpdatesLeftD, huNetUpdatesRightD,
* hNetUpdatesBelowD, hNetUpdatesAboveD,
* hvNetUpdatesBelowD, hvNetUpdatesAboveD,
* l_maximumWaveSpeedsD,
* i_nx, i_ny
* );
* To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
*
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
* @param i_hv momentums in y-direction (CUDA-array).
* @param i_b bathymetry values (CUDA-array).
* @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
* @param i_offsetX cell/edge offset in x-direction.
* @param i_offsetY cell/edge offset in y-direction.
*/
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nX, const int i_nY,
const int i_offsetX, const int i_offsetY,
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
*
* @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param io_h water heights (CUDA-array).
* @param io_hu momentums in x-direction (CUDA-array).
* @param io_hv momentums in y-direction (CUDA-array).
* @param i_updateWidthX update width in x-direction.
* @param i_updateWidthY update width in y-direction.
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
*/
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* TODO: This kernel needs to be implemented.
*/
}
/**
* Compute the position of 2D coordinates in a 1D array.
* array[i][j] -> i * ny + j
*
* @param i_i row index.
* @param i_j column index.
* @param i_ny #(cells in y-direction).
* @return 1D index.
*/
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
return i_i*i_ny + i_j;
}
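One detail worth keeping in mind when reading the kernels above: this helper is called with two different strides, because the cell arrays and the net-update arrays have different row lengths. A small host-side sketch (the domain size is an assumption):

#include <cstdio>

static int pos(int i, int j, int ny) { return i * ny + j; } // host copy of the helper

int main() {
  const int ny = 16;                                    // assumed; nx = 24
  // cell (1,1) in hd, which stores (nx+2)*(ny+2) values:
  std::printf("cell (1,1) -> %d\n", pos(1, 1, ny + 2)); // stride ny+2 -> 19
  // edge (1,1) in a net-update array of (nx+1)*(ny+1) values:
  std::printf("edge (1,1) -> %d\n", pos(1, 1, ny + 1)); // stride ny+1 -> 18
  return 0;
}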