Commit 92ed477e authored by breuera

Added the CUDA-files. Changed a few routines accordingly.

parent f935a3b2
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include <math.h>
#include "tools/help.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_BlockCUDA_kernels.hh"
//const int TILE_SIZE=16;
//const int TILE_SIZE=8;
// #include "SWE_BlockCUDA_kernels.cu"
/**
 * helper function to check the latest CUDA error code
 * and abort if an error occurred
 */
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "\nCuda error (%s): %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
/**
 * helper function to check a given CUDA error code
 * and abort if it indicates an error
 */
void tryCUDA(cudaError_t err, const char *msg)
{
if( cudaSuccess != err)
{
fprintf(stderr, "\nCuda error (%s): %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
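// usage sketch for the two helpers above (illustrative only):
//   cudaMalloc((void**)&hd, size);
//   checkCUDAError("allocate device memory for h"); // checks cudaGetLastError()
// or, equivalently, checking the returned error code directly:
//   tryCUDA( cudaMalloc((void**)&hd, size), "allocate device memory for h" );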
/**
* Constructor: allocate variables for simulation
*
* unknowns h,hu,hv,b are defined on grid indices [0,..,nx+1]*[0,..,ny+1]
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* flux terms are defined for edges with indices [0,..,nx]*[1,..,ny]
* or [1,..,nx]*[0,..,ny] (for horizontal/vertical edges)
* Flux term with index (i,j) is located on the edge between
* cells with index (i,j) and (i+1,j) or (i,j+1)
*
* bathymetry source terms are defined for cells with indices [1,..,nx]*[1,..,ny]
*/
SWE_BlockCUDA::SWE_BlockCUDA(float _offsetX, float _offsetY)
: SWE_Block(_offsetX,_offsetY)
{
if (nx % TILE_SIZE != 0) {
cout << "WARNING: nx not a multiple of TILE_SIZE -> will lead to crashes!"
<< endl << flush;
};
if (ny % TILE_SIZE != 0) {
cout << "WARNING: ny not a multiple of TILE_SIZE -> will lead to crashes!"
<< endl << flush;
};
int size = (nx+2)*(ny+2)*sizeof(float);
// allocate CUDA memory for unknowns h,hu,hv and bathymetry b
cudaMalloc((void**)&hd, size);
checkCUDAError("allocate device memory for h");
cudaMalloc((void**)&hud, size);
checkCUDAError("allocate device memory for hu");
cudaMalloc((void**)&hvd, size);
checkCUDAError("allocate device memory for hv");
cudaMalloc((void**)&bd, size);
checkCUDAError("allocate device memory for bd");
//----> COULD BE IMPLEMENTED AS (INEFFICIENT) REFERENCE IMPLEMENTATION
// // allocate CUDA unknowns: maximum height and velocity
// size = (nx/TILE_SIZE)*(ny/TILE_SIZE)*sizeof(float);
// cudaMalloc((void**)&maxhd, size);
// checkCUDAError("allocate device memory for maxhd");
// cudaMalloc((void**)&maxvd, size);
// checkCUDAError("allocate device memory for maxvd");
// allocate consecutive memory for 2 rows with three unknowns each
// (h, hu, hv, excluding b) for copy/ghost layer at bottom/top boundary
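// memory layout of bottomLayer/topLayer (each segment holds nx+2 floats):
// [ h(ghost) | hu(ghost) | hv(ghost) | h(copy) | hu(copy) | hv(copy) ]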
size = nx+2;
bottomLayer = new float[6*size];
bottomGhostLayer = new SWE_Block1D( bottomLayer, bottomLayer+size, bottomLayer+(2*size), size );
bottomCopyLayer = new SWE_Block1D( bottomLayer+(3*size), bottomLayer+(4*size), bottomLayer+(5*size), size );
// same for top boundary:
topLayer = new float[6*size];
topGhostLayer = new SWE_Block1D( topLayer, topLayer+size, topLayer+(2*size), size );
topCopyLayer = new SWE_Block1D( topLayer+(3*size), topLayer+(4*size), topLayer+(5*size), size );
// allocate resp. memory on the CUDA device
cudaMalloc((void**)&bottomLayerDevice, 6*size*sizeof(float));
checkCUDAError("allocate device memory for bottom copy/ghost layer");
cudaMalloc((void**)&topLayerDevice, 6*size*sizeof(float));
checkCUDAError("allocate device memory for top copy/ghost layer");
}
/**
* Destructor: de-allocate all variables
*/
SWE_BlockCUDA::~SWE_BlockCUDA() {
cudaFree(hd); cudaFree(hud); cudaFree(hvd); cudaFree(bd);
// cudaFree(maxhd); cudaFree(maxvd);
// free memory for copy/ghost layers
// (bottomLayer/topLayer were allocated with new[])
delete[] bottomLayer;
delete[] topLayer;
delete bottomGhostLayer; delete bottomCopyLayer;
delete topGhostLayer; delete topCopyLayer;
cudaFree(bottomLayerDevice);
cudaFree(topLayerDevice);
}
//==================================================================
// methods for simulation
//==================================================================
/**
* set the values of all ghost cells depending on the specified
* boundary conditions
*/
void SWE_BlockCUDA::setBoundaryConditions() {
#ifdef DBG
cout << "Call kernel to compute h in ghost layer corner (for visualisation only) "
<< flush << endl;
#endif
// Fill ghost layer corner cells
kernelHdBufferEdges<<<1,1>>>(hd, nx, ny);
#ifdef DBG
cout << "Call kernel to compute left/right boundaries " << flush << endl;
#endif
// synchWaterHeightAfterWrite();
// synchDischargeAfterWrite();
if (boundary[BND_LEFT] == PASSIVE || boundary[BND_LEFT] == CONNECT) {
// nothing to be done:
// ghost values are copied by SWE_BlockCUDA::synchGhostLayerAfterWrite(...)
}
else {
dim3 dimBlock(1,TILE_SIZE);
dim3 dimGrid(1,ny/TILE_SIZE);
kernelLeftBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_LEFT]);
};
if (boundary[BND_RIGHT] == PASSIVE || boundary[BND_RIGHT] == CONNECT) {
// nothing to be done:
// ghost values are copied by SWE_BlockCUDA::synchGhostLayerAfterWrite(...)
}
else {
dim3 dimBlock(1,TILE_SIZE);
dim3 dimGrid(1,ny/TILE_SIZE);
kernelRightBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_RIGHT]);
};
#ifdef DBG
cout << "Call kernel to compute bottom/top boundaries " << flush << endl;
#endif
switch (boundary[BND_BOTTOM]) {
case CONNECT:
{
// set ghost layer data in auxiliary data structure for ghost layer:
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = neighbour[BND_BOTTOM]->h[i];
bottomGhostLayer->hu[i] = neighbour[BND_BOTTOM]->hu[i];
bottomGhostLayer->hv[i] = neighbour[BND_BOTTOM]->hv[i];
};
}
case PASSIVE: /* also executed for case CONNECT */
{
// copy ghost layer data from buffer bottomLayerDevice
// into bottom ghost layer of unknowns
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomGhostBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,bottomLayerDevice,nx,ny);
break;
}
default:
{
// set simple boundary conditions (OUTFLOW, WALL) by resp. kernel:
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_BOTTOM]);
}
};
switch (boundary[BND_TOP]) {
case CONNECT:
{
// set ghost layer data in auxiliary data structure for ghost layer:
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = neighbour[BND_TOP]->h[i];
topGhostLayer->hu[i] = neighbour[BND_TOP]->hu[i];
topGhostLayer->hv[i] = neighbour[BND_TOP]->hv[i];
};
}
case PASSIVE: /* also executed for case CONNECT */
{
// copy ghost layer data from buffer topLayerDevice
// into top ghost layer of unknowns
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopGhostBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,topLayerDevice,nx,ny);
break;
}
default:
{
// set simple boundary conditions (OUTFLOW, WALL) by resp. kernel:
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopBoundary<<<dimGrid,dimBlock>>>(
hd,hud,hvd,nx,ny,boundary[BND_TOP]);
}
};
}
/**
* synchronise the ghost layer content of h, hu, and hv in main memory
* with device memory and auxiliary data structures, i.e. transfer
* memory from main/auxiliary memory into device memory
*/
void SWE_BlockCUDA::synchGhostLayerAfterWrite() {
if (boundary[BND_LEFT] == PASSIVE || boundary[BND_LEFT] == CONNECT) {
#ifdef DBG
cout << "Set left passive boundary" << endl;
#endif
// transfer h, hu, and hv from left ghost layer to resp. device memory
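// note: h, hu, and hv are stored column-wise (cf. getCellCoord), so h[0],
// the left ghost column of ny+2 floats, is contiguous in memory and can be
// transferred with a single cudaMemcpy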
cudaMemcpy(hd, h[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
cudaMemcpy(hud, hu[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
cudaMemcpy(hvd, hv[0], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("left ghost layer not transferred to device");
};
if (boundary[BND_RIGHT] == PASSIVE || boundary[BND_RIGHT] == CONNECT) {
#ifdef DBG
cout << "Set right passive boundary" << endl;
#endif
// transfer h, hu, and hv from right ghost layer to resp. device memory
cudaMemcpy(hd+((nx+1)*(ny+2)), h[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
cudaMemcpy(hud+((nx+1)*(ny+2)), hu[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
cudaMemcpy(hvd+((nx+1)*(ny+2)), hv[nx+1], (ny+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("right ghost layer not transferred to device");
};
// bottom and top boundary ghost layers are replicated (for efficiency reasons)
// in the memory regions starting from bottomLayer and topLayer
// -> these need to be transferred to device memory
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
#ifdef DBG
cout << "Set bottom passive boundary" << endl;
cout << " * transfer " << 3*(nx+2)*sizeof(float) << " bytes." << endl;
#endif
// transfer bottom ghost layer
// (3 arrays - h, hu, hv - of nx+2 floats, consecutive in memory)
cudaMemcpy(bottomLayerDevice, bottomLayer, 3*(nx+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("bottom ghost layer not transferred to device");
};
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
#ifdef DBG
cout << "Set top passive boundary" << endl;
cout << " * transfer " << 3*(nx+2)*sizeof(float) << " bytes." << endl;
#endif
// transfer top ghost layer
// (3 arrays - h, hu, hv - of nx+2 floats, consecutive in memory)
cudaMemcpy(topLayerDevice, topLayer, 3*(nx+2)*sizeof(float), cudaMemcpyHostToDevice);
checkCUDAError("top ghost layer not transferred to device");
};
}
/**
* Update (for heterogeneous computing) variables h, hu, hv in copy layers
* before an external access to the unknowns
* (only required for PASSIVE and CONNECT boundaries)
* - copy (up-to-date) content from device memory into main memory
*/
void SWE_BlockCUDA::synchCopyLayerBeforeRead() {
#ifdef DBG
cout << "Transfer copy layers from device to main memory" << flush << endl;
#endif
// copy values in copy layers to main memory
// left copy layer:
int offset = ny+2;
cudaMemcpy(h[1], hd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
cudaMemcpy(hu[1], hud+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
cudaMemcpy(hv[1], hvd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("left copy layer not transferred from device");
// right copy layer
offset = nx*(ny+2);
cudaMemcpy(h[nx], hd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
cudaMemcpy(hu[nx], hud+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
cudaMemcpy(hv[nx], hvd+offset, (ny+2)*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("right copy layer not transferred from device");
int size = 3*(nx+2);
// bottom copy layer
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelBottomCopyLayer<<<dimGrid,dimBlock>>>(
hd,hud,hvd,bottomLayerDevice+size,nx,ny);
cudaMemcpy(bottomLayer+size, bottomLayerDevice+size, size*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("bottom copy layer not transferred from device");
};
// top copy layer
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
dim3 dimBlock(TILE_SIZE,1);
dim3 dimGrid(nx/TILE_SIZE,1);
kernelTopCopyLayer<<<dimGrid,dimBlock>>>(
hd,hud,hvd,topLayerDevice+size,nx,ny);
cudaMemcpy(topLayer+size, topLayerDevice+size, size*sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("top copy layer not transferred from device");
};
}
/**
* register the row or column layer next to a boundary as a "copy layer",
* from which values will be copied into the ghost layer of a neighbour;
* @return a SWE_Block1D object that contains row variables h, hu, and hv
*/
SWE_Block1D* SWE_BlockCUDA::registerCopyLayer(BoundaryEdge edge){
// for TOP and BOTTOM layer, the implementation is identical to that in SWE_Block
// for LEFT and RIGHT layer, separate layers are used that avoid strided copies
// when transferring memory between host and device memory
switch (edge) {
case BND_LEFT:
return new SWE_Block1D( h.getColProxy(1), hu.getColProxy(1), hv.getColProxy(1));
case BND_RIGHT:
return new SWE_Block1D( h.getColProxy(nx), hu.getColProxy(nx), hv.getColProxy(nx));
case BND_BOTTOM:
// transfer bottom ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = h[i][0];
bottomGhostLayer->hu[i] = hu[i][0];
bottomGhostLayer->hv[i] = hv[i][0];
bottomCopyLayer->h[i] = h[i][1];
bottomCopyLayer->hu[i] = hu[i][1];
bottomCopyLayer->hv[i] = hv[i][1];
};
return bottomCopyLayer;
case BND_TOP:
// transfer top ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = h[i][ny+1];
topGhostLayer->hu[i] = hu[i][ny+1];
topGhostLayer->hv[i] = hv[i][ny+1];
topCopyLayer->h[i] = h[i][ny];
topCopyLayer->hu[i] = hu[i][ny];
topCopyLayer->hv[i] = hv[i][ny];
};
return topCopyLayer;
};
return NULL;
}
/**
* "grab" the ghost layer at the specific boundary in order to set boundary values
* in this ghost layer externally.
* The boundary condition at the respective ghost layer is set to PASSIVE,
* such that the grabbing program component is responsible for providing
* correct values in the ghost layer, for example by receiving data from
* a remote copy layer via MPI communication.
* @param edge the boundary edge at which the ghost layer is grabbed
* @return a SWE_Block1D object that contains row variables h, hu, and hv
*/
SWE_Block1D* SWE_BlockCUDA::grabGhostLayer(BoundaryEdge edge){
// for TOP and BOTTOM layer, the implementation is identical to that in SWE_Block
// for LEFT and RIGHT layer, separate layers are used that avoid strided copies
// when transferring memory between host and device memory
boundary[edge] = PASSIVE;
switch (edge) {
case BND_LEFT:
return new SWE_Block1D( h.getColProxy(0), hu.getColProxy(0), hv.getColProxy(0));
case BND_RIGHT:
return new SWE_Block1D( h.getColProxy(nx+1), hu.getColProxy(nx+1), hv.getColProxy(nx+1));
case BND_BOTTOM:
return bottomGhostLayer;
case BND_TOP:
return topGhostLayer;
};
return NULL;
}
//==================================================================
// protected member functions for memory model:
// in case of temporary variables (especially in non-local memory, for
// example on accelerators), the main variables h, hu, hv, and b
// are not necessarily updated after each time step.
// The following methods are called to synchronise before or after
// external read or write to the variables.
//==================================================================
/**
* Update all temporary and non-local (for heterogeneous computing) variables
* after an external update of the main variables h, hu, hv, and b.
*/
void SWE_BlockCUDA::synchAfterWrite() {
// update h, hu, hv, b in device memory
synchWaterHeightAfterWrite();
synchDischargeAfterWrite();
synchBathymetryAfterWrite();
// update the auxiliary data structures for copy and ghost layers
// at bottom (and top, see below) boundaries
// -> only required for PASSIVE and CONNECT boundaries
if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
// transfer bottom ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
bottomGhostLayer->h[i] = h[i][0];
bottomGhostLayer->hu[i] = hu[i][0];
bottomGhostLayer->hv[i] = hv[i][0];
bottomCopyLayer->h[i] = h[i][1];
bottomCopyLayer->hu[i] = hu[i][1];
bottomCopyLayer->hv[i] = hv[i][1];
};
};
if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
// transfer top ghost and copy layer to extra SWE_Block1D
for(int i=0; i<=nx+1; i++) {
topGhostLayer->h[i] = h[i][ny+1];
topGhostLayer->hu[i] = hu[i][ny+1];
topGhostLayer->hv[i] = hv[i][ny+1];
topCopyLayer->h[i] = h[i][ny];
topCopyLayer->hu[i] = hu[i][ny];
topCopyLayer->hv[i] = hv[i][ny];
};
};
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the water height h
*/
void SWE_BlockCUDA::synchWaterHeightAfterWrite() {
#ifdef DBG
cout << "Load water height h into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(hd,h.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of h not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the discharge variables hu and hv
*/
void SWE_BlockCUDA::synchDischargeAfterWrite() {
#ifdef DBG
cout << "Load discharge hu and hv into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(hud,hu.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of hu not transferred");
cudaMemcpy(hvd,hv.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of hv not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* after an external update of the bathymetry b
*/
void SWE_BlockCUDA::synchBathymetryAfterWrite() {
#ifdef DBG
cout << "Load bathymetry unknowns into device memory" << flush << endl;
#endif
int size = (nx+2)*(ny+2)*sizeof(float);
cudaMemcpy(bd,b.elemVector(), size, cudaMemcpyHostToDevice);
checkCUDAError("memory of b not transferred");
// computeBathymetrySources();
}
/**
* Update the main variables h, hu, hv, and b before an external read access:
* copies current content of the respective device variables hd, hud, hvd, bd
*/
void SWE_BlockCUDA::synchBeforeRead() {
synchWaterHeightBeforeRead();
synchDischargeBeforeRead();
synchBathymetryBeforeRead();
/* --- the following part is excluded:
--- it should not be necessary to update the auxiliary data structures
--- for top/bottom copy/ghost layers in main memory: these need to be
--- kept consistent by class SWE_BlockCUDA (in particular, they cannot be
--- changed externally).
--- */
// if (boundary[BND_BOTTOM] == PASSIVE || boundary[BND_BOTTOM] == CONNECT) {
// // transfer bottom ghost and copy layer to extra SWE_Block1D
// for(int i=0; i<=nx+1; i++) {
// h[i][0] = bottomGhostLayer->h[i];
// hu[i][0]= bottomGhostLayer->hu[i];
// hv[i][0]= bottomGhostLayer->hv[i];
// h[i][1] = bottomCopyLayer->h[i];
// hu[i][1]= bottomCopyLayer->hu[i];
// hv[i][1]= bottomCopyLayer->hv[i];
// };
// };
//
// if (boundary[BND_TOP] == PASSIVE || boundary[BND_TOP] == CONNECT) {
// // transfer bottom ghost and copy layer to extra SWE_Block1D
// for(int i=0; i<=nx+1; i++) {
// h[i][ny+1] = topGhostLayer->h[i];
// hu[i][ny+1] = topGhostLayer->hu[i];
// hv[i][ny+1] = topGhostLayer->hv[i];
// h[i][ny] = topCopyLayer->h[i];
// hu[i][ny] = topCopyLayer->hu[i];
// hv[i][ny] = topCopyLayer->hv[i];
// };
// };
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the water height h
*/
void SWE_BlockCUDA::synchWaterHeightBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy water height h from device" << flush << endl;
#endif
cudaMemcpy(h.elemVector(),hd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of h not transferred");
#ifdef DBG
cout << "Set water height in ghost-layer corner cells" << flush << endl;
#endif
// only required for visualisation: set values in corner ghost cells
h[0][0] = h[1][1];
h[0][ny+1] = h[1][ny];
h[nx+1][0] = h[nx][1];
h[nx+1][ny+1] = h[nx][ny];
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the discharge variables hu and hv
*/
void SWE_BlockCUDA::synchDischargeBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy discharge hu and hv from device" << flush << endl;
#endif
cudaMemcpy(hu.elemVector(),hud, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of hu not transferred");
cudaMemcpy(hv.elemVector(),hvd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of hv not transferred");
}
/**
* Update temporary and non-local (for heterogeneous computing) variables
* before an external access to the bathymetry b
*/
void SWE_BlockCUDA::synchBathymetryBeforeRead() {
int size = (nx+2)*(ny+2)*sizeof(float);
#ifdef DBG
cout << "Copy water bathymetry b from device" << flush << endl;
#endif
cudaMemcpy(b.elemVector(),bd, size, cudaMemcpyDeviceToHost);
checkCUDAError("memory of b not transferred");
}
//==================================================================
/**
* overload operator<< such that data can be written via cout <<
* -> needs to be declared as friend to be allowed to access private data
*/
ostream& operator<<(ostream& os, const SWE_BlockCUDA& swe) {
os << "Grid dimensions: " << swe.nx << "x" << swe.ny << endl;
cout << "Water height:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.h[i][j] << " ";
};
os << endl;
};
cout << "Momentum in x-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hu[i][j] << " ";
};
os << endl;
};
cout << "Momentum in y-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hv[i][j] << " ";
};
os << endl;
};
#ifdef DBG
cout << "Ghost/Copy Layer bottom:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.bottomGhostLayer->h[i] << " ";
os << swe.bottomGhostLayer->hu[i] << " ";
os << swe.bottomGhostLayer->hv[i] << " ";
os << swe.bottomCopyLayer->h[i] << " ";
os << swe.bottomCopyLayer->hu[i] << " ";
os << swe.bottomCopyLayer->hv[i] << " ";
os << endl;
};
cout << "Ghost/Copy Layer top:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.topGhostLayer->h[i] << " ";
os << swe.topGhostLayer->hu[i] << " ";
os << swe.topGhostLayer->hv[i] << " ";
os << swe.topCopyLayer->h[i] << " ";
os << swe.topCopyLayer->hu[i] << " ";
os << swe.topCopyLayer->hv[i] << " ";
os << endl;
};
#endif
os << flush;
return os;
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_BLOCKCUDA_HH
#define __SWE_BLOCKCUDA_HH
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <cuda_runtime.h>
#include "tools/help.hh"
#include "SWE_Block.hh"
using namespace std;
void checkCUDAError(const char *msg);
void tryCUDA(cudaError_t err, const char *msg);
const int TILE_SIZE=16;
//const int TILE_SIZE=8;
/**
* SWE_BlockCUDA extends the base class SWE_Block towards
* a base class for a CUDA implementation of the shallow water equations.
* It adds the respective variables in GPU memory, and provides
* methods for data transfer between main and GPU memory.
*/
class SWE_BlockCUDA : public SWE_Block {
public:
// Constructor and Destructor
SWE_BlockCUDA(float _offsetX = 0, float _offsetY = 0);
virtual ~SWE_BlockCUDA();
// object methods
// ---> COULD BE IMPLEMENTED TO PROVIDE A DEFAULT IMPLEMENTATION
// // determine maximum possible time step
// virtual float getMaxTimestep();
// deliver a pointer to proxy class that represents
// the layer that is copied to an external ghost layer
virtual SWE_Block1D* registerCopyLayer(BoundaryEdge edge);
// "grab" the ghost layer in order to set these values externally
virtual SWE_Block1D* grabGhostLayer(BoundaryEdge edge);
// access to CUDA variables
/**
* @return pointer to the array #hd (water height) in device memory
*/
const float* getCUDA_waterHeight() { return hd; };
/**
* @return pointer to the array #bd (bathymetry) in device memory
*/
const float* getCUDA_bathymetry() { return bd; };
protected:
// synchronisation Methods
virtual void synchAfterWrite();
virtual void synchWaterHeightAfterWrite();
virtual void synchDischargeAfterWrite();
virtual void synchBathymetryAfterWrite();
virtual void synchGhostLayerAfterWrite();
virtual void synchBeforeRead();
virtual void synchWaterHeightBeforeRead();
virtual void synchDischargeBeforeRead();
virtual void synchBathymetryBeforeRead();
virtual void synchCopyLayerBeforeRead();
// set boundary conditions in ghost layers
virtual void setBoundaryConditions();
// define arrays for main unknowns in CUDA global memory:
// hd, hud, hvd, and bd are CUDA arrays corresp. to h, hu, hv, and b
float* hd;
float* hud;
float* hvd;
float* bd;
private:
// separate memory to hold bottom and top ghost and copy layer
// in main memory allowing non-strided access
float* bottomLayer;
float* topLayer;
SWE_Block1D* bottomGhostLayer;
SWE_Block1D* bottomCopyLayer;
SWE_Block1D* topGhostLayer;
SWE_Block1D* topCopyLayer;
// and resp. memory on the CUDA device:
float* bottomLayerDevice;
float* topLayerDevice;
// helper arrays: store maximum height and velocities to determine time step
float* maxhd;
float* maxvd;
// overload operator<< such that data can be written via cout <<
// -> needs to be declared as friend to be allowed to access private data
friend ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
};
ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
/**
Return index of hd[x][y] in linearised array
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getCellCoord(int x, int y, int ny) {
return x*(ny+2) + y;
}
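// e.g., for a block with ny=4, cell (1,1) is stored at index 1*(4+2)+1 = 7;
// each column x occupies ny+2 consecutive floats, so entire columns can be
// copied between host and device without strided access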
/**
Return index of edge-data Fhd[x][y] or Ghd[x][y] in linearised array
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getEdgeCoord(int x, int y, int ny) {
return x*(ny+1) + y;
}
/**
Return index of a specific element in the arrays of bathymetry source terms
@param x,y x- and y-coordinate of grid cell
@param ny grid size in y-direction (without ghost layers)
*/
inline __device__
int getBathyCoord(int x, int y, int ny) {
return x*ny + y;
}
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_BlockCUDA_kernels.hh"
/**
Sets corner values of hd (only needed for visualization)
@param hd h-values on device
*/
__global__
void kernelHdBufferEdges(float* hd, int nx, int ny)
{
hd[getCellCoord(0 ,0 ,ny)] = hd[getCellCoord(1 ,1 ,ny)];
hd[getCellCoord(0 ,ny+1,ny)] = hd[getCellCoord(1 ,ny,ny)];
hd[getCellCoord(nx+1,0 ,ny)] = hd[getCellCoord(nx,1 ,ny)];
hd[getCellCoord(nx+1,ny+1,ny)] = hd[getCellCoord(nx,ny,ny)];
//Corresponding C-Code:
//h[0][0] = h[1][1];
//h[0][ny+1] = h[1][ny];
//h[nx+1][0] = h[nx][1];
//h[nx+1][ny+1] = h[nx][ny];
}
//******************************************************************
// kernels to implement boundary conditions
//******************************************************************
/**
* CUDA kernel to set left boundary layer for conditions WALL & OUTFLOW
* blockIdx.y and threadIdx.y loop over the boundary elements
* SWE_Block size ny is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelLeftBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int ghost = getCellCoord(0,j,ny);
int inner = getCellCoord(1,j,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
hvd[ghost] = hvd[inner];
}
/**
* CUDA kernel to set right boundary layer for conditions WALL & OUTFLOW
* blockIdx.y and threadIdx.y loop over the boundary elements
* SWE_Block size ny is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelRightBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int ghost = getCellCoord(nx+1,j,ny);
int inner = getCellCoord(nx ,j,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
hvd[ghost] = hvd[inner];
}
/**
* CUDA kernel to set bottom boundary layer for conditions WALL & OUTFLOW
* blockIdx.x and threadIdx.x loop over the boundary elements
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,0,ny);
int inner = getCellCoord(i,1,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = hud[inner];
hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner];
}
/**
* CUDA kernel to set top boundary layer for conditions WALL & OUTFLOW
* blockIdx.x and threadIdx.x loop over the boundary elements
*/
__global__
void kernelTopBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,ny+1,ny);
int inner = getCellCoord(i,ny ,ny);
// consider only WALL & OUTFLOW boundary conditions
hd[ghost] = hd[inner];
hud[ghost] = hud[inner];
hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner];
}
/**
* CUDA kernel to set bottom boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements.
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
float* bottomGhostLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,0,ny);
hd[ghost] = bottomGhostLayer[i];
hud[ghost] = bottomGhostLayer[(nx+2)+i];
hvd[ghost] = bottomGhostLayer[2*(nx+2)+i];
}
/**
* CUDA kernel to set top boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
float* topGhostLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int ghost = getCellCoord(i,ny+1,ny);
hd[ghost] = topGhostLayer[i];
hud[ghost] = topGhostLayer[(nx+2)+i];
hvd[ghost] = topGhostLayer[2*(nx+2)+i];
}
/**
* CUDA kernel to update bottom copy layer according
* (for boundary conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements.
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
float* bottomCopyLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int copy = getCellCoord(i,1,ny);
bottomCopyLayer[i] = hd[copy];
bottomCopyLayer[(nx+2)+i] = hud[copy];
bottomCopyLayer[2*(nx+2)+i] = hvd[copy];
}
/**
* CUDA kernel to set top boundary layer according to the external
* ghost layer status (conditions PASSIVE and CONNECT)
* blockIdx.x and threadIdx.x loop over the boundary elements
* Note that diagonal elements are currently not copied!
* SWE_Block size nx is assumed to be a multiple of TILE_SIZE
*/
__global__
void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
float* topCopyLayer, int nx, int ny)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int copy = getCellCoord(i,ny,ny);
topCopyLayer[i] = hd[copy];
topCopyLayer[(nx+2)+i] = hud[copy];
topCopyLayer[2*(nx+2)+i] = hvd[copy];
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_BLOCKCUDAKERNELS_HH
#define __SWE_BLOCKCUDAKERNELS_HH
// declaration of CUDA kernels
__global__
void kernelHdBufferEdges(float* hd, int nx, int ny);
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
__global__
void kernelLeftBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelRightBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelBottomBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelTopBoundary(float* hd, float* hud, float* hvd,
int nx, int ny, BoundaryType bound);
__global__
void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
float* bottomGhostLayer, int nx, int ny);
__global__
void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
float* topGhostLayer, int nx, int ny);
__global__
void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
float* bottomCopyLayer, int nx, int ny);
__global__
void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
float* topCopyLayer, int nx, int ny);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include <math.h>
#include "tools/help.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_RusanovBlockCUDA.hh"
//const int TILE_SIZE=16;
//const int TILE_SIZE=8;
// #include "SWE_BlockCUDA_kernels.cu"
#include "SWE_RusanovBlockCUDA_kernels.hh"
/**
* Constructor: allocate variables for simulation
*
* unknowns h,hu,hv,b are defined on grid indices [0,..,nx+1]*[0,..,ny+1]
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* flux terms are defined for edges with indices [0,..,nx]*[1,..,ny]
* or [1,..,nx]*[0,..,ny] (for horizontal/vertical edges)
* Flux term with index (i,j) is located on the edge between
* cells with index (i,j) and (i+1,j) or (i,j+1)
*
* bathymetry source terms are defined for cells with indices [1,..,nx]*[1,..,ny]
*/
SWE_RusanovBlockCUDA::SWE_RusanovBlockCUDA(float _offsetX, float _offsetY)
: SWE_BlockCUDA(_offsetX,_offsetY)
#ifdef DBG
, Fh(nx+1,ny+1), Fhu(nx+1,ny+1), Fhv(nx+1,ny+1),
Gh(nx+1,ny+1), Ghu(nx+1,ny+1), Ghv(nx+1,ny+1)
#endif
{
int size = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the flux terms in x-direction
cudaMalloc((void**)&Fhd, size);
checkCUDAError("allocate device memory for Fh");
cudaMalloc((void**)&Fhud, size);
checkCUDAError("allocate device memory for Fhu");
cudaMalloc((void**)&Fhvd, size);
checkCUDAError("allocate device memory for Fhv");
// allocate CUDA memory for the flux terms in y-direction
cudaMalloc((void**)&Ghd, size);
checkCUDAError("allocate device memory for Gh");
cudaMalloc((void**)&Ghud, size);
checkCUDAError("allocate device memory for Ghu");
cudaMalloc((void**)&Ghvd, size);
checkCUDAError("allocate device memory for Ghv");
// allocate CUDA unknowns: bathymetry source terms
size = nx*ny*sizeof(float);
cudaMalloc((void**)&Bxd, size);
checkCUDAError("allocate device memory for Bxd");
cudaMalloc((void**)&Byd, size);
checkCUDAError("allocate device memory for Byd");
// allocate CUDA unknowns: maximum height and velocity
size = (nx/TILE_SIZE)*(ny/TILE_SIZE)*sizeof(float);
cudaMalloc((void**)&maxhd, size);
checkCUDAError("allocate device memory for maxhd");
cudaMalloc((void**)&maxvd, size);
checkCUDAError("allocate device memory for maxvd");
}
/**
* Destructor: de-allocate all variables
*/
SWE_RusanovBlockCUDA::~SWE_RusanovBlockCUDA() {
cudaFree(Fhd); cudaFree(Fhud); cudaFree(Fhvd);
cudaFree(Ghd); cudaFree(Ghud); cudaFree(Ghvd);
cudaFree(Bxd); cudaFree(Byd);
cudaFree(maxhd); cudaFree(maxvd);
}
//==================================================================
// methods for simulation
//==================================================================
/**
* compute the largest allowed time step for the current grid block;
* will be stored in the member variable SWE_Block::maxTimestep.
* This specific CUDA implementation requires velocity data precomputed
* by the kernel kernelEulerTimestep, and therefore needs to be called
* directly after calling this kernel.
*/
void SWE_RusanovBlockCUDA::computeMaxTimestepCUDA() {
float hmax = 0.0;
float vmax = 0.0;
float meshSize = (dx<dy) ? dx : dy;
int numTiles = (nx/TILE_SIZE)*(ny/TILE_SIZE);
#ifdef DBG
cout << "Number of tiles: " << numTiles << " -> ";
#endif
// use 512 threads for the maximum computation ...
int threads = 512;
// ... or the largest power of two not exceeding numTiles:
while (threads > numTiles) threads >>=1;
#ifdef DBG
cout << "use " << threads << " threads per tile" << endl << flush;
#endif
// loop over arrays of maxhd and maxvd to determine maximum
// GPU maximum kernel called for 512 (or threads) elements at a time
// CPU maintains maximum of all groups-of-512
for(int tile=numTiles; tile>0; tile -= threads) {
float newmax = 0;
dim3 dimBlock(threads,1,1);
dim3 dimGrid(1,1);
// Compute maximum of arrays maxhd and maxvd
// (for elements in range [tile-threads .. tile-1])
int start = tile-threads;
// use element range [0 .. threads-1], if tile-threads < 0
if (start<0) start=0;
// cout << "Call kernel to compute time step " << start << flush << endl;
kernelMaximum<<<dimGrid,dimBlock>>>(maxhd,maxvd,start,threads);
checkCUDAError("compute maximum height and velocity");
// the reduction leaves its result at index start (cf. kernelMaximum)
cudaMemcpy(&newmax,maxhd+start, sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("memory of hmax not transferred");
if (newmax>hmax) hmax = newmax;
cudaMemcpy(&newmax,maxvd+start, sizeof(float), cudaMemcpyDeviceToHost);
checkCUDAError("memory of vmax not transferred");
if (newmax>vmax) vmax = newmax;
#ifdef DBG
cout << "Current maxmimum of tile " << start << ":"
<< hmax << ',' << vmax << endl << flush;
#endif
};
#ifdef DBG
cout << "Computed Time step (CUDA): "
<< " from " << hmax << " and " << vmax << " -> "
<< meshSize/(sqrt(g*hmax) + vmax) << endl << flush;
#endif
// sqrt(g*hmax) + vmax is an upper bound for the propagation speed of the
// characteristic shallow-water waves; a wave must not propagate farther
// than one mesh cell in a single time step
// (uses 0.5 as a pessimistic safety factor to reduce the time step size)
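// illustrative example (assuming g=9.81): for hmax=10m, vmax=1m/s and
// meshSize=1m this yields maxTimestep = 0.5*1/(sqrt(98.1)+1) ~ 0.046s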
maxTimestep = 0.5*meshSize/(sqrt(g*hmax) + vmax);
}
/**
* Depending on the current values of h, hu, hv (incl. ghost layers)
* update these unknowns in each grid cell
* (ghost layers and bathymetry are not updated).
* The Rusanov CUDA-implementation of simulateTimestep
* subsequently calls the functions computeNumericalFluxes (to compute
* all fluxes on grid edges), and updateUnknowns (to update the variables
* according to flux values, typically according to an Euler time step).
* @param dt size of the time step
*/
void SWE_RusanovBlockCUDA::simulateTimestep(float dt) {
// compute the numerical fluxes on all edges
computeNumericalFluxes();
// update the unknowns according to an Euler timestep
updateUnknowns(dt);
}
/**
* perform forward-Euler time steps, starting at simulation time tStart,
* until simulation time tEnd is reached;
* device-global variables hd, hud, hvd are updated;
* unknowns h, hu, hv in main memory are not updated.
* Ghost layers and bathymetry sources are updated between timesteps.
* intended as main simulation loop between two checkpoints
*/
__host__
float SWE_RusanovBlockCUDA::simulate(float tStart, float tEnd) {
float t = tStart;
do {
// set values in ghost cells:
setGhostLayer();
// execute Euler time step:
simulateTimestep(maxTimestep);
t += maxTimestep;
#ifdef DBG
cout << "Simulation at time " << t << endl << flush;
#endif
} while(t < tEnd);
return t;
}
/**
* implements interface function updateUnknowns:
* based on the (Rusanov) fluxes computed on each edge
* (and stored in the variables Fh, Gh, etc.);
* compute the balance terms for each cell, and update the
* unknowns according to an Euler time step.
* It will force an update of the copy layer in the main
* memory by calling synchCopyLayerBeforeRead(),
* and compute the maximum allowed time step size
* by calling computeMaxTimestepCUDA().
* @param dt size of the time step.
*/
__host__
void SWE_RusanovBlockCUDA::updateUnknowns(float dt) {
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel for Euler timestep " << flush << endl;
#endif
kernelEulerTimestep<<<dimGrid,dimBlock>>>(hd,hud,hvd,
Fhd,Fhud,Fhvd,Ghd,Ghud,Ghvd,
Bxd,Byd,
maxhd,maxvd,
nx,ny,dt,1.0f/dx,1.0f/dy);
computeMaxTimestepCUDA();
synchCopyLayerBeforeRead();
}
/**
* compute the flux terms on all edges
*/
void SWE_RusanovBlockCUDA::computeNumericalFluxes() {
// time step required to compute the Lax-Friedrichs constant dx/dt
float dt = 2*maxTimestep;
// compute bathymetry source terms (depend on h)
computeBathymetrySources();
// compute F-fluxes, i.e. all fluxes in x-direction (on left/right edges)
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel to compute F-fluxes " << flush << endl;
#endif
kernelComputeFluxesF<<<dimGrid,dimBlock>>>(hd,hud,hvd, Fhd,Fhud,Fhvd,
ny,g,dx/dt,1);
dim3 dimLeftBlock(1,TILE_SIZE);
dim3 dimLeftGrid(1,ny/TILE_SIZE);
kernelComputeFluxesF<<<dimLeftGrid,dimLeftBlock>>>(hd,hud,hvd, Fhd,Fhud,Fhvd,
ny,g,dx/dt,0);
// compute G-fluxes, i.e. all fluxes in y-direction (on top/bottom edges)
#ifdef DBG
cout << "Call kernel to compute G-fluxes " << flush << endl;
#endif
kernelComputeFluxesG<<<dimGrid,dimBlock>>>(hd,hud,hvd, Ghd,Ghud,Ghvd,
ny,g,dy/dt,1);
dim3 dimTopBlock(TILE_SIZE,1);
dim3 dimTopGrid(nx/TILE_SIZE,1);
kernelComputeFluxesG<<<dimTopGrid,dimTopBlock>>>(hd,hud,hvd, Ghd,Ghud,Ghvd,
ny,g,dy/dt,0);
// cout << (*this) << flush;
}
/**
* compute the bathymetry source terms in all cells
*/
void SWE_RusanovBlockCUDA::computeBathymetrySources() {
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
#ifdef DBG
cout << "Call kernel to compute bathymetry sources" << flush << endl;
#endif
kernelComputeBathymetrySources<<<dimGrid,dimBlock>>>(hd,bd,Bxd,Byd,ny,g);
// cout << (*this) << flush;
}
//==================================================================
/**
* overload operator<< such that data can be written via cout <<
* -> needs to be declared as friend to be allowed to access private data
*/
ostream& operator<<(ostream& os, const SWE_RusanovBlockCUDA& swe) {
os << "Grid dimensions: " << swe.nx << "x" << swe.ny << endl;
cout << "Water height:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.h[i][j] << " ";
};
os << endl;
};
cout << "Momentum in x-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hu[i][j] << " ";
};
os << endl;
};
cout << "Momentum in y-direction:" << endl;
for(int j=swe.ny+1; j>=0; j--) {
for(int i=0; i<=swe.nx+1; i++) {
os << swe.hv[i][j] << " ";
};
os << endl;
};
#ifdef DBG
cout << "Ghost/Copy Layer bottom:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.bottomGhostLayer->h[i] << " ";
os << swe.bottomGhostLayer->hu[i] << " ";
os << swe.bottomGhostLayer->hv[i] << " ";
os << swe.bottomCopyLayer->h[i] << " ";
os << swe.bottomCopyLayer->hu[i] << " ";
os << swe.bottomCopyLayer->hv[i] << " ";
os << endl;
};
cout << "Ghost/Copy Layer top:" << endl;
for(int i=0; i<=swe.nx+1; i++) {
os << swe.topGhostLayer->h[i] << " ";
os << swe.topGhostLayer->hu[i] << " ";
os << swe.topGhostLayer->hv[i] << " ";
os << swe.topCopyLayer->h[i] << " ";
os << swe.topCopyLayer->hu[i] << " ";
os << swe.topCopyLayer->hv[i] << " ";
os << endl;
};
#endif
// #ifdef DBG
// cout << "Fluss - Wellenhoehe:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fh[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in x-Richtung:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fhu[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in y-Richtung:" << endl;
// for(int i=0; i<=swe.nx; i++) {
// for(int j=1; j<=swe.ny; j++) {
// os << swe.Fhv[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Wellenhoehe:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Gh[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in x-Richtung:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Ghu[i][j] << " ";
// };
// os << endl;
// };
//
// cout << "Fluss - Durchfluss in y-Richtung:" << endl;
// for(int i=1; i<=swe.nx; i++) {
// for(int j=0; j<=swe.ny; j++) {
// os << swe.Ghv[i][j] << " ";
// };
// os << endl;
// };
// #endif
os << flush;
return os;
}
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_RUSANOVBLOCKCUDA_HH
#define __SWE_RUSANOVBLOCKCUDA_HH
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <cuda_runtime.h>
#include "tools/help.hh"
#include "SWE_Block.hh"
#include "SWE_BlockCUDA.hh"
using namespace std;
/**
* SWE_RusanovBlockCUDA extends the base class SWE_BlockCUDA,
* and provides a concrete CUDA implementation of a simple
* shallow water model based on Rusanov Flux computation on the
* edges and explicit time stepping.
*/
class SWE_RusanovBlockCUDA : public SWE_BlockCUDA {
public:
// Constructor and Destructor
SWE_RusanovBlockCUDA(float _offsetX = 0, float _offsetY = 0);
virtual ~SWE_RusanovBlockCUDA();
// object methods
// compute flux terms on edges
virtual void computeNumericalFluxes();
// execute Euler time step
virtual void updateUnknowns(float dt);
// execute a single time step of the simulation
virtual void simulateTimestep(float dt);
// simulate for specified time range
virtual float simulate(float tStart, float tEnd);
private:
// compute bathymetry source terms
void computeBathymetrySources();
// determine maximum possible time step
void computeMaxTimestepCUDA();
// arrays to hold the values of the flux terms at cell edges
float* Fhd;
float* Fhud;
float* Fhvd;
float* Ghd;
float* Ghud;
float* Ghvd;
// arrays to hold the bathymetry source terms for the hu and hv equations
float* Bxd;
float* Byd;
// helper arrays: store maximum height and velocities to determine time step
float* maxhd;
float* maxvd;
// overload operator<< such that data can be written via cout <<
// -> needs to be declared as friend to be allowed to access private data
friend ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
#ifdef DBG
// --- only required for debugging purposes ---
// arrays for fluxes for h,hu,hv in main memory
Float2D Fh;
Float2D Fhu;
Float2D Fhv;
Float2D Gh;
Float2D Ghu;
Float2D Ghv;
// dump fluxes for h,hu,hv from CUDA device memory into main memory
void cudaDumpFlux();
#endif
};
ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_RusanovBlockCUDA_kernels.hh"
//******************************************************************
// kernels to implement Euler time-stepping
//******************************************************************
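/**
 * local Lax-Friedrichs (Rusanov-type) flux:
 * computeFlux returns 0.5*(fLow + fHigh) - 0.5*llf*(xiHigh - xiLow),
 * i.e. the average of the physical fluxes of the two adjacent cells plus
 * a diffusive term scaled by the wave speed estimate llf
 */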
inline __device__
float computeFlux(float fLow, float fHigh, float xiLow, float xiHigh, float llf) {
// local Lax-Friedrich
return 0.5f*(fLow+fHigh) - 0.5f*llf*(xiHigh-xiLow);
}
/**
* computes the flux vector components Fhd, Fhud and Fhvd for a single
* edge by calling the function computeFlux
*/
__global__
void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
int ny, float g, float llf, int istart)
{
int i = istart + TILE_SIZE*blockIdx.x + threadIdx.x;
int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
int iL = getCellCoord(i,j,ny); // index of left cell
int iR = getCellCoord(i+1,j,ny); // index of right cell
int iEdge = getEdgeCoord(i,j,ny); // index of current edge
float upwind = max( fabs(hud[iL]/hd[iL]), fabs(hud[iR]/hd[iR]) );
Fhd[iEdge] = computeFlux( hud[iL], hud[iR], hd[iL], hd[iR], upwind );
Fhud[iEdge] = computeFlux( hud[iL]*hud[iL]/hd[iL] + 0.5f*g*hd[iL]*hd[iL],
hud[iR]*hud[iR]/hd[iR] + 0.5f*g*hd[iR]*hd[iR],
hud[iL],
hud[iR],
llf );
Fhvd[iEdge] = computeFlux( hud[iL]*hvd[iL]/hd[iL],hud[iR]*hvd[iR]/hd[iR],
hvd[iL], hvd[iR],
llf );
}
/**
* computes the flux vector components Ghd, Ghud and Ghvd for a single
* edge by calling the function computeFlux
*/
__global__
void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
float* Ghd, float* Ghud, float* Ghvd,
int ny, float g, float llf, int jstart)
{
int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
int j = jstart + TILE_SIZE*blockIdx.y + threadIdx.y;
int iB = getCellCoord(i,j ,ny);
int iT = getCellCoord(i,j+1,ny);
int iEdge = getEdgeCoord(i,j,ny);
float upwind = max( fabs(hvd[iB]/hd[iB]), fabs(hvd[iT]/hd[iT]) );
Ghd[iEdge] = computeFlux( hvd[iB], hvd[iT], hd[iB], hd[iT], upwind );
Ghud[iEdge] = computeFlux( hud[iB]*hvd[iB]/hd[iB],hud[iT]*hvd[iT]/hd[iT],
hud[iB], hud[iT],
llf );
Ghvd[iEdge] = computeFlux( hvd[iB]*hvd[iB]/hd[iB] + 0.5f*g*hd[iB]*hd[iB],
hvd[iT]*hvd[iT]/hd[iT] + 0.5f*g*hd[iT]*hd[iT],
hvd[iB], hvd[iT],
llf );
}
/**
* computes the bathymetry source terms for the hu and hv equation for
* a given cell in the resp. array elements Bxd and Byd
*/
__global__
void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd,
int ny, float g)
{
// Note: different index ranges for h and b vs. Bxd, Byd:
// [0..nx+1]x[0..ny+1] vs. [1..nx]x[1..ny]
// Note: indices for Bxd, Byd shifted to start with 0
int i = TILE_SIZE*blockIdx.x + threadIdx.x;
int j = TILE_SIZE*blockIdx.y + threadIdx.y;
// compute indices of involved array elements
int ij = getBathyCoord(i,j,ny);
int left = getCellCoord(i ,j+1,ny); // index of left cell (arrays hd,bd)
int right = getCellCoord(i+2,j+1,ny); // index of right cell (arrays hd,bd)
Bxd[ij] = g * 0.5f*(hd[right] + hd[left]) * 0.5f*(bd[right] - bd[left]);
int bot = getCellCoord(i+1,j,ny); // index of bottom cell (arrays hd,bd)
int top = getCellCoord(i+1,j+2,ny); // index of top cell (arrays hd,bd)
Byd[ij] = g * 0.5f*(hd[top] + hd[bot]) * 0.5f*(bd[top] - bd[bot]);
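// Corresponding C-code (with ci=i+1, cj=j+1 as cell indices):
// Bx[ci][cj] = g * 0.5*(h[ci+1][cj]+h[ci-1][cj]) * 0.5*(b[ci+1][cj]-b[ci-1][cj]);
// By[ci][cj] = g * 0.5*(h[ci][cj+1]+h[ci][cj-1]) * 0.5*(b[ci][cj+1]-b[ci][cj-1]);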
}
/**
 * CUDA kernel for the Euler time step: updates each cell via
 * h -= dt*( (Fh_right - Fh_left)*dxi + (Gh_top - Gh_bottom)*dyi )
 * (analogously for hu and hv, including the bathymetry source terms),
 * and precomputes the per-block maxima of water height and velocity
 * required to determine the next admissible time step
 */
__global__
void kernelEulerTimestep(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
float* Ghd, float* Ghud, float* Ghvd,
float* Bxd, float* Byd,
float* maxhd, float* maxvd,
int nx, int ny, float dt, float dxi, float dyi)
{
__shared__ float Fds[TILE_SIZE+1][TILE_SIZE+1];
__shared__ float Gds[TILE_SIZE+1][TILE_SIZE+1];
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = 1 + TILE_SIZE*blockIdx.x + tx;
int j = 1 + TILE_SIZE*blockIdx.y + ty;
int iElem = getCellCoord(i,j,ny); // index of current cell
int iEdge = getEdgeCoord(i,j,ny); // index of right/top Edge
int iLeft = getEdgeCoord(i-1,j,ny); // index of left Edge
int iBot = getEdgeCoord(i,j-1,ny); // index of bottom Edge
float h;
float hu;
float hv;
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable h
Fds[tx+1][ty] = Fhd[iEdge];
Gds[tx][ty+1] = Ghd[iEdge];
if (tx==0) Fds[tx][ty] = Fhd[iLeft];
if (ty==0) Gds[tx][ty] = Ghd[iBot];
__syncthreads();
// compute new value of h from fluxes
h = hd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi );
__syncthreads();
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable hu
Fds[tx+1][ty] = Fhud[iEdge];
Gds[tx][ty+1] = Ghud[iEdge];
if (tx==0) Fds[tx][ty] = Fhud[iLeft];
if (ty==0) Gds[tx][ty] = Ghud[iBot];
__syncthreads();
// compute new value of hu from fluxes
hu = hud[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi
+ Bxd[getBathyCoord(i-1,j-1,ny)]*dxi );
__syncthreads();
// copy flux unknowns from global into local memory
// -> for fluxes corresponding to variable hv
Fds[tx+1][ty] = Fhvd[iEdge];
Gds[tx][ty+1] = Ghvd[iEdge];
if (tx==0) Fds[tx][ty] = Fhvd[iLeft];
if (ty==0) Gds[tx][ty] = Ghvd[iBot];
__syncthreads();
// compute new value of hv from fluxes
hv = hvd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi
+(Gds[tx][ty+1]-Gds[tx][ty])*dyi
+ Byd[getBathyCoord(i-1,j-1,ny)]*dyi );
__syncthreads();
/* precompute maximal height and velocity per thread block
* (for computation of allowed time step size)
*/
// compute absolute values of h and absolute velocity
hd[iElem] = h; Fds[tx][ty] = h;
hud[iElem] = hu; hu = (h>0.0) ? fabs(hu/h) : 0.0;
hvd[iElem] = hv; hv = (h>0.0) ? fabs(hv/h) : 0.0;
Gds[tx][ty] = (hu>hv) ? hu : hv;
// parallel reduction on thread block:
// determine maximum wave height and velocity
// step 1: reduction in ty-direction
for (i=TILE_SIZE>>1; i>0; i>>=1) {
__syncthreads();
if (ty < i) {
if( Fds[tx][ty] < Fds[tx][ty+i]) Fds[tx][ty] = Fds[tx][ty+i];
if( Gds[tx][ty] < Gds[tx][ty+i]) Gds[tx][ty] = Gds[tx][ty+i];
};
};
  // step 2: reduction in tx-direction
for (i=TILE_SIZE>>1; i>0; i>>=1) {
__syncthreads();
if ((tx < i) && (ty==0)) {
if( Fds[tx][ty] < Fds[tx+i][ty]) Fds[tx][ty] = Fds[tx+i][ty];
if( Gds[tx][ty] < Gds[tx+i][ty]) Gds[tx][ty] = Gds[tx+i][ty];
};
};
  // save maxima in arrays maxhd and maxvd (one entry per thread block)
  if ((tx == 0) && (ty==0)) {
     j = blockIdx.x*gridDim.y + blockIdx.y; // stride gridDim.y (= ny/TILE_SIZE); the former stride nx/TILE_SIZE was only correct for nx==ny
maxhd[j] = Fds[0][0];
maxvd[j] = Gds[0][0];
};
}
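/*
 * Hedged usage sketch (not part of this commit): how the Euler time-step kernel
 * above would typically be launched from the host. The variables mirror the
 * members of SWE_BlockCUDA (hd, hud, ..., maxhd, maxvd) and are assumptions
 * here; dxi and dyi are the inverse mesh widths, so the kernel can multiply
 * instead of dividing.
 */
#if 0 // illustration only
void launchEulerTimestep(float dt) {
   dim3 dimBlock(TILE_SIZE, TILE_SIZE);      // one thread per cell of a tile
   dim3 dimGrid(nx/TILE_SIZE, ny/TILE_SIZE); // nx, ny must be multiples of TILE_SIZE
   kernelEulerTimestep<<<dimGrid,dimBlock>>>(hd, hud, hvd,
                                             Fhd, Fhud, Fhvd,
                                             Ghd, Ghud, Ghvd,
                                             Bxd, Byd,
                                             maxhd, maxvd,
                                             nx, ny, dt, 1.0f/dx, 1.0f/dy);
   checkCUDAError("kernelEulerTimestep");
}
#endif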
//******************************************************************
// kernels to implement boundary conditions
//******************************************************************
/**
* CUDA kernel for maximum reduction
 * required to compute the maximum water height and velocity to determine the
 * allowed time step
*/
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size) {
   // note: assumes a single thread block and a power-of-two size
   int tx = start+threadIdx.x;
for (int i=size>>1; i>0; i>>=1) {
__syncthreads();
if (tx < i) {
if( maxhd[tx] < maxhd[tx+i] ) maxhd[tx] = maxhd[tx+i];
if( maxvd[tx] < maxvd[tx+i] ) maxvd[tx] = maxvd[tx+i];
};
};
}
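/*
 * Hedged usage sketch (not part of this commit): finishing the block-wise
 * reduction. kernelEulerTimestep leaves one maximum per thread block in
 * maxhd/maxvd; a single-block launch of kernelMaximum folds these into the
 * first element. The number of blocks is an assumption matching the
 * (commented-out) allocation in the constructor; it must be a power of two
 * and at most the maximum thread block size.
 */
#if 0 // illustration only
int numBlocks = (nx/TILE_SIZE)*(ny/TILE_SIZE);
kernelMaximum<<<1,numBlocks>>>(maxhd, maxvd, 0, numBlocks);
checkCUDAError("kernelMaximum");
// copy the global maxima back to the host
float maxh, maxv;
cudaMemcpy(&maxh, maxhd, sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(&maxv, maxvd, sizeof(float), cudaMemcpyDeviceToHost);
#endif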
/**
* @file
* This file is part of SWE.
*
* @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* TODO
*/
#ifndef __SWE_RUSANOVBLOCKCUDAKERNELS_HH
#define __SWE_RUSANOVBLOCKCUDAKERNELS_HH
//******************************************************************
// kernels to implement Euler time-stepping
//******************************************************************
__global__
void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
int ny, float g, float llf, int istart);
__global__
void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
float* Ghd, float* Ghud, float* Ghvd,
int ny, float g, float llf, int jstart);
__global__
void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd,
int ny, float g);
__global__
void kernelEulerTimestep(float* hd, float* hud, float* hvd,
float* Fhd, float* Fhud, float* Fhvd,
float* Ghd, float* Ghud, float* Ghvd,
float* Bxd, float* Byd,
float* maxhd, float* maxvd,
int nx, int ny, float dt, float dxi, float dyi);
__global__
void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
#endif
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#include "SWE_WavePropagationBlockCuda.hh"
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cassert>
#ifndef STATICLOGGER
#define STATICLOGGER
#include "tools/Logger.hpp"
static tools::Logger s_sweLogger;
#endif
// CUDA-C includes
#include <cuda.h>
#include <cuda_runtime_api.h>
// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
#include <thrust/device_vector.h>
/**
* Constructor of a SWE_WavePropagationBlockCuda.
*
* Allocates the variables for the simulation:
 * Please note: The definition of the indices differs from the CPU implementation.
*
* unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
* -> computational domain is [1,..,nx]*[1,..,ny]
* -> plus ghost cell layer
*
* net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
*
* A left/right net update with index (i-1,j) is located on the edge between
* cells with index (i-1,j) and (i,j):
*
* *********************
* * * *
* * (i-1,j) * (i,j) *
* * * *
* *********************
*
* *
* ***
* *****
* *
* *
* NetUpdatesLeft(i-1,j)
* or
* NetUpdatesRight(i-1,j)
*
*
* A below/above net update with index (i, j-1) is located on the edge between
* cells with index (i, j-1) and (i,j):
*
* ***********
* * *
* * (i, j) * *
* * * ** NetUpdatesBelow(i,j-1)
* *********** ***** or
* * * ** NetUpdatesAbove(i,j-1)
* * (i,j-1) * *
* * *
* ***********
*
* @param i_offsetX spatial offset of the block in x-direction.
 * @param i_offsetY spatial offset of the block in y-direction.
*/
SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
const float i_offsetY ): SWE_BlockCUDA(i_offsetX,i_offsetY) {
// compute the size of one 1D net-update array.
int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
// allocate CUDA memory for the net-updates
cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesLeftD");
cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesRightD");
cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesLeftD");
cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for huNetUpdatesRightD");
cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesBelowD");
cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
checkCUDAError("allocate device memory for hNetUpdatesAboveD");
}
/**
* Destructor of a SWE_WavePropagationBlockCuda.
*
* Frees all of the memory, which was allocated within the constructor.
 * Resets the CUDA device: useful if an error occurred and printf is used on the device (flushes the buffer).
*/
SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
// free the net-updates memory
cudaFree(hNetUpdatesLeftD);
cudaFree(hNetUpdatesRightD);
cudaFree(huNetUpdatesLeftD);
cudaFree(huNetUpdatesRightD);
cudaFree(hNetUpdatesBelowD);
cudaFree(hNetUpdatesAboveD);
cudaFree(hvNetUpdatesBelowD);
cudaFree(hvNetUpdatesAboveD);
// reset the cuda device
s_sweLogger.printString("Resetting the CUDA devices");
cudaDeviceReset();
}
/**
* Compute a single global time step of a given time step width.
 * Remark: The user is responsible for the time step width; no additional check is done. The time step width is typically
 * available after the computation of the numerical fluxes (hidden in this method).
*
* First the net-updates are computed.
* Then the cells are updated with the net-updates and the given time step width.
*
* @param i_dT time step width in seconds.
*/
__host__
void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
// Compute the numerical fluxes/net-updates in the wave propagation formulation.
computeNumericalFluxes();
// Update the unknowns with the net-updates.
updateUnknowns(i_dT);
}
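/*
 * Hedged driver sketch (not part of this commit): how a host time loop can use
 * the block. Instead of prescribing i_dT, the loop lets the block derive the
 * admissible step from the flux computation; getMaxTimestep() returning the
 * member maxTimestep set in computeNumericalFluxes(), setGhostLayer(), and the
 * names l_block/l_endTime are assumptions about the surrounding interface.
 */
#if 0 // illustration only
float l_time = 0.0f;
while (l_time < l_endTime) {
   l_block.setGhostLayer();               // apply boundary conditions (assumed helper)
   l_block.computeNumericalFluxes();      // also reduces the maximum wave speed
   float l_dT = l_block.getMaxTimestep(); // CFL-limited step width
   l_block.updateUnknowns(l_dT);
   l_time += l_dT;
}
#endif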
/**
* Compute the numerical fluxes (net-update formulation here) on all edges.
*
* The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
* To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
* In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
*
* TODO: A splitting or direct computation of the time step width might increase the total time step size.
* Example:
* dx = 11, dy = 6;
* max wave speed in x-direction: 10
* max wave speed in y-direction: 5.5
* max wave speed in both direction: 10
*
* => maximum time step (current implementation): min(11/10, 6/10) = 0.6
* => maximum time step (splitting the dimensions): min(11/10, 6/5.5) = 1.09..
*/
void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
/*
* Initialization.
*/
  //! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
/**
* Definition of the "main" CUDA-grid.
* This grid covers only edges 0..#(edges in x-direction)-2 and 0..#(edges in y-direction)-2.
*
* An example with a computational domain of size
* nx = 24, ny = 16
* with a 1 cell ghost layer would result in a grid with
* (nx+2)*(ny+2) = (26*18)
* cells and
* (nx+1)*(ny+1) = (25*17)
* edges.
*
* The CUDA-blocks (here 8*8) mentioned above would cover all edges except
* the ones lying between the computational domain and the right/top ghost layer:
*
* *
* ** top ghost layer,
* ******** cell ids
* ******************************* ** = (*, ny+1)
* * * * * *
* * 8*8 * 8*8 * 8*8 *
* * block * block * block *
* * * * *
* *******************************
* * * * *
* * 8*8 * 8*8 * 8*8 *
* * * block * block * block *
* bottom ** * * * *
* ghost ******** *******************************
* layer, **
* cell ids * * *
* =(*,0) *** ***
* * *
* * *
* left ghost layer, right ghost layer,
* cell ids = (0,*) cell ids = (nx+1, *)
*/
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// "2D array" which holds the blockwise maximum wave speeds
float* l_maximumWaveSpeedsD;
  // number of elements of the maximum wave speed array (grid blocks + 'boundary' blocks, without the top-right block); sizeof(float) not included
int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
/*
   * Compute the net-updates for the 'main' part and the two 'boundary' parts.
*/
// compute the net-updates for the "main" part.
computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx,ny
);
// compute the "remaining" net updates (edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer")
// edges between cell nx and ghost layer nx+1
dim3 dimRightBlock(1, TILE_SIZE);
dim3 dimRightGrid(1, ny/TILE_SIZE);
computeNetUpdatesKernel<<<dimRightGrid, dimRightBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
nx, 0,
dimGrid.x, 0);
// edges between cell ny and ghost layer ny+1
dim3 dimTopBlock(TILE_SIZE, 1);
dim3 dimTopGrid(nx/TILE_SIZE, 1);
computeNetUpdatesKernel<<<dimTopGrid, dimTopBlock>>>( hd, hud, hvd, bd,
hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
l_maximumWaveSpeedsD,
nx, ny,
0, ny,
0, dimGrid.y);
/*
* Finalize (max reduction of the maximumWaveSpeeds-array.)
*
* The Thrust library is used in this step.
* An optional kernel could be written for the maximum reduction.
*/
// Thrust pointer to the device array
thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
// use Thrusts max_element-function for the maximum reduction
thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
// get the result from the device
float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
// free the max wave speeds array on the device
cudaFree(l_maximumWaveSpeedsD);
// set the maximum time step for this SWE_WavePropagationBlockCuda
maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
  // apply a CFL number of 0.4 (slightly conservative)
maxTimestep *= (float)0.4;
}
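/*
 * Hedged sketch (not part of this commit) of the dimensional splitting proposed
 * in the TODO above: keeping one wave speed maximum per direction admits a
 * larger stable time step. l_maxWaveSpeedX/l_maxWaveSpeedY are hypothetical
 * results of two separate (per-direction) reductions.
 */
#if 0 // illustration only
float l_maxWaveSpeedX; // maximum over the horizontal-edge wave speeds
float l_maxWaveSpeedY; // maximum over the vertical-edge wave speeds
maxTimestep = std::min( dx/l_maxWaveSpeedX, dy/l_maxWaveSpeedY );
maxTimestep *= (float)0.4; // same CFL factor as above
// with dx=11, dy=6 and speeds 10 / 5.5 this yields min(1.1, 1.09..) instead of 0.6
#endif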
/**
* Update the cells with a given global time step.
*
* @param i_deltaT time step size.
*/
void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
  //! definition of one CUDA-block. Typical sizes are 8*8 or 16*16
dim3 dimBlock(TILE_SIZE,TILE_SIZE);
//! definition of the CUDA-grid.
dim3 dimGrid(nx/TILE_SIZE,ny/TILE_SIZE);
// assert a valid tile size
assert(nx%TILE_SIZE==0);
assert(ny%TILE_SIZE==0);
// compute the update width.
float l_updateWidthX = i_deltaT / dx;
float l_updateWidthY = i_deltaT / dy;
// update the unknowns (global time step)
updateUnknownsKernel<<<dimGrid,dimBlock>>>( hNetUpdatesLeftD, hNetUpdatesRightD,
huNetUpdatesLeftD, huNetUpdatesRightD,
hNetUpdatesBelowD, hNetUpdatesAboveD,
hvNetUpdatesBelowD, hvNetUpdatesAboveD,
hd, hud, hvd,
l_updateWidthX, l_updateWidthY,
nx, ny);
}
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
*/
#ifndef SWEWAVEPROPAGATIONBLOCKCUDA_HH_
#define SWEWAVEPROPAGATIONBLOCKCUDA_HH_
#include <cassert>
#include "SWE_BlockCUDA.hh"
/**
 * SWE_WavePropagationBlockCuda is an implementation of the abstract class SWE_BlockCUDA.
 * It uses a wave propagation solver, which is selected with the preprocessor flag WAVE_PROPAGATION_SOLVER.
*
* Possible wave propagation solvers are:
* F-Wave, <strike>Approximate Augmented Riemann, Hybrid (f-wave + augmented).</strike>
* (details can be found in the corresponding source files)
*/
class SWE_WavePropagationBlockCuda: public SWE_BlockCUDA {
//private:
//! "2D array" which holds the net-updates for the water height (wave propagating to the left).
float* hNetUpdatesLeftD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the right).
float* hNetUpdatesRightD;
//! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the left).
float* huNetUpdatesLeftD;
//! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the right).
float* huNetUpdatesRightD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the top).
float* hNetUpdatesBelowD;
//! "2D array" which holds the net-updates for the water height (wave propagating to the bottom).
float* hNetUpdatesAboveD;
//! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the top).
float* hvNetUpdatesBelowD;
//! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the bottom).
float* hvNetUpdatesAboveD;
public:
// constructor of SWE_WavePropagationBlockCuda
SWE_WavePropagationBlockCuda( const float i_offsetX = 0,
const float i_offsetY = 0 );
// destructor of SWE_WavePropagationBlockCuda
~SWE_WavePropagationBlockCuda();
// compute a single time step (net-updates + update of the cells).
void simulateTimestep( float i_dT );
// TODO: Not implemented
float simulate(float, float) {
assert(false);
return 0;
};
// TODO: not implemented, max time step reduction is done in each call of computeNumericalFluxes(...)
void computeMaxTimestep() {
assert(false);
};
// compute the numerical fluxes (net-update formulation here).
void computeNumericalFluxes();
// compute the new cell values.
void updateUnknowns(const float i_deltaT);
};
#endif /* SWEWAVEPROPAGATIONBLOCKCUDA_HH_ */
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#include "SWE_BlockCUDA.hh"
#include "SWE_WavePropagationBlockCuda_kernels.hh"
#include <cmath>
#include <cstdio>
#include "solvers/FWaveCuda.h"
/**
* The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
*
 * Remark: In total there are nx+1 edges in x-direction and ny+1 edges in y-direction.
* Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
* will not be computed in a typical call of the function:
* computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
* hNetUpdatesLeftD, hNetUpdatesRightD,
* huNetUpdatesLeftD, huNetUpdatesRightD,
* hNetUpdatesBelowD, hNetUpdatesAboveD,
* hvNetUpdatesBelowD, hvNetUpdatesAboveD,
* l_maximumWaveSpeedsD,
* i_nx, i_ny
* );
 * To reduce the effect of branch divergence, the kernel provides optional offsets, which can be used to compute the missing edges.
*
*
* @param i_h water heights (CUDA-array).
* @param i_hu momentums in x-direction (CUDA-array).
* @param i_hv momentums in y-direction (CUDA-array).
* @param i_b bathymetry values (CUDA-array).
* @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
 * @param i_offsetX cell/edge offset in x-direction.
 * @param i_offsetY cell/edge offset in y-direction.
 * @param i_blockOffSetX block offset in x-direction within the maximum wave speed array (see remarks inside the kernel).
 * @param i_blockOffSetY block offset in y-direction within the maximum wave speed array (see remarks inside the kernel).
*/
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nX, const int i_nY,
const int i_offsetX, const int i_offsetY,
const int i_blockOffSetX, const int i_blockOffSetY
) {
/*
* Initialization
*/
  //! shared array holding the maximum wave speeds within this CUDA-block
__shared__ float l_maxWaveSpeedShared[TILE_SIZE*TILE_SIZE];
//! thread local index in the shared maximum wave speed array
int l_maxWaveSpeedPosition = computeOneDPositionKernel(threadIdx.x, threadIdx.y, blockDim.y);
// initialize shared maximum wave speed with zero
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = (float) 0.0;
//! index (l_cellIndexI,l_cellIndexJ) of the cell lying on the right side of the edge/above the edge where the thread works on.
int l_cellIndexI, l_cellIndexJ;
// initialize the indices l_cellIndexI and l_cellIndexJ with the given offset
l_cellIndexI = i_offsetX;
l_cellIndexJ = i_offsetY;
//! array which holds the thread local net-updates.
float l_netUpdates[5];
//! location of the thread local cells in the global CUDA-arrays.
int l_leftCellPosition, l_rightCellPosition, l_netUpdatePosition;
// compute (l_cellIndexI,l_cellIndexJ) for the cell lying right/above of the edge
  l_cellIndexI += blockDim.x * blockIdx.x + threadIdx.x + 1; //+1: the first thread works on the cell with index i=1
  l_cellIndexJ += blockDim.y * blockIdx.y + threadIdx.y + 1; //+1: the first thread works on the cell with index j=1
/*
* Computation of horizontal net-updates
*/
if(i_offsetY == 0) { //horizontal edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hu[l_leftCellPosition], i_hu[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
// store the horizontal net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[1];
o_huNetUpdatesLeftD[l_netUpdatePosition] = l_netUpdates[2]; o_huNetUpdatesRightD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = l_netUpdates[4];
}
// synchronize the threads before the vertical edges (optimization)
__syncthreads();
/*
* Computation of vertical net-updates
*/
if(i_offsetX == 0) { //vertical edges?
// compute the locations of the thread local cells in the global CUDA-arrays.
l_leftCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+2);
l_rightCellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
// compute the net-updates
fWaveComputeNetUpdates( 9.81,
i_h[l_leftCellPosition], i_h[l_rightCellPosition],
i_hv[l_leftCellPosition], i_hv[l_rightCellPosition],
i_b[l_leftCellPosition], i_b[l_rightCellPosition],
l_netUpdates
);
// compute the location of the net-updates in the global CUDA-arrays.
l_netUpdatePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
// store the vertical net-updates (thread local net-updates -> device net-updates)
o_hNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[0]; o_hNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[1];
o_hvNetUpdatesBelowD[l_netUpdatePosition] = l_netUpdates[2]; o_hvNetUpdatesAboveD[l_netUpdatePosition] = l_netUpdates[3];
// store the maximum wave speed in the shared array
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax(l_maxWaveSpeedShared[l_maxWaveSpeedPosition], l_netUpdates[4]);
}
/*
* Compute the maximum observed wave speed
*/
// wait for all threads to finish
__syncthreads();
// initialize reduction block size with the original block size
int reductionBlockDimX = blockDim.x;
int reductionBlockDimY = blockDim.y;
// do the reduction
while(reductionBlockDimX != 1 || reductionBlockDimY != 1) { // if the reduction block size == 1*1 (1 cell) -> done.
//! reduction partner for a thread
int reductionPartner = 0;
// split the block in the x-direction (size in x-dir. > 1) or y-direction (size in x-dir. == 1, size in y-dir. > 1)
if(reductionBlockDimX != 1) {
reductionBlockDimX /= 2; //reduce column wise
reductionPartner = computeOneDPositionKernel(threadIdx.x + reductionBlockDimX, threadIdx.y, blockDim.y);
}
else if(reductionBlockDimY != 1) {
reductionBlockDimY /= 2; //reduce row wise
reductionPartner = computeOneDPositionKernel(threadIdx.x, threadIdx.y+reductionBlockDimY, blockDim.y);
}
#ifndef NDEBUG
else {
printf("computeNetUpdatesKernel(...): There was an error in the reducing procedure!\n");
}
#endif
if(threadIdx.x < reductionBlockDimX && threadIdx.y < reductionBlockDimY) { // use only half the threads in each reduction
//execute the reduction routine (maximum)
l_maxWaveSpeedShared[l_maxWaveSpeedPosition] = fmax( l_maxWaveSpeedShared[l_maxWaveSpeedPosition],
l_maxWaveSpeedShared[reductionPartner]
);
}
// wait for all threads to finish
__syncthreads();
}
if(threadIdx.x == 0 && threadIdx.y == 0) {
/**
* Position of the maximum wave speed in the global device array.
*
     * In the 'main' part (e.g. gridDim = (nx/TILE_SIZE, ny/TILE_SIZE)) the position
     * is simply given by the block indices in x- and y-direction with a stride of gridDim.y + 1. The +1 leaves room
     * for the wave speeds of the 'boundary' parts, see below.
*
* In the 'boundary' case, where the edges lie between the computational domain and the right/top ghost layer,
* this is more complicated. In this case block offsets in x- and y-direction are used. The offsets define how
* many blocks in the resp. direction have to be added to get a valid result.
* Computational domain - right ghost layer: In this case the dimension of the grid in x-direction is 1.
* Computational domain - top ghost layer: In this case the dimension of the grid in y-direction is 1.
*
* Same Example as in SWE_WavePropagationBlockCuda::computeNumericalFluxes(), assume the CUDA-grid/-blocks has
* the following layout:
*
* *
* ** top ghost layer,
* * block 8 * block 9 * block 10* ******** cell ids
* ******************************** **
* * * * * *
* * block * block * block * b
* * 4 * 5 * 6 * 7
* * * * *
* ********************************
* * * * *
* * block * block * block * b
* * * 0 * 1 * 2 * 3
* bottom ** * * * *
* ghost ******** ********************************
* layer **
* * * *
* *** ***
* * *
* * *
* left ghost layer right ghost layer
*
     * This results in a 'main' part consisting of (3*2) blocks and two 'boundary' parts consisting
     * of (1*2) blocks and (3*1) blocks.
*
     * The maximum wave speed array on the device therefore logically represents a (4 * 3)-1 2D-array (-1: no block on the top right).
     * With the stride gridDim.y + 1 = 3, the 'main' part writes into positions 0, 1, 3, 4, 6 and 7.
     * The 'computational domain - right ghost layer' part writes into 9 and 10 with block offset in x-direction = 3.
     * The 'computational domain - top ghost layer' part writes into 2, 5 and 8 with block offset in y-direction = 2.
*
*/
int l_maxWaveSpeedDevicePosition = computeOneDPositionKernel( i_blockOffSetX + blockIdx.x,
i_blockOffSetY + blockIdx.y,
max(i_blockOffSetY+1, gridDim.y+1) );
// write the block local maximum wave speed to the device array
o_maximumWaveSpeeds[ l_maxWaveSpeedDevicePosition ] = l_maxWaveSpeedShared[0];
}
}
/**
* The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
*
* @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
* @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
* @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
* @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
* @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
* @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
* @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
* @param io_h water heights (CUDA-array).
* @param io_hu momentums in x-direction (CUDA-array).
* @param io_hv momentums in y-direction (CUDA-array).
* @param i_updateWidthX update width in x-direction.
* @param i_updateWidthY update width in y-direction.
* @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
* @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
*/
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nX, const int i_nY ) {
/*
* Initialization
*/
//! cell indices (l_cellIndexI,l_cellIndexJ) of the cell which the thread updates.
int l_cellIndexI, l_cellIndexJ;
//! location of the thread local cell in the global CUDA-arrays.
int l_cellPosition;
// compute the thread local cell indices (start at cell (1,1))
l_cellIndexI = blockDim.x * blockIdx.x + threadIdx.x + 1;
l_cellIndexJ = blockDim.y * blockIdx.y + threadIdx.y + 1;
// compute the global cell position
l_cellPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+2);
#ifndef NDEBUG
  if(l_cellPosition >= (i_nX+2)*(i_nY+2))
printf("Warning: cellPosition(%i) > (i_nx+2)*(i_ny+2)\n", l_cellPosition);
#endif
//! positions of the net-updates in the global CUDA-arrays.
int l_netUpdatesLeftPosition, l_netUpdatesRightPosition, l_netUpdatesBelowPosition, l_netUpdatesAbovePosition;
/**
* Compute the positions of the net updates relative to a given cell
*
*
* netUpdateRight(i-1, j)
*
* | |
* | |
* | |
* netUpdateBelow(i,j) -----*************-----
* * *
* * cell(i,j) *
* * *
* -----*************------ netUpdateAbove(i, j-1)
* | |
* | |
* | |
* netUpdatesLeft(i,j)
*
*/
l_netUpdatesRightPosition = computeOneDPositionKernel(l_cellIndexI-1, l_cellIndexJ, i_nY+1);
l_netUpdatesLeftPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
l_netUpdatesAbovePosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ-1, i_nY+1);
l_netUpdatesBelowPosition = computeOneDPositionKernel(l_cellIndexI, l_cellIndexJ, i_nY+1);
//update the cell values
io_h[l_cellPosition] -= i_updateWidthX * ( i_hNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_hNetUpdatesLeftD[ l_netUpdatesLeftPosition ] )
+ i_updateWidthY * ( i_hNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
io_hu[l_cellPosition] -= i_updateWidthX * ( i_huNetUpdatesRightD[ l_netUpdatesRightPosition ] + i_huNetUpdatesLeftD[ l_netUpdatesLeftPosition ] );
io_hv[l_cellPosition] -= i_updateWidthY * ( i_hvNetUpdatesAboveD[ l_netUpdatesAbovePosition ] + i_hvNetUpdatesBelowD[ l_netUpdatesBelowPosition ] );
}
/**
* Compute the position of 2D coordinates in a 1D array.
* array[i][j] -> i * ny + j
*
* @param i_i row index.
* @param i_j column index.
 * @param i_ny width of the array in y-direction (e.g. ny+1 or ny+2, depending on the array).
* @return 1D index.
*/
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
return i_i*i_ny + i_j;
}
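// Worked example (hedged, for illustration): reusing the nx=24, ny=16 example
// above, the cell arrays have ny+2 = 18 entries per i-row, so cell (i,j) = (3,5)
// maps to 3*18 + 5 = 59; the net-update arrays use the stride ny+1 = 17 instead,
// hence the different third argument in the calls above.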
/**
* @file
* This file is part of SWE.
*
* @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
*
* @section LICENSE
*
* SWE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* SWE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with SWE. If not, see <http://www.gnu.org/licenses/>.
*
*
* @section DESCRIPTION
*
* CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
*/
#ifndef SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
#define SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
// CUDA-kernel which computes the net-updates
__global__
void computeNetUpdatesKernel(
const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
float* o_hNetUpdatesLeftD, float* o_hNetUpdatesRightD,
float* o_huNetUpdatesLeftD, float* o_huNetUpdatesRightD,
float* o_hNetUpdatesBelowD, float* o_hNetUpdatesAboveD,
float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
float* o_maximumWaveSpeeds,
const int i_nx, const int i_ny,
const int i_offsetX = 0, const int i_offsetY = 0,
const int i_blockOffSetX = 0, const int i_blockOffSetY = 0
);
// CUDA-kernel which updates the unknowns
__global__
void updateUnknownsKernel(
const float* i_hNetUpdatesLeftD, const float* i_hNetUpdatesRightD,
const float* i_huNetUpdatesLeftD, const float* i_huNetUpdatesRightD,
const float* i_hNetUpdatesBelowD, const float* i_hNetUpdatesAboveD,
const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
float* io_h, float* io_hu, float* io_hv,
const float i_updateWidthX, const float i_updateWidthY,
const int i_nx, const int i_ny
);
// CUDA device function which computes the 1D position in an array from a given 2D index
__device__
inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny);
#endif /* SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_ */
@@ -31,7 +31,11 @@
#include <string>
#include "../SWE_Block.hh"
#ifndef CUDA
#include "../SWE_WavePropagationBlock.hh"
#else
#include "../SWE_WavePropagationBlockCuda.hh"
#endif
#include "../scenarios/SWE_simple_scenarios.h"
#include "../tools/Logger.hpp"
@@ -91,7 +95,11 @@ int main( int argc, char** argv ) {
l_originY = l_scenario.getBoundaryPos(BND_BOTTOM);
// create a single wave propagation block
#ifndef CUDA
SWE_WavePropagationBlock l_wavePropgationBlock(l_originX, l_originY);
#else
SWE_WavePropagationBlockCuda l_wavePropgationBlock(l_originX, l_originY);
#endif
// initialize the wave propgation block
l_wavePropgationBlock.initScenario(l_scenario);
@@ -115,7 +123,7 @@ int main( int argc, char** argv ) {
/**
* Simulation.
*/
-  // print a start message and reset the wall clock time
+  // print the start message and reset the wall clock time
s_sweLogger.printStartMessage();
s_sweLogger.initWallClockTime(time(NULL));
@@ -165,7 +173,7 @@ int main( int argc, char** argv ) {
s_sweLogger.printStatisticsMessage();
// print the cpu time
s_sweLogger.printCpuTime("CPU time");
s_sweLogger.printCpuTime("CPU/GPU time");
// print the wall clock time (includes plotting)
s_sweLogger.printWallClockTime(time(NULL));
......
@@ -28,7 +28,7 @@
#ifndef __SWE_SIMPLE_SCENARIOS_H
#define __SWE_SIMPLE_SCENARIOS_H
-#include <math.h>
+#include <cmath>
#include "SWE_Scenario.h"
@@ -56,12 +56,39 @@ class SWE_BathymetryDamBreakScenario : public SWE_Scenario {
public:
float getBathymetry(float x, float y) {
// return ( sqrt( (x-0.3f)*(x-0.3f) + (y-0.8f)*(y-0.8f) ) < 0.1f ) ? 0.1f: 0.0f;
-      return ( sqrt( (x-0.5f)*(x-0.5f) + (y-0.5f)*(y-0.5f) ) < 0.1f ) ? 0.1f: 0.0f;
+      return ( std::sqrt( (x-500.f)*(x-500.f) + (y-500.f)*(y-500.f) ) < 50.f ) ? -250.f: -260.f;
};
-    virtual float endSimulation() { return 0.2f; };
+    virtual float endSimulation() { return (float) 15; };
virtual BoundaryType getBoundaryType(BoundaryEdge edge) { return OUTFLOW; };
/** Get the boundary positions
*
* @param i_edge which edge
* @return value in the corresponding dimension
*/
float getBoundaryPos(BoundaryEdge i_edge) {
if ( i_edge == BND_LEFT )
return (float)0;
else if ( i_edge == BND_RIGHT)
return (float)1000;
else if ( i_edge == BND_BOTTOM )
return (float)0;
else
return (float)1000;
};
/**
* Get the water height at a specific location.
*
* @param i_positionX position relative to the origin of the bathymetry grid in x-direction
* @param i_positionY position relative to the origin of the bathymetry grid in y-direction
* @return water height (before the initial displacement)
*/
float getWaterHeight( float i_positionX,
float i_positionY ) {
return (float) 270;
}
};
/**
......