Added the CUDA-files. Changed a few routines accordingly.

92ed477e · breuera · f935a3b2 · 92ed477e · 92ed477e · 92ed477e
Commit 92ed477e authored Jun 24, 2012 by breuera
14 changed files
--- a/src/SWE_BlockCUDA.cu
+++ b/src/SWE_BlockCUDA.cu
--- a/src/SWE_BlockCUDA.hh
+++ b/src/SWE_BlockCUDA.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#ifndef __SWE_BLOCKCUDA_HH
+#define __SWE_BLOCKCUDA_HH
+
+#include <iostream>
+#include <stdio.h>
+#include <fstream>
+#include <cuda_runtime.h>
+#include "tools/help.hh"
+#include "SWE_Block.hh"
+
+using namespace std;
+
+// Error-handling helpers; they are only declared here, the definitions are
+// not part of this header.
+// NOTE(review): presumably checkCUDAError() inspects the last CUDA error and
+// tryCUDA() checks the given return code, both reporting msg on failure --
+// confirm against the implementation.
+void checkCUDAError(const char *msg);
+void tryCUDA(cudaError_t err, const char *msg);
+
+// Edge length of a tile: the CUDA kernels compute their cell indices as
+// TILE_SIZE*blockIdx + threadIdx (plus an offset).
+const int TILE_SIZE=16;
+//const int TILE_SIZE=8;
+
+/**
+ * SWE_BlockCUDA extends the base class SWE_Block towards  
+ * a base class for a CUDA implementation of the shallow water equations.
+ * It adds the respective variables in GPU memory, and provides 
+ * methods for data transfer between main and GPU memory.
+ */
+class SWE_BlockCUDA : public SWE_Block {
+
+  public:
+    // constructor and destructor (both offsets default to 0)
+    SWE_BlockCUDA(float _offsetX = 0, float _offsetY = 0);
+    virtual ~SWE_BlockCUDA();
+    
+  // object methods
+
+// ---> COULD BE IMPLEMENTED TO PROVIDE A DEFAULT IMPLEMENTATION
+//     // determine maximum possible time step
+//     virtual float getMaxTimestep();
+
+    // deliver a pointer to proxy class that represents 
+    // the layer that is copied to an external ghost layer 
+    virtual SWE_Block1D* registerCopyLayer(BoundaryEdge edge);
+    // "grab" the ghost layer in order to set these values externally
+    virtual SWE_Block1D* grabGhostLayer(BoundaryEdge edge);
+
+    // access to CUDA variables
+    // (the getters are const-qualified so that they can also be used on
+    //  const objects; they only hand out read-only pointers)
+    /**
+     *  @return	pointer to the array #hd (water height) in device memory 
+     */
+    const float* getCUDA_waterHeight() const { return hd; }
+    /**
+     *  @return	pointer to the array #bd (bathymetry) in device memory 
+     */
+    const float* getCUDA_bathymetry() const { return bd; }
+
+  protected:
+     
+    // synchronisation methods, to be called after the unknowns were written
+    // NOTE(review): presumably these copy data from main memory to the
+    // CUDA device -- confirm against the implementation
+    virtual void synchAfterWrite();
+    virtual void synchWaterHeightAfterWrite();
+    virtual void synchDischargeAfterWrite();
+    virtual void synchBathymetryAfterWrite();
+    virtual void synchGhostLayerAfterWrite();
+
+    // synchronisation methods, to be called before the unknowns are read
+    // NOTE(review): presumably these copy data from the CUDA device back
+    // to main memory -- confirm against the implementation
+    virtual void synchBeforeRead();
+    virtual void synchWaterHeightBeforeRead();
+    virtual void synchDischargeBeforeRead();
+    virtual void synchBathymetryBeforeRead();
+    virtual void synchCopyLayerBeforeRead();
+    
+    // set boundary conditions in ghost layers
+    virtual void setBoundaryConditions();
+
+    // define arrays for main unknowns in CUDA global memory: 
+    // hd, hud, hvd, and bd are CUDA arrays corresp. to h, hu, hv, and b
+    float* hd;
+    float* hud;
+    float* hvd;
+    float* bd;
+	
+  private:
+     
+    // separate memory to hold bottom and top ghost and copy layer 
+    // in main memory allowing non-strided access
+    float* bottomLayer;
+    float* topLayer;
+    SWE_Block1D* bottomGhostLayer;
+    SWE_Block1D* bottomCopyLayer;
+    SWE_Block1D* topGhostLayer;
+    SWE_Block1D* topCopyLayer;
+    // and resp. memory on the CUDA device:
+    float* bottomLayerDevice;
+    float* topLayerDevice;
+
+    // helper arrays: store maximum height and velocities to determine time step
+    float* maxhd;
+    float* maxvd;
+
+    // overload operator<< such that data can be written via cout <<
+    // -> needs to be declared as friend to be allowed to access private data
+    friend ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
+
+};
+
+// namespace-scope declaration of the stream operator befriended by SWE_BlockCUDA
+ostream& operator<< (ostream& os, const SWE_BlockCUDA& swe);
+
+/**
+    Return index of cell (x,y), e.g. hd[x][y], in the linearised array.
+    Consecutive y-values are adjacent; each x-column holds ny+2 entries.
+	@param x,y		x- and y-coordinate of grid cell
+	@param ny		grid size in y-direction (without ghost layers)
+*/	
+inline __device__
+int getCellCoord(int x, int y, int ny) {
+   const int columnLength = ny + 2;
+   return x*columnLength + y;
+}
+
+
+/**
+    Return index of edge-data Fhd[x][y] or Ghd[x][y] in the linearised array.
+    Consecutive y-values are adjacent; each x-column holds ny+1 entries.
+	@param x,y		x- and y-coordinate of grid cell
+	@param ny		grid size in y-direction (without ghost layers)
+*/	
+inline __device__
+int getEdgeCoord(int x, int y, int ny) {
+   const int columnLength = ny + 1;
+   return x*columnLength + y;
+}
+
+/**
+    Return index of a specific element in the arrays of bathymetry source terms.
+    Consecutive y-values are adjacent; each x-column holds ny entries.
+	@param x,y		x- and y-coordinate of grid cell
+	@param ny		grid size in y-direction (without ghost layers)
+*/	
+inline __device__
+int getBathyCoord(int x, int y, int ny) {
+   const int columnLength = ny;
+   return x*columnLength + y;
+}
+
+
+#endif
--- a/src/SWE_BlockCUDA_kernels.cu
+++ b/src/SWE_BlockCUDA_kernels.cu
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#include "SWE_BlockCUDA.hh"
+#include "SWE_BlockCUDA_kernels.hh"
+
+/**
+    Sets the four corner values of hd (only needed for visualization):
+    each corner cell receives the value of the diagonally adjacent inner cell.
+    Every thread that executes this kernel performs the same four assignments.
+	@param hd		h-values on device
+	@param nx		grid size in x-direction (without ghost layers)
+	@param ny		grid size in y-direction (without ghost layers)
+*/	
+__global__
+void kernelHdBufferEdges(float* hd, int nx, int ny)
+{ 
+  // indices of the four corner cells
+  const int lowerLeft  = getCellCoord(0   ,0   ,ny);
+  const int upperLeft  = getCellCoord(0   ,ny+1,ny);
+  const int lowerRight = getCellCoord(nx+1,0   ,ny);
+  const int upperRight = getCellCoord(nx+1,ny+1,ny);
+
+  // copy from the diagonally adjacent inner cells
+  hd[lowerLeft]  = hd[getCellCoord(1 ,1 ,ny)];
+  hd[upperLeft]  = hd[getCellCoord(1 ,ny,ny)];
+  hd[lowerRight] = hd[getCellCoord(nx,1 ,ny)];
+  hd[upperRight] = hd[getCellCoord(nx,ny,ny)];
+	
+  //Corresponding C-Code:
+  //h[0][0] = h[1][1];
+  //h[0][ny+1] = h[1][ny];
+  //h[nx+1][0] = h[nx][1];
+  //h[nx+1][ny+1] = h[nx][ny];
+}
+
+
+//******************************************************************
+// kernels to implement boundary conditions
+//******************************************************************
+
+/**
+ * CUDA kernel to set left boundary layer for conditions WALL & OUTFLOW
+ * blockIdx.y and threadIdx.y loop over the boundary elements.
+ * Threads whose index j exceeds ny do nothing; hence, ny does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all ny elements.
+ *
+ * @param hd,hud,hvd	unknowns h, hu, hv on device
+ * @param nx,ny		grid size in x- and y-direction (without ghost layers)
+ * @param bound		boundary type; WALL negates hu in the ghost cell, any
+ * 			other value is treated like OUTFLOW (plain copy)
+ */
+__global__
+void kernelLeftBoundary(float* hd, float* hud, float* hvd,
+                        int nx, int ny, BoundaryType bound)
+{
+  int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (j > ny) return;
+
+  int ghost = getCellCoord(0,j,ny);
+  int inner = getCellCoord(1,j,ny);
+  
+  // consider only WALL & OUTFLOW boundary conditions
+  hd[ghost] = hd[inner];
+  hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
+  hvd[ghost] = hvd[inner];
+}
+
+/**
+ * CUDA kernel to set right boundary layer for conditions WALL & OUTFLOW
+ * blockIdx.y and threadIdx.y loop over the boundary elements.
+ * Threads whose index j exceeds ny do nothing; hence, ny does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all ny elements.
+ *
+ * @param hd,hud,hvd	unknowns h, hu, hv on device
+ * @param nx,ny		grid size in x- and y-direction (without ghost layers)
+ * @param bound		boundary type; WALL negates hu in the ghost cell, any
+ * 			other value is treated like OUTFLOW (plain copy)
+ */
+__global__
+void kernelRightBoundary(float* hd, float* hud, float* hvd,
+                         int nx, int ny, BoundaryType bound)
+{
+  int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (j > ny) return;
+
+  int ghost = getCellCoord(nx+1,j,ny);
+  int inner = getCellCoord(nx  ,j,ny);
+  
+  // consider only WALL & OUTFLOW boundary conditions
+  hd[ghost] = hd[inner];
+  hud[ghost] = (bound==WALL) ? -hud[inner] : hud[inner];
+  hvd[ghost] = hvd[inner];
+}
+
+
+/**
+ * CUDA kernel to set bottom boundary layer for conditions WALL & OUTFLOW
+ * blockIdx.x and threadIdx.x loop over the boundary elements.
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd	unknowns h, hu, hv on device
+ * @param nx,ny		grid size in x- and y-direction (without ghost layers)
+ * @param bound		boundary type; WALL negates hv in the ghost cell, any
+ * 			other value is treated like OUTFLOW (plain copy)
+ */
+__global__
+void kernelBottomBoundary(float* hd, float* hud, float* hvd,
+                          int nx, int ny, BoundaryType bound)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int ghost = getCellCoord(i,0,ny);
+  int inner = getCellCoord(i,1,ny);
+  
+  // consider only WALL & OUTFLOW boundary conditions
+  hd[ghost] = hd[inner];
+  hud[ghost] = hud[inner];
+  hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner]; 
+}
+
+/**
+ * CUDA kernel to set top boundary layer for conditions WALL & OUTFLOW
+ * blockIdx.x and threadIdx.x loop over the boundary elements.
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd	unknowns h, hu, hv on device
+ * @param nx,ny		grid size in x- and y-direction (without ghost layers)
+ * @param bound		boundary type; WALL negates hv in the ghost cell, any
+ * 			other value is treated like OUTFLOW (plain copy)
+ */
+__global__
+void kernelTopBoundary(float* hd, float* hud, float* hvd,
+                       int nx, int ny, BoundaryType bound)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int ghost = getCellCoord(i,ny+1,ny);
+  int inner = getCellCoord(i,ny  ,ny);
+  
+  // consider only WALL & OUTFLOW boundary conditions
+  hd[ghost] = hd[inner];
+  hud[ghost] = hud[inner];
+  hvd[ghost] = (bound==WALL) ? -hvd[inner] : hvd[inner]; 
+}
+
+/**
+ * CUDA kernel to set bottom boundary layer according to the external 
+ * ghost layer status (conditions PASSIVE and CONNECT)
+ * blockIdx.x and threadIdx.x loop over the boundary elements.
+ * Note that diagonal elements are currently not copied!
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd		unknowns h, hu, hv on device
+ * @param bottomGhostLayer	source array; read as three consecutive
+ * 				segments of nx+2 values each (h, hu, hv)
+ * @param nx,ny			grid size in x- and y-direction (without ghost layers)
+ */
+__global__
+void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
+                               float* bottomGhostLayer, int nx, int ny)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int ghost = getCellCoord(i,0,ny);
+
+  hd[ghost]  = bottomGhostLayer[i];
+  hud[ghost] = bottomGhostLayer[(nx+2)+i];
+  hvd[ghost] = bottomGhostLayer[2*(nx+2)+i];
+}
+
+/**
+ * CUDA kernel to set top boundary layer according to the external 
+ * ghost layer status (conditions PASSIVE and CONNECT)
+ * blockIdx.x and threadIdx.x loop over the boundary elements
+ * Note that diagonal elements are currently not copied!
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd		unknowns h, hu, hv on device
+ * @param topGhostLayer		source array; read as three consecutive
+ * 				segments of nx+2 values each (h, hu, hv)
+ * @param nx,ny			grid size in x- and y-direction (without ghost layers)
+ */
+__global__
+void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
+                            float* topGhostLayer, int nx, int ny)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int ghost = getCellCoord(i,ny+1,ny);
+  
+  hd[ghost]  = topGhostLayer[i];
+  hud[ghost] = topGhostLayer[(nx+2)+i];
+  hvd[ghost] = topGhostLayer[2*(nx+2)+i];
+}
+
+/**
+ * CUDA kernel to update the bottom copy layer from the bottom-most inner 
+ * row of cells (for boundary conditions PASSIVE and CONNECT)
+ * blockIdx.x and threadIdx.x loop over the boundary elements.
+ * Note that diagonal elements are currently not copied!
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd		unknowns h, hu, hv on device
+ * @param bottomCopyLayer	target array; written as three consecutive
+ * 				segments of nx+2 values each (h, hu, hv)
+ * @param nx,ny			grid size in x- and y-direction (without ghost layers)
+ */
+__global__
+void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
+                           float* bottomCopyLayer, int nx, int ny)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int copy = getCellCoord(i,1,ny);
+
+  bottomCopyLayer[i]          = hd[copy];  
+  bottomCopyLayer[(nx+2)+i]   = hud[copy]; 
+  bottomCopyLayer[2*(nx+2)+i] = hvd[copy]; 
+}
+
+/**
+ * CUDA kernel to update the top copy layer from the top-most inner 
+ * row of cells (for boundary conditions PASSIVE and CONNECT)
+ * blockIdx.x and threadIdx.x loop over the boundary elements
+ * Note that diagonal elements are currently not copied!
+ * Threads whose index i exceeds nx do nothing; hence, nx does not have to be
+ * a multiple of TILE_SIZE as long as the launched grid covers all nx elements.
+ *
+ * @param hd,hud,hvd		unknowns h, hu, hv on device
+ * @param topCopyLayer		target array; written as three consecutive
+ * 				segments of nx+2 values each (h, hu, hv)
+ * @param nx,ny			grid size in x- and y-direction (without ghost layers)
+ */
+__global__
+void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
+                        float* topCopyLayer, int nx, int ny)
+{
+  int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+  // guard against threads beyond the last inner cell (partially filled tile)
+  if (i > nx) return;
+
+  int copy = getCellCoord(i,ny,ny);
+  
+  topCopyLayer[i]          = hd[copy];  
+  topCopyLayer[(nx+2)+i]   = hud[copy]; 
+  topCopyLayer[2*(nx+2)+i] = hvd[copy]; 
+}
+
+
+
+// //******************************************************************
+// // kernel for maximum reduction (disabled in this file)
+// //******************************************************************
+// 
+// 
+// /**
+//  * CUDA kernel for maximum reduction
+//  * required to compute the maximum water height and velocities, which 
+//  * determine the allowed time step
+//  */
+// __global__ 
+// void kernelMaximum(float* maxhd, float* maxvd, int start, int size) {
+//   int tx = start+threadIdx.x;
+//   for (int i=size>>1; i>0; i>>=1) {
+//      __syncthreads();
+//      if (tx < i) {
+//         if( maxhd[tx] < maxhd[tx+i] ) maxhd[tx] = maxhd[tx+i];
+//         if( maxvd[tx] < maxvd[tx+i] ) maxvd[tx] = maxvd[tx+i];
+//      };
+//   };
+// }
+// 
+// 
--- a/src/SWE_BlockCUDA_kernels.hh
+++ b/src/SWE_BlockCUDA_kernels.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#ifndef __SWE_BLOCKCUDAKERNELS_HH
+#define __SWE_BLOCKCUDAKERNELS_HH
+
+// declaration of CUDA kernels
+// NOTE(review): BoundaryType is not declared in this header; it has to be 
+// visible before this header is included (SWE_BlockCUDA_kernels.cu includes 
+// SWE_BlockCUDA.hh first).
+
+// set the four corner cells of hd (only needed for visualization)
+__global__ 
+void kernelHdBufferEdges(float* hd, int nx, int ny);
+
+// maximum reduction on the arrays maxhd and maxvd
+// NOTE(review): the definition in SWE_BlockCUDA_kernels.cu is commented out; 
+// an active definition exists in SWE_RusanovBlockCUDA_kernels.cu.
+__global__ 
+void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
+
+// boundary conditions WALL & OUTFLOW on the left, right, bottom and top
+// ghost layer
+__global__
+void kernelLeftBoundary(float* hd, float* hud, float* hvd,
+                        int nx, int ny, BoundaryType bound);
+__global__
+void kernelRightBoundary(float* hd, float* hud, float* hvd,
+                         int nx, int ny, BoundaryType bound);
+__global__
+void kernelBottomBoundary(float* hd, float* hud, float* hvd,
+                          int nx, int ny, BoundaryType bound);
+__global__
+void kernelTopBoundary(float* hd, float* hud, float* hvd,
+                       int nx, int ny, BoundaryType bound);
+// set the bottom/top ghost layer from an external ghost layer array 
+// (conditions PASSIVE and CONNECT)
+__global__
+void kernelBottomGhostBoundary(float* hd, float* hud, float* hvd,
+                               float* bottomGhostLayer, int nx, int ny);
+__global__
+void kernelTopGhostBoundary(float* hd, float* hud, float* hvd,
+                            float* topGhostLayer, int nx, int ny);
+// write the bottom-most/top-most inner row of cells into the copy layer array
+__global__
+void kernelBottomCopyLayer(float* hd, float* hud, float* hvd,
+                           float* bottomCopyLayer, int nx, int ny);
+__global__
+void kernelTopCopyLayer(float* hd, float* hud, float* hvd,
+                        float* topCopyLayer, int nx, int ny);
+
+#endif
--- a/src/SWE_RusanovBlockCUDA.cu
+++ b/src/SWE_RusanovBlockCUDA.cu
--- a/src/SWE_RusanovBlockCUDA.hh
+++ b/src/SWE_RusanovBlockCUDA.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#ifndef __SWE_RUSANOVBLOCKCUDA_HH
+#define __SWE_RUSANOVBLOCKCUDA_HH
+
+#include <iostream>
+#include <stdio.h>
+#include <fstream>
+#include <cuda_runtime.h>
+#include "tools/help.hh"
+#include "SWE_Block.hh"
+#include "SWE_BlockCUDA.hh"
+
+using namespace std;
+
+/**
+ * SWE_RusanovBlockCUDA extends the base class SWE_BlockCUDA, 
+ * and provides a concrete CUDA implementation of a simple 
+ * shallow water model based on Rusanov Flux computation on the 
+ * edges and explicit time stepping.
+ */
+class SWE_RusanovBlockCUDA : public SWE_BlockCUDA {
+
+  public:
+    // constructor and destructor (both offsets default to 0)
+    SWE_RusanovBlockCUDA(float _offsetX = 0, float _offsetY = 0);
+    virtual ~SWE_RusanovBlockCUDA();
+    
+  // object methods
+
+    // compute flux terms on edges
+    virtual void computeNumericalFluxes();
+    // execute Euler time step
+    virtual void updateUnknowns(float dt);
+    /// execute a single time step of the simulation
+    virtual void simulateTimestep(float dt);
+    // simulate for specified time range
+    // NOTE(review): the meaning of the returned float is not visible in 
+    // this header -- confirm against the implementation
+    virtual float simulate(float tStart, float tEnd);
+    
+  private:
+     
+    // compute bathymetry source terms
+    void computeBathymetrySources();
+
+    // determine maximum possible time step
+    void computeMaxTimestepCUDA();
+
+    // arrays to hold the values of the flux terms at cell edges
+    float* Fhd;
+    float* Fhud;
+    float* Fhvd;
+    float* Ghd;
+    float* Ghud;
+    float* Ghvd;
+
+    // arrays to hold the bathymetry source terms for the hu and hv equations
+    float* Bxd;
+    float* Byd;
+    
+    // helper arrays: store maximum height and velocities to determine time step
+    // NOTE(review): the base class SWE_BlockCUDA declares private members 
+    // with the same names; these here are separate members of this class
+    float* maxhd;
+    float* maxvd;
+
+    // overload operator<< such that data can be written via cout <<
+    // -> needs to be declared as friend to be allowed to access private data
+    friend ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
+
+#ifdef DBG
+    // --- only required for debugging purposes ---
+    // arrays for fluxes for h,hu,hv in main memory
+    Float2D Fh; 
+    Float2D Fhu;
+    Float2D Fhv;
+    Float2D Gh; 
+    Float2D Ghu;
+    Float2D Ghv;
+    // dump fluxes for h,hu,hv from CUDA device memory into main memory
+    void cudaDumpFlux();
+#endif
+    
+};
+
+// namespace-scope declaration of the stream operator befriended by 
+// SWE_RusanovBlockCUDA
+ostream& operator<< (ostream& os, const SWE_RusanovBlockCUDA& swe);
+
+#endif
--- a/src/SWE_RusanovBlockCUDA_kernels.cu
+++ b/src/SWE_RusanovBlockCUDA_kernels.cu
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader (bader AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Univ.-Prof._Dr._Michael_Bader)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#include "SWE_BlockCUDA.hh"
+#include "SWE_RusanovBlockCUDA_kernels.hh"
+
+//******************************************************************
+// kernels to implement Euler time-stepping
+//******************************************************************
+
+/**
+ * local Lax-Friedrichs flux: average of the two flux values minus a
+ * diffusion term that is proportional to the jump of the unknown
+ * @param fLow,fHigh	flux values on either side of the edge
+ * @param xiLow,xiHigh	values of the unknown on either side of the edge
+ * @param llf		scaling factor of the diffusion term
+ */
+inline __device__
+float computeFlux(float fLow, float fHigh, float xiLow, float xiHigh, float llf) {
+  const float average   = 0.5f*(fLow+fHigh);
+  const float diffusion = 0.5f*llf*(xiHigh-xiLow);
+  return average - diffusion;
+}
+
+/**
+ * computes the flux vector components Fhd, Fhud and Fhvd for a single 
+ * edge by calling the function computeFlux; 
+ * the edge lies between cell (i,j) and its right neighbour (i+1,j), where
+ * i starts at istart and j starts at 1.
+ * Note: the flux for h uses a locally computed diffusion factor (maximum of 
+ * |hu/h| in the two cells), the fluxes for hu and hv use the parameter llf.
+ * Note: the flux arrays are assumed not to overlap with hd, hud, hvd
+ * (all cell values are read before the fluxes are stored).
+ */
+__global__
+void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
+                          float* Fhd, float* Fhud, float* Fhvd,
+                          int ny, float g, float llf, int istart)
+{
+   const int i = istart + TILE_SIZE*blockIdx.x + threadIdx.x;
+   const int j = 1 + TILE_SIZE*blockIdx.y + threadIdx.y;
+   const int cellL = getCellCoord(i,j,ny);	// index of left cell
+   const int cellR = getCellCoord(i+1,j,ny);	// index of right cell
+   const int edge  = getEdgeCoord(i,j,ny);	// index of current edge
+
+   // unknowns in the left and the right cell
+   const float hL  = hd[cellL];
+   const float hR  = hd[cellR];
+   const float huL = hud[cellL];
+   const float huR = hud[cellR];
+   const float hvL = hvd[cellL];
+   const float hvR = hvd[cellR];
+
+   const float upwind = max( fabs(huL/hL), fabs(huR/hR) ); 
+
+   Fhd[edge]  = computeFlux( huL, huR, hL, hR, upwind );
+   Fhud[edge] = computeFlux( huL*huL/hL + 0.5f*g*hL*hL,
+                             huR*huR/hR + 0.5f*g*hR*hR,
+                             huL, huR, 
+                             llf );
+   Fhvd[edge] = computeFlux( huL*hvL/hL, huR*hvR/hR, 
+                             hvL, hvR, 
+                             llf );
+}
+
+/**
+ * computes the flux vector components Ghd, Ghud and Ghvd for a single 
+ * edge by calling the function computeFlux; 
+ * the edge lies between cell (i,j) and its top neighbour (i,j+1), where
+ * i starts at 1 and j starts at jstart.
+ * Note: the flux for h uses a locally computed diffusion factor (maximum of 
+ * |hv/h| in the two cells), the fluxes for hu and hv use the parameter llf.
+ * Note: the flux arrays are assumed not to overlap with hd, hud, hvd
+ * (all cell values are read before the fluxes are stored).
+ */
+__global__
+void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
+                          float* Ghd, float* Ghud, float* Ghvd,
+                          int ny, float g, float llf, int jstart)
+{
+   const int i = 1 + TILE_SIZE*blockIdx.x + threadIdx.x;
+   const int j = jstart + TILE_SIZE*blockIdx.y + threadIdx.y;
+   const int cellB = getCellCoord(i,j  ,ny);	// index of bottom cell
+   const int cellT = getCellCoord(i,j+1,ny);	// index of top cell
+   const int edge  = getEdgeCoord(i,j,ny);	// index of current edge
+
+   // unknowns in the bottom and the top cell
+   const float hB  = hd[cellB];
+   const float hT  = hd[cellT];
+   const float huB = hud[cellB];
+   const float huT = hud[cellT];
+   const float hvB = hvd[cellB];
+   const float hvT = hvd[cellT];
+
+   const float upwind = max( fabs(hvB/hB), fabs(hvT/hT) ); 
+
+   Ghd[edge]  = computeFlux( hvB, hvT, hB, hT, upwind );
+   Ghud[edge] = computeFlux( huB*hvB/hB, huT*hvT/hT, 
+                             huB, huT, 
+                             llf );
+   Ghvd[edge] = computeFlux( hvB*hvB/hB + 0.5f*g*hB*hB,
+                             hvT*hvT/hT + 0.5f*g*hT*hT,
+                             hvB, hvT, 
+                             llf );
+}
+
+/**
+ * computes the bathymetry source terms for the hu and hv equation for 
+ * a given cell in the resp. array elements Bxd and Byd:
+ * g times the mean of h in the two neighbour cells times half the 
+ * difference of b in the two neighbour cells
+ */
+__global__
+void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd, 
+                                    int ny, float g)
+{
+// Note: different index ranges for h and b vs. Bxd, Byd: 
+//       [0..nx+1]x[0..ny+1] vs. [1..nx]x[1..ny]
+// Note: indices for Bxd, Byd are shifted to start with 0, i.e. the 
+//       element (i,j) of Bxd, Byd belongs to cell (i+1,j+1) of hd, bd
+   const int i = TILE_SIZE*blockIdx.x + threadIdx.x;
+   const int j = TILE_SIZE*blockIdx.y + threadIdx.y;
+   const int iSource = getBathyCoord(i,j,ny); // index in arrays Bxd, Byd
+
+   // source term in x-direction: uses left and right neighbour cell
+   const int iLeft  = getCellCoord(i  ,j+1,ny); // left cell (arrays hd,bd)
+   const int iRight = getCellCoord(i+2,j+1,ny); // right cell (arrays hd,bd)
+   Bxd[iSource] = g * 0.5f*(hd[iRight] + hd[iLeft]) * 0.5f*(bd[iRight] - bd[iLeft]);
+
+   // source term in y-direction: uses bottom and top neighbour cell
+   const int iBot = getCellCoord(i+1,j,ny);   // bottom cell (arrays hd,bd)
+   const int iTop = getCellCoord(i+1,j+2,ny); // top cell (arrays hd,bd)
+   Byd[iSource] = g * 0.5f*(hd[iTop] + hd[iBot]) * 0.5f*(bd[iTop] - bd[iBot]);
+
+}
+
+
+/**
+ * CUDA kernel for Euler time step
+ *
+ * Each thread updates the unknowns h, hu and hv of one inner cell (i,j) from 
+ * the fluxes on its four edges and the bathymetry source terms. Afterwards, 
+ * the threads of a block determine the maximum of h and the maximum of the 
+ * absolute velocities max(|hu/h|,|hv/h|) of their cells, and store them in 
+ * maxhd and maxvd (one entry per thread block).
+ *
+ * The kernel is written for thread blocks of TILE_SIZE x TILE_SIZE threads 
+ * and a 2D grid of blocks; it uses two static shared arrays of 
+ * (TILE_SIZE+1) x (TILE_SIZE+1) floats. There is no bounds check, i.e. all 
+ * threads have to belong to cells within the grid.
+ *
+ * @param hd,hud,hvd		unknowns h, hu, hv on device (updated in place)
+ * @param Fhd,Fhud,Fhvd		fluxes on the left/right edges
+ * @param Ghd,Ghud,Ghvd		fluxes on the bottom/top edges
+ * @param Bxd,Byd		bathymetry source terms
+ * @param maxhd,maxvd		output: maxima per thread block, stored at 
+ * 				index blockIdx.x*gridDim.y + blockIdx.y
+ * @param nx,ny			grid size in x- and y-direction (without ghost layers);
+ * 				nx is no longer used, but kept for the interface
+ * @param dt			time step size
+ * @param dxi,dyi		factors applied to the flux differences in x- and 
+ * 				y-direction (presumably 1/dx and 1/dy -- confirm 
+ * 				against the caller)
+ */
+__global__
+void kernelEulerTimestep(float* hd, float* hud, float* hvd,
+                         float* Fhd, float* Fhud, float* Fhvd,
+                         float* Ghd, float* Ghud, float* Ghvd,
+			 float* Bxd, float* Byd,
+			 float* maxhd, float* maxvd,
+                         int nx, int ny, float dt, float dxi, float dyi)
+{
+
+   __shared__ float Fds[TILE_SIZE+1][TILE_SIZE+1];
+   __shared__ float Gds[TILE_SIZE+1][TILE_SIZE+1];
+
+   int tx = threadIdx.x;
+   int ty = threadIdx.y;
+   
+   int i = 1 + TILE_SIZE*blockIdx.x + tx;
+   int j = 1 + TILE_SIZE*blockIdx.y + ty;
+   int iElem = getCellCoord(i,j,ny);   // index of current cell
+   int iEdge = getEdgeCoord(i,j,ny);   // index of right/top Edge
+   int iLeft = getEdgeCoord(i-1,j,ny); // index of left Edge
+   int iBot  = getEdgeCoord(i,j-1,ny); // index of bottom Edge
+   int iBathy = getBathyCoord(i-1,j-1,ny); // index of source terms (0-based)
+   
+   float h;
+   float hu;
+   float hv;
+
+   // copy flux unknowns from global into shared memory
+   // -> for fluxes corresponding to variable h
+   // (threads with tx==0 / ty==0 also load the left / bottom edge of the tile)
+   Fds[tx+1][ty] = Fhd[iEdge];
+   Gds[tx][ty+1] = Ghd[iEdge];
+   if (tx==0) Fds[tx][ty] = Fhd[iLeft];
+   if (ty==0) Gds[tx][ty] = Ghd[iBot];
+   __syncthreads();
+
+   // compute new value of h from fluxes
+   h = hd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi 
+           	        +(Gds[tx][ty+1]-Gds[tx][ty])*dyi );
+   // all threads have to finish reading before the shared arrays are reused
+   __syncthreads();
+
+   // copy flux unknowns from global into shared memory
+   // -> for fluxes corresponding to variable hu
+   Fds[tx+1][ty] = Fhud[iEdge];
+   Gds[tx][ty+1] = Ghud[iEdge];
+   if (tx==0) Fds[tx][ty] = Fhud[iLeft];
+   if (ty==0) Gds[tx][ty] = Ghud[iBot];
+   __syncthreads();
+
+   // compute new value of hu from fluxes
+   hu = hud[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi 
+           	          +(Gds[tx][ty+1]-Gds[tx][ty])*dyi 
+			  + Bxd[iBathy]*dxi );
+   __syncthreads();
+
+   // copy flux unknowns from global into shared memory
+   // -> for fluxes corresponding to variable hv
+   Fds[tx+1][ty] = Fhvd[iEdge];
+   Gds[tx][ty+1] = Ghvd[iEdge];
+   if (tx==0) Fds[tx][ty] = Fhvd[iLeft];
+   if (ty==0) Gds[tx][ty] = Ghvd[iBot];
+   __syncthreads();
+
+   // compute new value of hv from fluxes
+   hv = hvd[iElem] - dt *( (Fds[tx+1][ty]-Fds[tx][ty])*dxi 
+           	          +(Gds[tx][ty+1]-Gds[tx][ty])*dyi 
+			  + Byd[iBathy]*dyi );
+   __syncthreads();
+
+   /* precompute maximal height and velocity per thread block 
+    * (for computation of allowed time step size)
+    */
+
+   // store the new unknowns, and put h and the larger one of the absolute 
+   // velocities into shared memory (float literals avoid double arithmetic)
+   hd[iElem] = h; Fds[tx][ty] = h;
+   hud[iElem] = hu; hu = (h>0.0f) ? fabs(hu/h) : 0.0f;
+   hvd[iElem] = hv; hv = (h>0.0f) ? fabs(hv/h) : 0.0f; 
+   Gds[tx][ty] = (hu>hv) ? hu : hv;
+   
+   // parallel reduction on thread block:
+   // determine maximum wave height and velocity
+   // step 1: reduction in ty-direction
+   for (int stride=TILE_SIZE>>1; stride>0; stride>>=1) {
+      __syncthreads();
+      if (ty < stride) {
+         if( Fds[tx][ty] < Fds[tx][ty+stride]) Fds[tx][ty] = Fds[tx][ty+stride];
+         if( Gds[tx][ty] < Gds[tx][ty+stride]) Gds[tx][ty] = Gds[tx][ty+stride];
+      }
+   }
+   // step 2: reduction in tx-direction (only row ty==0 holds the maxima)
+   for (int stride=TILE_SIZE>>1; stride>0; stride>>=1) {
+      __syncthreads();
+      if ((tx < stride) && (ty==0)) {
+         if( Fds[tx][ty] < Fds[tx+stride][ty]) Fds[tx][ty] = Fds[tx+stride][ty];
+         if( Gds[tx][ty] < Gds[tx+stride][ty]) Gds[tx][ty] = Gds[tx+stride][ty];
+      }
+   }
+
+   // save maxima in array maxhd and maxvd:
+   // the blocks are numbered via blockIdx.x*gridDim.y + blockIdx.y, which 
+   // gives a unique index in [0, gridDim.x*gridDim.y) also for nx != ny
+   // (the former stride nx/TILE_SIZE was only correct for nx == ny)
+   if ((tx == 0) && (ty==0)) {
+      int iBlock = blockIdx.x*gridDim.y + blockIdx.y;
+      maxhd[iBlock] = Fds[0][0];
+      maxvd[iBlock] = Gds[0][0];
+   }
+   
+}
+
+
+//******************************************************************
+// kernels to determine the maxima required for the time step size
+//******************************************************************
+
+
+/**
+ * CUDA kernel for maximum reduction
+ * required to compute the maximum water height and velocities, which 
+ * determine the allowed time step.
+ * In each step, the elements tx < stride of maxhd and maxvd are replaced by 
+ * the maximum of the elements tx and tx+stride; stride is halved from 
+ * size/2 down to 1.
+ * Note: __syncthreads() only synchronises the threads of a single block.
+ * NOTE(review): tx includes the offset start, but is compared with the 
+ * stride directly -- for start != 0 the threads skip the first steps; 
+ * confirm the intended use against the caller.
+ */
+__global__ 
+void kernelMaximum(float* maxhd, float* maxvd, int start, int size) {
+  const int tx = start+threadIdx.x;
+  for (int stride=size>>1; stride>0; stride>>=1) {
+     // barrier is reached by all threads in every step
+     __syncthreads();
+     // threads beyond the active range idle in this step
+     if (tx >= stride) continue;
+
+     if( maxhd[tx] < maxhd[tx+stride] ) maxhd[tx] = maxhd[tx+stride];
+     if( maxvd[tx] < maxvd[tx+stride] ) maxvd[tx] = maxvd[tx+stride];
+  }
+}
+
+
+
--- a/src/SWE_RusanovBlockCUDA_kernels.hh
+++ b/src/SWE_RusanovBlockCUDA_kernels.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Michael Bader, Kaveh Rahnema, Tobias Schnabel
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * TODO
+ */
+
+#ifndef __SWE_RUSANOVBLOCKCUDAKERNELS_HH
+#define __SWE_RUSANOVBLOCKCUDAKERNELS_HH
+
+//******************************************************************
+// kernels to implement Euler time-stepping
+//******************************************************************
+
+// compute the fluxes Fhd, Fhud, Fhvd on the edges between cells (i,j) and 
+// (i+1,j); the cell index i starts at istart
+__global__
+void kernelComputeFluxesF(float* hd, float* hud, float* hvd,
+                          float* Fhd, float* Fhud, float* Fhvd,
+                          int ny, float g, float llf, int istart);
+
+// compute the fluxes Ghd, Ghud, Ghvd on the edges between cells (i,j) and 
+// (i,j+1); the cell index j starts at jstart
+__global__
+void kernelComputeFluxesG(float* hd, float* hud, float* hvd,
+                          float* Ghd, float* Ghud, float* Ghvd,
+                          int ny, float g, float llf, int jstart);
+
+// compute the bathymetry source terms Bxd and Byd from hd and bd
+__global__
+void kernelComputeBathymetrySources(float* hd, float* bd, float* Bxd, float* Byd, 
+                                    int ny, float g);
+
+// Euler time step: update hd, hud, hvd from fluxes and source terms, and 
+// store the maxima per thread block in maxhd and maxvd
+__global__
+void kernelEulerTimestep(float* hd, float* hud, float* hvd,
+                         float* Fhd, float* Fhud, float* Fhvd,
+                         float* Ghd, float* Ghud, float* Ghvd,
+			 float* Bxd, float* Byd,
+			 float* maxhd, float* maxvd,
+                         int nx, int ny, float dt, float dxi, float dyi);
+
+
+// maximum reduction on the arrays maxhd and maxvd
+__global__ 
+void kernelMaximum(float* maxhd, float* maxvd, int start, int size);
+
+#endif
+
+
+
--- a/src/SWE_WavePropagationBlockCuda.cu
+++ b/src/SWE_WavePropagationBlockCuda.cu
--- a/src/SWE_WavePropagationBlockCuda.hh
+++ b/src/SWE_WavePropagationBlockCuda.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
+ */
+
+#ifndef SWEWAVEPROPAGATIONBLOCKCUDA_HH_
+#define SWEWAVEPROPAGATIONBLOCKCUDA_HH_
+
+#include <cassert>
+#include "SWE_BlockCUDA.hh"
+
+/**
+ * SWE_WavePropagationBlockCuda is an implementation of the SWE_BlockCUDA abstract class.
+ * It uses a wave propagation solver which is defined with the pre-compiler flag WAVE_PROPAGATION_SOLVER
+ * (NOTE(review): the flag is not referenced in this header -- confirm where it is evaluated).
+ *
+ * Possible wave propagation solvers are:
+ *  F-Wave, <strike>Approximate Augmented Riemann, Hybrid (f-wave + augmented).</strike>
+ *  (details can be found in the corresponding source files)
+ */
+class SWE_WavePropagationBlockCuda: public SWE_BlockCUDA {
+  // the following members are private (default access of a class)
+  //private:
+    //! "2D array" which holds the net-updates for the water height (wave propagating to the left).
+    float* hNetUpdatesLeftD;
+    //! "2D array" which holds the net-updates for the water height (wave propagating to the right).
+    float* hNetUpdatesRightD;
+
+    //! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the left).
+    float* huNetUpdatesLeftD;
+    //! "2D array" which holds the net-updates for the momentum in x-direction (wave propagating to the right).
+    float* huNetUpdatesRightD;
+
+
+    //! "2D array" which holds the net-updates for the water height (wave propagating to the top).
+    float* hNetUpdatesBelowD;
+    //! "2D array" which holds the net-updates for the water height (wave propagating to the bottom).
+    float* hNetUpdatesAboveD;
+
+    //! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the top).
+    float* hvNetUpdatesBelowD;
+    //! "2D array" which holds the net-updates for the momentum in y-direction (wave propagating to the bottom).
+    float* hvNetUpdatesAboveD;
+
+  public:
+    // constructor of SWE_WavePropagationBlockCuda (both offsets default to 0)
+    SWE_WavePropagationBlockCuda( const float i_offsetX = 0,
+                                  const float i_offsetY = 0 );
+
+    // destructor of SWE_WavePropagationBlockCuda
+    ~SWE_WavePropagationBlockCuda();
+
+    // compute a single time step (net-updates + update of the cells).
+    void simulateTimestep( float i_dT );
+
+    // TODO: Not implemented
+    // Fails the assertion when called; if assertions are disabled (NDEBUG),
+    // the call silently returns 0.
+    float simulate(float, float) {
+      assert(false);
+      return 0;
+    };
+
+    // TODO: not implemented, max time step reduction is done in each call of computeNumericalFluxes(...)
+    // Fails the assertion when called; if assertions are disabled (NDEBUG),
+    // the call does nothing.
+    void computeMaxTimestep() {
+      assert(false);
+    };
+
+    // compute the numerical fluxes (net-update formulation here).
+    void computeNumericalFluxes();
+
+    // compute the new cell values.
+    void updateUnknowns(const float i_deltaT);
+};
+
+#endif /* SWEWAVEPROPAGATIONBLOCKCUDA_HH_ */
--- a/src/SWE_WavePropagationBlockCuda_kernels.cu
+++ b/src/SWE_WavePropagationBlockCuda_kernels.cu
--- a/src/SWE_WavePropagationBlockCuda_kernels.hh
+++ b/src/SWE_WavePropagationBlockCuda_kernels.hh
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
+ */
+
+#ifndef SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
+#define SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_
+
+/**
+ * CUDA-kernel which computes the net-updates.
+ *
+ * Only the prototype lives in this header; the expected grid/block layout is not
+ * documented here and has to be taken from the kernel definition.
+ * The i_ pointers are const (read-only for the kernel); the o_ pointers are non-const outputs.
+ * NOTE(review): since this is a __global__ function, all pointers presumably have to
+ * reference memory accessible from the device -- confirm at the launch site.
+ *
+ * @param i_h,i_hu,i_hv,i_b read-only float arrays. NOTE(review): presumably water height,
+ *        momentum in x-/y-direction and bathymetry -- confirm against the kernel definition.
+ * @param o_hNetUpdatesLeftD,o_hNetUpdatesRightD output arrays for the water height net-updates (left/right).
+ * @param o_huNetUpdatesLeftD,o_huNetUpdatesRightD output arrays for the hu net-updates (left/right).
+ * @param o_hNetUpdatesBelowD,o_hNetUpdatesAboveD output arrays for the water height net-updates (below/above).
+ * @param o_hvNetUpdatesBelowD,o_hvNetUpdatesAboveD output arrays for the hv net-updates (below/above).
+ * @param o_maximumWaveSpeeds output array for maximum wave speeds (element count not visible in this header).
+ * @param i_nx,i_ny integer sizes. NOTE(review): presumably the number of cells in x-/y-direction -- confirm.
+ * @param i_offsetX,i_offsetY integer offsets, default 0 (exact meaning defined by the kernel definition).
+ * @param i_blockOffSetX,i_blockOffSetY integer block offsets, default 0 (exact meaning defined by the kernel definition).
+ */
+__global__
+void computeNetUpdatesKernel(
+    const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
+    float* o_hNetUpdatesLeftD,   float* o_hNetUpdatesRightD,
+    float* o_huNetUpdatesLeftD,  float* o_huNetUpdatesRightD,
+    float* o_hNetUpdatesBelowD,  float* o_hNetUpdatesAboveD,
+    float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
+    float* o_maximumWaveSpeeds,
+    const int i_nx, const int i_ny,
+    const int i_offsetX = 0, const int i_offsetY = 0,
+    const int i_blockOffSetX = 0, const int i_blockOffSetY = 0
+);
+
+/**
+ * CUDA-kernel which updates the unknowns.
+ *
+ * Only the prototype lives in this header; the expected grid/block layout is not
+ * documented here and has to be taken from the kernel definition.
+ * The i_ pointers are const (read-only for the kernel); the io_ pointers are non-const
+ * and, judging by their prefix, are both read and written.
+ *
+ * @param i_hNetUpdatesLeftD,i_hNetUpdatesRightD read-only water height net-updates (left/right).
+ * @param i_huNetUpdatesLeftD,i_huNetUpdatesRightD read-only hu net-updates (left/right).
+ * @param i_hNetUpdatesBelowD,i_hNetUpdatesAboveD read-only water height net-updates (below/above).
+ * @param i_hvNetUpdatesBelowD,i_hvNetUpdatesAboveD read-only hv net-updates (below/above).
+ * @param io_h,io_hu,io_hv unknowns which are updated by the kernel.
+ * @param i_updateWidthX,i_updateWidthY float update widths for the x-/y-direction.
+ *        NOTE(review): presumably time step size divided by cell size -- confirm at the launch site.
+ * @param i_nx,i_ny integer sizes. NOTE(review): presumably the number of cells in x-/y-direction -- confirm.
+ */
+__global__
+void updateUnknownsKernel(
+    const float* i_hNetUpdatesLeftD,   const float* i_hNetUpdatesRightD,
+    const float* i_huNetUpdatesLeftD,  const float* i_huNetUpdatesRightD,
+    const float* i_hNetUpdatesBelowD,  const float* i_hNetUpdatesAboveD,
+    const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
+    float* io_h, float* io_hu, float* io_hv,
+    const float i_updateWidthX, const float i_updateWidthY,
+    const int i_nx, const int i_ny
+);
+
+/**
+ * CUDA device function which computes the 1D position in an array from a given 2D index.
+ * Despite the "Kernel" suffix this is declared __device__ (not __global__), so it is
+ * callable from device code only and cannot be launched as a kernel.
+ *
+ * NOTE(review): the function is declared inline, but this header contains no definition;
+ * every translation unit calling it needs to see the definition -- confirm where it lives.
+ *
+ * @param i_i first component of the 2D index.
+ * @param i_j second component of the 2D index.
+ * @param i_nx size used to linearize the index (the exact formula is given by the definition).
+ * @return 1D position in the array.
+ */
+__device__
+inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_nx);
+
+#endif /* SWEWAVEPROPAGATIONBLOCKCUDAKERNELS_HH_ */
--- a/src/examples/swe_wavepropagation.cpp
+++ b/src/examples/swe_wavepropagation.cpp
@@ -31,7 +31,11 @@
 #include <string>

 #include "../SWE_Block.hh"
+#ifndef CUDA
 #include "../SWE_WavePropagationBlock.hh"
+#else
+#include "../SWE_WavePropagationBlockCuda.hh"
+#endif
 #include "../scenarios/SWE_simple_scenarios.h"
 #include "../tools/Logger.hpp"

@@ -91,7 +95,11 @@ int main( int argc, char** argv ) {
  l_originY = l_scenario.getBoundaryPos(BND_BOTTOM);

  // create a single wave propagation block
+  #ifndef CUDA
  SWE_WavePropagationBlock l_wavePropgationBlock(l_originX, l_originY);
+  #else
+  SWE_WavePropagationBlockCuda l_wavePropgationBlock(l_originX, l_originY);
+  #endif

  // initialize the wave propgation block
  l_wavePropgationBlock.initScenario(l_scenario);
@@ -115,7 +123,7 @@ int main( int argc, char** argv ) {
  /**
   * Simulation.
   */
-  // print a start message and reset the wall clock time
+  // print the start message and reset the wall clock time
  s_sweLogger.printStartMessage();
  s_sweLogger.initWallClockTime(time(NULL));

@@ -165,7 +173,7 @@ int main( int argc, char** argv ) {
  s_sweLogger.printStatisticsMessage();

  // print the cpu time
-  s_sweLogger.printCpuTime("CPU time");
+  s_sweLogger.printCpuTime("CPU/GPU time");

  // print the wall clock time (includes plotting)
  s_sweLogger.printWallClockTime(time(NULL));

--- a/src/scenarios/SWE_simple_scenarios.h
+++ b/src/scenarios/SWE_simple_scenarios.h
@@ -28,7 +28,7 @@
 #ifndef __SWE_SIMPLE_SCENARIOS_H
 #define __SWE_SIMPLE_SCENARIOS_H

-#include <math.h>
+#include <cmath>

 #include "SWE_Scenario.h"

@@ -56,12 +56,39 @@ class SWE_BathymetryDamBreakScenario : public SWE_Scenario {
  public:

    float getBathymetry(float x, float y) { 
-       // return ( sqrt( (x-0.3f)*(x-0.3f) + (y-0.8f)*(y-0.8f) ) < 0.1f ) ? 0.1f: 0.0f;
-       return ( sqrt( (x-0.5f)*(x-0.5f) + (y-0.5f)*(y-0.5f) ) < 0.1f ) ? 0.1f: 0.0f;
+       return ( std::sqrt( (x-500.f)*(x-500.f) + (y-500.f)*(y-500.f) ) < 50.f ) ? -250.f: -260.f;
    };
-    virtual float endSimulation() { return 0.2f; };
+    virtual float endSimulation() { return (float) 15; };

    virtual BoundaryType getBoundaryType(BoundaryEdge edge) { return OUTFLOW; };
+
+    /** Get the position of one boundary edge of this scenario (hard-coded to either 0 or 1000).
+     *
+     * @param i_edge edge to query: BND_LEFT, BND_RIGHT or BND_BOTTOM; every other value falls through to the final else branch.
+     * @return 0 for BND_LEFT and BND_BOTTOM, 1000 for BND_RIGHT and for every other edge value.
+     */
+    float getBoundaryPos(BoundaryEdge i_edge) {
+       if ( i_edge == BND_LEFT )
+         return (float)0;
+       else if ( i_edge == BND_RIGHT)
+         return (float)1000;
+       else if ( i_edge == BND_BOTTOM )
+         return (float)0;
+       else
+         return (float)1000;
+    };
+
+    /**
+     * Get the water height at a specific location; this scenario returns the constant 270 everywhere.
+     *
+     * @param i_positionX position relative to the origin of the bathymetry grid in x-direction (not used by this implementation)
+     * @param i_positionY position relative to the origin of the bathymetry grid in y-direction (not used by this implementation)
+     * @return water height (before the initial displacement), always 270
+     */
+    float getWaterHeight( float i_positionX,
+                          float i_positionY ) {
+      return (float) 270;
+    }
 };

 /**