Added teaching skeleton for the Gene Golub SIAM Summer School 2012.

4e10624f · breuera · 8a5d3e10 · 4e10624f · 4e10624f · 4e10624f
Commit 4e10624f authored Jul 27, 2012 by breuera
3 changed files
--- a/src_skeleton/Readme.md
+++ b/src_skeleton/Readme.md
+Teaching skeleton
+===
+# CUDA skeleton
+This directory holds the CUDA skeleton for the *Gene Golub SIAM Summer School 2012*.
+There are two CUDA kernels in the file SWE_WavePropagationBlockCuda_kenels.cu, which need to be implemented:
+1. void computeNetUpdatesKernel([...])
+2. void updateUnknownsKernel([...])
+A C++-reference is available in the file [SWE_WavePropagationBlock.cpp](https://github.com/TUM-I5/SWE/blob/master/src/SWE_WavePropagationBlock.cpp), remark: The sizes of the arrays differs from the planned CUDA implementation. This is outlined in more deatail within the constructor of [SWE_WavePropagationBlockCuda][https://github.com/TUM-I5/SWE/blob/3f9a316d196005d39496ce7231a57c6cf3961ec3/src/SWE_WavePropagationBlockCuda.cu#L52).
+The [CUDA version of the f-wave solver](https://github.com/TUM-I5/swe_solvers/blob/master/src/solver/FWaveCuda.h) should be used for the computation of the net-updates.
+Theres a tested CUDA implementation of the kernels implemented as well in the src directory of SWE [src/WE_WavePropagationBlockCuda_kernels.cu](https://github.com/TUM-I5/SWE/blob/master/src/SWE_WavePropagationBlockCuda_kernels.cu). Nevertheless we recommend to work with the C++-implementation as reference only due to the improved learning effect.
+# Main file
+An example main file is located at [src/examples/swe_wavepropagation.cpp](https://github.com/TUM-I5/SWE/blob/master/src/examples/swe_wavepropagation.cpp). Details about the compile and linking process can be found in the corresponding [SCons-script](https://github.com/TUM-I5/SWE/blob/master/src/SConscript), which selects the source files.
--- a/src_skeleton/SWE_WavePropagationBlockCuda.cu
+++ b/src_skeleton/SWE_WavePropagationBlockCuda.cu
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * SWE_Block in CUDA, which uses solvers in the wave propagation formulation.
+ */
+#include "SWE_WavePropagationBlockCuda.hh"
+#include "SWE_BlockCUDA.hh"
+#include "SWE_WavePropagationBlockCuda_kernels.hh"
+#include <cassert>
+#ifndef STATICLOGGER
+#define STATICLOGGER
+#include "tools/Logger.hpp"
+static tools::Logger s_sweLogger;
+#endif
+// CUDA-C includes
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+// Thrust library (used for the final maximum reduction in the method computeNumericalFluxes(...))
+#include <thrust/device_vector.h>
+/**
+ * Constructor of a SWE_WavePropagationBlockCuda.
+ *
+ * Allocates the variables for the simulation:
+ *   Please note: The definition of indices changed in contrast to the CPU-Implementation.
+ *
+ *   unknowns hd,hud,hvd,bd stored on the CUDA device are defined for grid indices [0,..,nx+1]*[0,..,ny+1] (-> Abstract class SWE_BlockCUDA)
+ *     -> computational domain is [1,..,nx]*[1,..,ny]
+ *     -> plus ghost cell layer
+ *
+ *   net-updates are defined for edges with indices [0,..,nx]*[0,..,ny] for horizontal and vertical edges for simplicity (one layer is not necessary).
+ *
+ *   A left/right net update with index (i-1,j) is located on the edge between
+ *   cells with index (i-1,j) and (i,j):
+ * <pre>
+ *   *********************
+ *   *         *         *
+ *   * (i-1,j) *  (i,j)  *
+ *   *         *         *
+ *   *********************
+ *
+ *             *
+ *            ***
+ *           *****
+ *             *
+ *             *
+ *   NetUpdatesLeft(i-1,j)
+ *             or
+ *   NetUpdatesRight(i-1,j)
+ * </pre>
+ *
+ *   A below/above net update with index (i, j-1) is located on the edge between
+ *   cells with index (i, j-1) and (i,j):
+ * <pre>
+ *   ***********
+ *   *         *
+ *   * (i, j)  *   *
+ *   *         *  **  NetUpdatesBelow(i,j-1)
+ *   *********** *****         or
+ *   *         *  **  NetUpdatesAbove(i,j-1)
+ *   * (i,j-1) *   *
+ *   *         *
+ *   ***********
+ * </pre>
+ * @param i_offsetX spatial offset of the block in x-direction.
+ * @param i_offsetY spatial offset of the offset in y-direction.
+ * @param i_cudaDevice ID of the CUDA-device, which should be used.
+ */
+SWE_WavePropagationBlockCuda::SWE_WavePropagationBlockCuda( const float i_offsetX,
+                                                            const float i_offsetY,
+                                                            const int i_cudaDevice ): SWE_BlockCUDA(i_offsetX, i_offsetY, i_cudaDevice) {
+  // compute the size of one 1D net-update array.
+  int sizeOfNetUpdates = (nx+1)*(ny+1)*sizeof(float);
+  // allocate CUDA memory for the net-updates
+  cudaMalloc((void**)&hNetUpdatesLeftD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hNetUpdatesLeftD");
+  cudaMalloc((void**)&hNetUpdatesRightD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hNetUpdatesRightD");
+  cudaMalloc((void**)&huNetUpdatesLeftD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for huNetUpdatesLeftD");
+  cudaMalloc((void**)&huNetUpdatesRightD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for huNetUpdatesRightD");
+  cudaMalloc((void**)&hNetUpdatesBelowD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hNetUpdatesBelowD");
+  cudaMalloc((void**)&hNetUpdatesAboveD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hNetUpdatesAboveD");
+  cudaMalloc((void**)&hvNetUpdatesBelowD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hvNetUpdatesBelowD");
+  cudaMalloc((void**)&hvNetUpdatesAboveD, sizeOfNetUpdates);
+  checkCUDAError("allocate device memory for hNetUpdatesAboveD");
+}
+/**
+ * Destructor of a SWE_WavePropagationBlockCuda.
+ *
+ * Frees all of the memory, which was allocated within the constructor.
+ * Resets the CUDA device: Useful if error occured and printf is used on the device (buffer).
+ */
+SWE_WavePropagationBlockCuda::~SWE_WavePropagationBlockCuda() {
+  // free the net-updates memory
+  cudaFree(hNetUpdatesLeftD);
+  cudaFree(hNetUpdatesRightD);
+  cudaFree(huNetUpdatesLeftD);
+  cudaFree(huNetUpdatesRightD);
+  cudaFree(hNetUpdatesBelowD);
+  cudaFree(hNetUpdatesAboveD);
+  cudaFree(hvNetUpdatesBelowD);
+  cudaFree(hvNetUpdatesAboveD);
+  // reset the cuda device
+  s_sweLogger.printString("Resetting the CUDA devices");
+  cudaDeviceReset();
+}
+/**
+ * Compute a single global time step of a given time step width.
+ * Remark: The user has to take care about the time step width. No additional check is done. The time step width typically available
+ *         after the computation of the numerical fluxes (hidden in this method).
+ *
+ * First the net-updates are computed.
+ * Then the cells are updated with the net-updates and the given time step width.
+ *
+ * @param i_dT time step width in seconds.
+ */
+__host__
+void SWE_WavePropagationBlockCuda::simulateTimestep(float i_dT) {
+ // Compute the numerical fluxes/net-updates in the wave propagation formulation.
+ computeNumericalFluxes();
+ // Update the unknowns with the net-updates.
+ updateUnknowns(i_dT);
+}
+/**
+ * perform forward-Euler time steps, starting with simulation time tStart,:
+ * until simulation time tEnd is reached; 
+ * device-global variables hd, hud, hvd are updated;
+ * unknowns h, hu, hv in main memory are not updated.
+ * Ghost layers and bathymetry sources are updated between timesteps.
+ * intended as main simulation loop between two checkpoints
+ */
+__host__
+float SWE_WavePropagationBlockCuda::simulate(float tStart, float tEnd) {
+  float t = tStart;
+  do {
+     // set values in ghost cells:
+     setGhostLayer();
+     // Compute the numerical fluxes/net-updates in the wave propagation formulation.
+     computeNumericalFluxes();
+     // Update the unknowns with the net-updates.
+     updateUnknowns(maxTimestep);
+	 t += maxTimestep;
+  } while(t < tEnd);
+  return t;
+}
+/**
+ * Compute the numerical fluxes (net-update formulation here) on all edges.
+ *
+ * The maximum wave speed is computed within the net-updates kernel for each CUDA-block.
+ * To finalize the method the Thrust-library is called, which does the reduction over all blockwise maxima.
+ *   In the wave speed reduction step the actual cell width in x- and y-direction is not taken into account.
+ */
+void SWE_WavePropagationBlockCuda::computeNumericalFluxes() {
+  /*
+   * Initialization.
+   */
+  /*
+   * TODO: This part needs to be implemented.
+   */
+  // "2D array" which holds the blockwise maximum wave speeds
+  float* l_maximumWaveSpeedsD;
+  // size of the maximum wave speed array (dimension of the grid + ghost layers, without the top right block), sizeof(float) not included
+  int l_sizeMaxWaveSpeeds = ((dimGrid.x+1)*(dimGrid.y+1)-1);
+  cudaMalloc((void**)&l_maximumWaveSpeedsD, (l_sizeMaxWaveSpeeds*sizeof(float)) );
+  /*
+   * Compute the net updates for the 'main part and the two 'boundary' parts.
+   */
+  /*
+   * TODO: This part needs to be implemented.
+   */
+  /*
+   * Finalize (max reduction of the maximumWaveSpeeds-array.)
+   *
+   * The Thrust library is used in this step.
+   * An optional kernel could be written for the maximum reduction.
+   */
+  // Thrust pointer to the device array
+  thrust::device_ptr<float> l_thrustDevicePointer(l_maximumWaveSpeedsD);
+  // use Thrusts max_element-function for the maximum reduction
+  thrust::device_ptr<float> l_thrustDevicePointerMax = thrust::max_element(l_thrustDevicePointer, l_thrustDevicePointer+l_sizeMaxWaveSpeeds);
+  // get the result from the device
+  float l_maximumWaveSpeed = l_thrustDevicePointerMax[0];
+  // free the max wave speeds array on the device
+  cudaFree(l_maximumWaveSpeedsD);
+  // set the maximum time step for this SWE_WavePropagationBlockCuda
+  maxTimestep = std::min( dx/l_maximumWaveSpeed, dy/l_maximumWaveSpeed );
+  // CFL = 0.5
+  maxTimestep *= (float)0.4;
+}
+/**
+ * Update the cells with a given global time step.
+ *
+ * @param i_deltaT time step size.
+ */
+void SWE_WavePropagationBlockCuda::updateUnknowns(const float i_deltaT) {
+  /*
+   * TODO: This part needs to be implemented.
+   */
+  // synchronize the copy layer for MPI communication
+  #ifdef USEMPI
+  synchCopyLayerBeforeRead();
+  #endif
+}
--- a/src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu
+++ b/src_skeleton/SWE_WavePropagationBlockCuda_kernels.cu
+/**
+ * @file
+ * This file is part of SWE.
+ *
+ * @author Alexander Breuer (breuera AT in.tum.de, http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer)
+ *
+ * @section LICENSE
+ *
+ * SWE is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * SWE is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with SWE.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * @section DESCRIPTION
+ *
+ * CUDA Kernels for a SWE_Block, which uses solvers in the wave propagation formulation.
+ */
+#include "SWE_BlockCUDA.hh"
+#include "SWE_WavePropagationBlockCuda_kernels.hh"
+#include <cmath>
+#include <cstdio>
+#include "solvers/FWaveCuda.h"
+/**
+ * The compute net-updates kernel calls the solver for a defined CUDA-Block and does a reduction over the computed wave speeds within this block.
+ *
+ * Remark: In overall we have nx+1 / ny+1 edges.
+ *         Therefore the edges "simulation domain"/"top ghost layer" and "simulation domain"/"right ghost layer"
+ *         will not be computed in a typical call of the function:
+ *           computeNetUpdatesKernel<<<dimGrid,dimBlock>>>( hd, hud, hvd, bd,
+ *                                                          hNetUpdatesLeftD,  hNetUpdatesRightD,
+ *                                                          huNetUpdatesLeftD, huNetUpdatesRightD,
+ *                                                          hNetUpdatesBelowD, hNetUpdatesAboveD,
+ *                                                          hvNetUpdatesBelowD, hvNetUpdatesAboveD,
+ *                                                          l_maximumWaveSpeedsD,
+ *                                                          i_nx, i_ny
+ *                                                        );
+ *         To reduce the effect of branch-mispredictions the kernel provides optional offsets, which can be used to compute the missing edges.
+ *
+ *
+ * @param i_h water heights (CUDA-array).
+ * @param i_hu momentums in x-direction (CUDA-array).
+ * @param i_hv momentums in y-direction (CUDA-array).
+ * @param i_b bathymetry values (CUDA-array).
+ * @param o_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
+ * @param o_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
+ * @param o_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
+ * @param o_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
+ * @param o_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
+ * @param o_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
+ * @param o_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
+ * @param o_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
+ * @param o_maximumWaveSpeeds maximum wave speed which occurred within the CUDA-block is written here (CUDA-array).
+ * @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
+ * @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
+ * @param i_offsetX cell/edge offset in x-direction.
+ * @param i_offsetY cell/edge offset in y-direction.
+ */
+__global__
+void computeNetUpdatesKernel(
+    const float* i_h, const float* i_hu, const float* i_hv, const float* i_b,
+    float* o_hNetUpdatesLeftD,   float* o_hNetUpdatesRightD,
+    float* o_huNetUpdatesLeftD,  float* o_huNetUpdatesRightD,
+    float* o_hNetUpdatesBelowD,  float* o_hNetUpdatesAboveD,
+    float* o_hvNetUpdatesBelowD, float* o_hvNetUpdatesAboveD,
+    float* o_maximumWaveSpeeds,
+    const int i_nX, const int i_nY,
+    const int i_offsetX, const int i_offsetY,
+    const int i_blockOffSetX, const int i_blockOffSetY
+) {
+  /*
+   * TODO: This kernel needs to be implemented.
+   */
+}
+/**
+ * The "update unknowns"-kernel updates the unknowns in the cells with precomputed net-updates.
+ *
+ * @param i_hNetUpdatesLeftD left going net-updates for the water height (CUDA-array).
+ * @param i_hNetUpdatesRightD right going net-updates for the water height (CUDA-array).
+ * @param i_huNetUpdatesLeftD left going net-updates for the momentum in x-direction (CUDA-array).
+ * @param i_huNetUpdatesRightD right going net-updates for the momentum in x-direction (CUDA-array).
+ * @param i_hNetUpdatesBelowD downwards going net-updates for the water height (CUDA-array).
+ * @param i_hNetUpdatesAboveD upwards going net-updates for the water height (CUDA-array).
+ * @param i_hvNetUpdatesBelowD downwards going net-updates for the momentum in y-direction (CUDA-array).
+ * @param i_hvNetUpdatesAboveD upwards going net-updates for the momentum in y-direction (CUDA-array).
+ * @param io_h water heights (CUDA-array).
+ * @param io_hu momentums in x-direction (CUDA-array).
+ * @param io_hv momentums in y-direction (CUDA-array).
+ * @param i_updateWidthX update width in x-direction.
+ * @param i_updateWidthY update width in y-direction.
+ * @param i_nx number of cells within the simulation domain in x-direction (excludes ghost layers).
+ * @param i_ny number of cells within the simulation domain in y-direction (excludes ghost layers).
+ */
+__global__
+void updateUnknownsKernel(
+    const float* i_hNetUpdatesLeftD,   const float* i_hNetUpdatesRightD,
+    const float* i_huNetUpdatesLeftD,  const float* i_huNetUpdatesRightD,
+    const float* i_hNetUpdatesBelowD,  const float* i_hNetUpdatesAboveD,
+    const float* i_hvNetUpdatesBelowD, const float* i_hvNetUpdatesAboveD,
+    float* io_h, float* io_hu, float* io_hv,
+    const float i_updateWidthX, const float i_updateWidthY,
+    const int i_nX, const int i_nY ) {
+  /*
+   * TODO: This kernel needs to be implemented.
+   */
+}
+/**
+ * Compute the position of 2D coordinates in a 1D array.
+ *   array[i][j] -> i * ny + j
+ *
+ * @param i_i row index.
+ * @param i_j column index.
+ * @param i_ny #(cells in y-direction).
+ * @return 1D index.
+ */
+__device__
+inline int computeOneDPositionKernel(const int i_i, const int i_j, const int i_ny) {
+  return i_i*i_ny + i_j;
+}