Commit c22e9dbc authored by drossostum's avatar drossostum

Using Intel Cilk for Fwave

Signed-off-by: 's avatardrossostum <drevangel@mytum.de>
parent c81126f6
/**
* FWaveCuda.h
* FWaveVec.h
*
****
**** This is a C++ wrapper for the Cuda implementation of the F-Wave solver (FWave.hpp).
**** This is a vectorizable C++ implementation of the F-Wave solver (FWave.hpp).
****
*
* Created on: Nov 13, 2012
* Last Update: Nov 16, 2012
* Last Update: Dec 28, 2013
*
****
*
* Author: Sebastian Rettenberger
* Homepage: http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.
* E-Mail: rettenbs AT in.tum.de
* Some optimzations: Michael Bader
* Homepage: http://www5.in.tum.de/wiki/index.php/Michael_Bader
* E-Mail: bader AT in.tum.de
*
****
*
......@@ -25,13 +28,10 @@
* volume={24},
* number={3},
* pages={955--978},
* year={2002},
* publisher={Citeseer}}
* year={2002}}
*
* @book{leveque2002finite,
* Author = {LeVeque, R. J.},
* Date-Added = {2011-09-13 14:09:31 +0000},
* Date-Modified = {2011-10-31 09:46:40 +0000},
* Publisher = {Cambridge University Press},
* Title = {Finite Volume Methods for Hyperbolic Problems},
* Volume = {31},
......@@ -49,202 +49,260 @@
#ifndef FWAVEVEC_HPP_
#define FWAVEVEC_HPP_
//define vlength - test and then push to separate header file, common to solver and block classes
#define vlength 4
#include <cmath>
#include <cilk/cilk.h>
#include <malloc.h>
#define T float
namespace solver
{
/**
*
*/
template<typename T>
//template<typename T>
class FWaveVec
{
private:
const T dryTol;
const T gravity;
const T half_gravity; // 0.5 * gravity constant
const T sqrt_gravity; // square root of the gravity constant
const T zeroTol;
public:
FWaveVec(T i_dryTol = (T) 100,
/**
* FWaveVec Constructor, takes three problem parameters
* @param dryTol "dry tolerance": if the water height falls below dryTol, wall boundary conditions are applied (default value is 100)
* @param gravity takes the value of the gravity constant (default value is 9.81 m/s^2)
* @param zeroTol computed f-waves with an absolute value < zeroTol are treated as static waves (default value is 10^{-7})
*/
FWaveVec(T i_dryTol = (T) 1.0,
T i_gravity = (T) 9.81,
T i_zeroTol = (T) 0.0000001)
: dryTol(i_dryTol),
gravity(i_gravity),
half_gravity( (T).5 * i_gravity ),
sqrt_gravity( std::sqrt(i_gravity) ),
zeroTol(i_zeroTol)
{
}
void computeNetUpdates ( T i_hLeft, T i_hRight,
T i_huLeft, T i_huRight,
T i_bLeft, T i_bRight,
T &o_hUpdateLeft,
T &o_hUpdateRight,
T &o_huUpdateLeft,
T &o_huUpdateRight,
//E.Drossos - vectorized code using Intel Cilk
//E.Drossos - all flop counts refer to one iteration, for total num flops has to be calculated by vlength
/**
* takes the water height, discharge and bathymatry in the left and right cell
* and computes net updates (left and right going waves) according to the f-wave approach.
* It also returns the maximum wave speed.
*/
void computeNetUpdates ( T* i_hLeft, T* i_hRight,
T* i_huLeft, T* i_huRight,
T* i_bLeft, T* i_bRight,
T* o_hUpdateLeft,
T* o_hUpdateRight,
T* o_huUpdateLeft,
T* o_huUpdateRight,
T &o_maxWaveSpeed ) const
{
// determine the wet dry state and corr. values, if necessary.
if( i_hLeft < dryTol && i_hRight < dryTol ) {
if( i_hLeft[0:vlength] >= dryTol ) {
if ( i_hRight[0:vlength] < dryTol ) {
// Wet/Dry case
// Set values according to wall boundary condition
i_hLeft[0:vlength] = i_hRight[0:vlength];
i_huLeft[0:vlength] = -i_huRight[0:vlength];
i_bLeft[0:vlength] = i_bRight[0:vlength];
}
} else if ( i_hRight[0:vlength] >= dryTol ) {
// Dry/Wet case
// Set values according to wall boundary condition
i_hRight[0:vlength] = i_hLeft[0:vlength];
i_huRight[0:vlength] = -i_huLeft[0:vlength];
i_bRight[0:vlength] = i_bLeft[0:vlength];
} else {
// Dry/Dry case
// Set dummy values such that the result is zero
i_hLeft = dryTol;
i_huLeft = 0.; i_bLeft = 0.;
i_hRight = dryTol;
i_huRight = 0.; i_bRight = 0.;
} else if ( i_hLeft < dryTol ) {
i_hLeft = i_hRight;
i_huLeft = -i_huRight;
i_bLeft = i_bRight;
} else if ( i_hRight < dryTol ) {
i_hRight = i_hLeft;
i_huRight = -i_huLeft;
i_bRight = i_bLeft;
}
i_hLeft[0:vlength] = dryTol;
i_huLeft[0:vlength] = 0.; i_bLeft[0:vlength] = 0.; //can use __sec_reduce_all_zero(a[0:vlength]) instead
i_hRight[0:vlength] = dryTol;
i_huRight[0:vlength] = 0.; i_bRight[0:vlength] = 0.;
};
//! velocity on the left side of the edge
T uLeft = i_huLeft / i_hLeft;
//TODO: Use intrinsics to allocate and assure alligned
T* uLeft = (T*)malloc(vlength*sizeof(T));
uLeft[0:vlength] = (i_huLeft[0:vlength]) / (i_hLeft[0:vlength]); // 1 FLOP (div)
//! velocity on the right side of the edge
T uRight = i_huRight / i_hRight;
//TODO: Use intrinsics to allocate
T* uRight = (T*)malloc(vlength*sizeof(T));
uRight[0:vlength] = (i_huRight[0:vlength]) / (i_hRight[0:vlength]); // 1 FLOP (div)
//! wave speeds of the f-waves
T waveSpeeds0 = 0., waveSpeeds1 = 0.;
//TODO: Use intrinsics to allocate
T* waveSpeeds0 = (T*)malloc(vlength*sizeof(T));
T* waveSpeeds1 = (T*)malloc(vlength*sizeof(T));
waveSpeeds0[0:vlength] = 0.;
waveSpeeds1[0:vlength] = 0.;
//compute the wave speeds
fWaveComputeWaveSpeeds( i_hLeft, i_hRight,
i_huLeft, i_huRight,
uLeft, uRight,
i_bLeft, i_bRight,
// E.Drossos - function vectorized
fWaveComputeWaveSpeeds( &i_hLeft[0], &i_hRight[0],
&i_huLeft[0], &i_huRight[0],
&uLeft[0], &uRight[0],
&i_bLeft[0], &i_bRight[0],
waveSpeeds0, waveSpeeds1 );
&waveSpeeds0[0], &waveSpeeds1[0] ); // 20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
//! where to store the two f-waves
T fWaves0 = 0., fWaves1 = 0.;
//! variables to store the two f-waves
T* fWaves0 = (T*)malloc(vlength*sizeof(T));
T* fWaves1 = (T*)malloc(vlength*sizeof(T));
fWaves0[0:vlength] = 0.;
fWaves1[0:vlength] = 0.;
//compute the decomposition into f-waves
fWaveComputeWaveDecomposition( i_hLeft, i_hRight,
i_huLeft, i_huRight,
uLeft, uRight,
i_bLeft, i_bRight,
// E.Drossos - function vectorized
fWaveComputeWaveDecomposition( &i_hLeft[0], &i_hRight[0],
&i_huLeft[0], &i_huRight[0],
&uLeft[0], &uRight[0],
&i_bLeft[0], &i_bRight[0],
waveSpeeds0, waveSpeeds1,
fWaves0, fWaves1);
&waveSpeeds0[0], &waveSpeeds1[0],
&fWaves0[0], &fWaves1[0]); // 23 FLOPs (incl. 1 div)
//compute the net-updates
T hUpdateLeft = 0.;
T hUpdateRight = 0.;
T huUpdateLeft = 0.;
T huUpdateRight = 0.;
//alternatively use __sec_reduce_all_zero(a[0:vlength])
o_hUpdateLeft[0:vlength] = 0.;
o_hUpdateRight[0:vlength] = 0.;
o_huUpdateLeft[0:vlength] = 0.;
o_huUpdateRight[0:vlength] = 0.;
//1st wave family
if(waveSpeeds0 < -zeroTol) { //left going
hUpdateLeft += fWaves0;
huUpdateLeft += fWaves0 * waveSpeeds0;
if(waveSpeeds0[0:vlength] < -zeroTol) { //left going
o_hUpdateLeft[0:vlength] += fWaves0[0:vlength];
o_huUpdateLeft[0:vlength] += (fWaves0[0:vlength]) * (waveSpeeds0[0:vlength]); // 3 FLOPs (assume left going wave ...)
}
else if(waveSpeeds0 > zeroTol) { //right going
hUpdateRight += fWaves0;
huUpdateRight += fWaves0 * waveSpeeds0;
else if(waveSpeeds0[0:vlength] > zeroTol) { //right going
o_hUpdateRight[0:vlength] += fWaves0[0:vlength];
o_huUpdateRight[0:vlength] += fWaves0[0:vlength] * waveSpeeds0[0:vlength];
}
else { //split waves
hUpdateLeft += (T).5*fWaves0;
huUpdateLeft += (T).5*fWaves0 * waveSpeeds0;
hUpdateRight += (T).5*fWaves0;
huUpdateRight += (T).5*fWaves0 * waveSpeeds0;
else { //split waves, if waveSpeeds0 close to 0
o_hUpdateLeft[0:vlength] += (T).5*(fWaves0[0:vlength]);
o_huUpdateLeft[0:vlength] += (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
o_hUpdateRight[0:vlength] += (T).5*(fWaves0[0:vlength]);
o_huUpdateRight[0:vlength] += (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
}
//2nd wave family
if(waveSpeeds1 > zeroTol) { //right going
hUpdateRight += fWaves1;
huUpdateRight += fWaves1 * waveSpeeds1;
if(waveSpeeds1[0:vlength] > zeroTol) { //right going
o_hUpdateRight[0:vlength] += fWaves1[0:vlength];
o_huUpdateRight[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]); // 3 FLOPs (assume right going wave ...)
}
else if(waveSpeeds1 < -zeroTol) { //left going
hUpdateLeft += fWaves1;
huUpdateLeft += fWaves1 * waveSpeeds1;
else if(waveSpeeds1[0:vlength] < -zeroTol) { //left going
o_hUpdateLeft[0:vlength] += fWaves1[0:vlength];
o_huUpdateLeft[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
}
else { //split waves
hUpdateLeft += (T).5*fWaves1;
huUpdateLeft += (T).5*fWaves1 * waveSpeeds1;
hUpdateRight += (T).5*fWaves1;
huUpdateRight += (T).5*fWaves1 * waveSpeeds1;
o_hUpdateLeft[0:vlength] += (T).5*(fWaves1[0:vlength]);
o_huUpdateLeft[0:vlength] += (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
o_hUpdateRight[0:vlength] += (T).5*(fWaves1[0:vlength]);
o_huUpdateRight[0:vlength] += (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
}
// Set output variables
o_hUpdateLeft = hUpdateLeft;
o_hUpdateRight = hUpdateRight;
o_huUpdateLeft = huUpdateLeft;
o_huUpdateRight = huUpdateRight;
//compute maximum wave speed (-> CFL-condition)
o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) );
//o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) );
o_maxWaveSpeed = __sec_reduce_max(std::max( std::abs(waveSpeeds0[0:vlength]) , std::abs(waveSpeeds1[0:vlength]) ));
// 3 FLOPs (2 abs, 1 max)
//========================
// 54 FLOPs (3 sqrt, 4 div, 2 abs, 3 min/max)
}
inline
void fWaveComputeWaveSpeeds(
const T i_hLeft, const T i_hRight,
const T i_huLeft, const T i_huRight,
const T i_uLeft, const T i_uRight,
const T i_bLeft, const T i_bRight,
const T* i_hLeft, const T* i_hRight,
const T* i_huLeft, const T* i_huRight,
const T* i_uLeft, const T* i_uRight,
const T* i_bLeft, const T* i_bRight,
T &o_waveSpeed0, T &o_waveSpeed1 ) const
T* o_waveSpeed0, T* o_waveSpeed1 ) const
{
//compute eigenvalues of the jacobian matrices in states Q_{i-1} and Q_{i}
T characteristicSpeed0 = 0., characteristicSpeed1 = 0.;
characteristicSpeed0 = i_uLeft - std::sqrt(gravity*i_hLeft);
characteristicSpeed1 = i_uRight + std::sqrt(gravity*i_hRight);
//compute "Roe speeds"
T hRoe = (T).5 * (i_hRight + i_hLeft);
T uRoe = i_uLeft * std::sqrt(i_hLeft) + i_uRight * std::sqrt(i_hRight);
uRoe /= std::sqrt(i_hLeft) + std::sqrt(i_hRight);
T roeSpeed0 = 0., roeSpeed1 = 0.;
roeSpeed0 = uRoe - std::sqrt(gravity*hRoe);
roeSpeed1 = uRoe + std::sqrt(gravity*hRoe);
//computer eindfeldt speeds
o_waveSpeed0 = std::min(characteristicSpeed0, roeSpeed0);
o_waveSpeed1 = std::max(characteristicSpeed1, roeSpeed1);
// helper variables for sqrt of h:
// define arrays for helpers
T* sqrt_hLeft = (T*)malloc(vlength*sizeof(T));
T* sqrt_hRight = (T*)malloc(vlength*sizeof(T));
sqrt_hLeft[0:vlength] = std::sqrt(i_hLeft[0:vlength]); // 1 FLOP (sqrt)
sqrt_hRight[0:vlength] = std::sqrt(i_hRight[0:vlength]); // 1 FLOP (sqrt)
// compute eigenvalues of the jacobian matrices
// in states Q_{i-1} and Q_{i}
T* characteristicSpeed0 = (T*)malloc(vlength*sizeof(T));
T* characteristicSpeed1 = (T*)malloc(vlength*sizeof(T));
characteristicSpeed0[0:vlength] = i_uLeft[0:vlength] - sqrt_gravity * sqrt_hLeft[0:vlength]; // 2 FLOPs
characteristicSpeed1[0:vlength] = i_uRight[0:vlength] + sqrt_gravity * sqrt_hRight[0:vlength]; // 2 FLOPs
// compute "Roe averages"
// E.Drossos - Define arrays for hRoe, sqrt_hRoe, uRoe
T* hRoe = (T*)malloc(vlength*sizeof(T));
T* sqrt_hRoe = (T*)malloc(vlength*sizeof(T));
T* uRoe = (T*)malloc(vlength*sizeof(T));
hRoe[0:vlength] = (T).5 * (i_hRight[0:vlength] + i_hLeft[0:vlength]); // 2 FLOPs
sqrt_hRoe[0:vlength] = std::sqrt(hRoe[0:vlength]); // 1 FLOP (sqrt)
uRoe[0:vlength] = i_uLeft[0:vlength] * sqrt_hLeft[0:vlength] + i_uRight[0:vlength] * sqrt_hRight[0:vlength]; // 3 FLOPs
uRoe[0:vlength] /= (sqrt_hLeft[0:vlength] + sqrt_hRight[0:vlength]); // 2 FLOPs (1 div)
// compute "Roe speeds" from Roe averages
// define arrays for "Roe speeds"
T* roeSpeed0 = (T*)malloc(vlength*sizeof(T));
T* roeSpeed1 = (T*)malloc(vlength*sizeof(T));
roeSpeed0[0:vlength] = uRoe[0:vlength] - sqrt_gravity * sqrt_hRoe[0:vlength]; // 2 FLOPs
roeSpeed1[0:vlength] = uRoe[0:vlength] + sqrt_gravity * sqrt_hRoe[0:vlength]; // 2 FLOPs
// compute Eindfeldt speeds (returned as output parameters)
o_waveSpeed0[0:vlength] = std::min(characteristicSpeed0[0:vlength], roeSpeed0[0:vlength]); // 1 FLOP (min)
o_waveSpeed1[0:vlength] = std::max(characteristicSpeed1[0:vlength], roeSpeed1[0:vlength]); // 1 FLOP (max)
//==============
//20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
}
inline
void fWaveComputeWaveDecomposition(
const T i_hLeft, const T i_hRight,
const T i_huLeft, const T i_huRight,
const T i_uLeft, const T i_uRight,
const T i_bLeft, const T i_bRight,
const T i_waveSpeed0, const T i_waveSpeed1,
const T* i_hLeft, const T* i_hRight,
const T* i_huLeft, const T* i_huRight,
const T* i_uLeft, const T* i_uRight,
const T* i_bLeft, const T* i_bRight,
const T* i_waveSpeed0, const T* i_waveSpeed1,
T &o_fWave0, T &o_fWave1 ) const
T *o_fWave0, T *o_fWave1 ) const
{
T lambdaDif = i_waveSpeed1 - i_waveSpeed0;
//compute the inverse matrix R^{-1}
T Rinv00 = 0., Rinv01 = 0., Rinv10 = 0., Rinv11 = 0.;
T oneDivLambdaDif = (T)1. / lambdaDif;
Rinv00 = oneDivLambdaDif * i_waveSpeed1;
Rinv01 = -oneDivLambdaDif;
Rinv10 = oneDivLambdaDif * -i_waveSpeed0;
Rinv11 = oneDivLambdaDif;
//right hand side
T fDif0 = 0., fDif1 = 0.;
//calculate modified (bathymetry!) flux difference
// f(Q_i) - f(Q_{i-1})
fDif0 = i_huRight - i_huLeft;
fDif1 = i_huRight * i_uRight + (T).5 * gravity * i_hRight * i_hRight
-(i_huLeft * i_uLeft + (T).5 * gravity * i_hLeft * i_hLeft);
// f(Q_i) - f(Q_{i-1}) -> serve as right hand sides
//allocate mem for fDif0, fDif1 arrays
T* fDif0 = (T*)malloc(vlength*sizeof(T));
T* fDif1 = (T*)malloc(vlength*sizeof(T));
fDif0[0:vlength] = i_huRight[0:vlength] - i_huLeft[0:vlength]; // 1 FLOP
fDif1[0:vlength] = i_huRight[0:vlength] * i_uRight[0:vlength] + half_gravity * i_hRight[0:vlength] * i_hRight[0:vlength]
-(i_huLeft[0:vlength] * i_uLeft[0:vlength] + half_gravity * i_hLeft[0:vlength] * i_hLeft[0:vlength]); // 9 FLOPs
// \delta x \Psi[2]
T psi = -gravity * (T).5 * (i_hRight + i_hLeft)*(i_bRight - i_bLeft);
fDif1 -= psi;
//solve linear equations
o_fWave0 = Rinv00 * fDif0 + Rinv01 * fDif1;
o_fWave1 = Rinv10 * fDif0 + Rinv11 * fDif1;
fDif1[0:vlength] += half_gravity * (i_hRight[0:vlength] + i_hLeft[0:vlength])*(i_bRight[0:vlength] - i_bLeft[0:vlength]); // 5 FLOPs
// solve linear system of equations to obtain f-waves:
// ( 1 1 ) ( o_fWave0 ) = ( fDif0 )
// ( i_waveSpeed0 i_waveSpeed1 ) ( o_fWave1 ) ( fDif1 )
// compute the inverse of the wave speed difference:
T* inverseSpeedDiff = (T*)malloc(vlength*sizeof(T));
inverseSpeedDiff[0:vlength] = (T)1. / ( i_waveSpeed1[0:vlength] - i_waveSpeed0[0:vlength] ); // 2 FLOPs (1 div)
// compute f-waves:
o_fWave0[0:vlength] = ( i_waveSpeed1[0:vlength] * fDif0[0:vlength] - fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength]; // 3 FLOPs
o_fWave1[0:vlength] = ( -i_waveSpeed0[0:vlength] * fDif0[0:vlength] + fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength]; // 3 FLOPs
//=========
//23 FLOPs in total (incl. 1 div)
}
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment