Fixed bugs in FWaveVec.hpp

Signed-off-by: Gaurav Kukreja <gaurav.kukreja@tum.de>

Fixed bugs in FWaveVec.hpp
Signed-off-by: Gaurav Kukreja <gaurav.kukreja@tum.de>
998480e1 · Gaurav Kukreja · c81126f6 · 998480e1
Commit 998480e1 authored Jan 19, 2014 by Gaurav Kukreja
Hide whitespace changes
Inline Side-by-side

Showing with 406 additions and 93 deletions

FWaveVec.hpp src/solver/FWaveVec.hpp +406 -93

No files found.
--- a/src/solver/FWaveVec.hpp
+++ b/src/solver/FWaveVec.hpp
 /**
- * FWaveCuda.h
+ * FWaveVec.h
 *
 ****
- **** This is a C++ wrapper for the Cuda implementation of the F-Wave solver (FWave.hpp).
+ **** This is a vectorizable C++ implementation of the F-Wave solver (FWave.hpp).
 ****
 *
 * Created on: Nov 13, 2012
- * Last Update: Nov 16, 2012
+ * Last Update: Dec 28, 2013
 *
 ****
 *
 *  Author: Sebastian Rettenberger
 *    Homepage: http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.
 *    E-Mail: rettenbs AT in.tum.de
+ *  Some optimzations: Michael Bader
+ *    Homepage: http://www5.in.tum.de/wiki/index.php/Michael_Bader
+ *    E-Mail: bader AT in.tum.de
 *
 ****
 *
@@ -25,13 +28,10 @@
 *            volume={24},
 *            number={3},
 *            pages={955--978},
- *            year={2002},
- *            publisher={Citeseer}}
+ *            year={2002}}
 *
 *   @book{leveque2002finite,
 *         Author = {LeVeque, R. J.},
- *         Date-Added = {2011-09-13 14:09:31 +0000},
- *         Date-Modified = {2011-10-31 09:46:40 +0000},
 *         Publisher = {Cambridge University Press},
 *         Title = {Finite Volume Methods for Hyperbolic Problems},
 *         Volume = {31},
@@ -49,7 +49,12 @@
 #ifndef FWAVEVEC_HPP_
 #define FWAVEVEC_HPP_

+//define vlength - test and then push to separate header file, common to solver and block classes
+#define vlength 8 
+
 #include <cmath>
+#include <cilk/cilk.h>
+#include <malloc.h>

 namespace solver
 {
@@ -62,19 +67,93 @@ class FWaveVec
 {
 private:
 	const T dryTol;
-	const T gravity;
+	const T half_gravity; // 0.5 * gravity constant
+	const T sqrt_gravity; // square root of the gravity constant 
 	const T zeroTol;
-
 public:
-	FWaveVec(T i_dryTol = (T) 100,
+	T* uLeft;
+	T* uRight;
+	T* waveSpeeds0;
+	T* waveSpeeds1;
+	T* fWaves0;
+	T* fWaves1;
+	T* sqrt_hLeft;
+	T* sqrt_hRight;
+	T* characteristicSpeed0;
+	T* characteristicSpeed1;
+	T* hRoe;
+	T* sqrt_hRoe;
+	T* uRoe;
+	T* roeSpeed0;
+	T* roeSpeed1;
+	T* fDif0;
+	T* fDif1;
+	T* inverseSpeedDiff;
+
+//public:
+	/**
+	 * FWaveVec Constructor, takes three problem parameters
+	 * @param dryTol "dry tolerance": if the water height falls below dryTol, wall boundary conditions are applied (default value is 100)
+	 * @param gravity takes the value of the gravity constant (default value is 9.81 m/s^2)
+	 * @param zeroTol computed f-waves with an absolute value < zeroTol are treated as static waves (default value is 10^{-7})
+	 */
+	FWaveVec(T i_dryTol = (T) 1.0,
 			 T i_gravity = (T) 9.81,
 			 T i_zeroTol = (T) 0.0000001)
 		: dryTol(i_dryTol),
-		  gravity(i_gravity),
+		  half_gravity( (T).5 * i_gravity ),
+		  sqrt_gravity( std::sqrt(i_gravity) ),
 		  zeroTol(i_zeroTol)
 	{
+		uLeft = (T*) malloc(sizeof(T) * vlength);
+		uRight = (T*) malloc(sizeof(T) * vlength);
+		waveSpeeds0 = (T*) malloc(sizeof(T) * vlength);
+		waveSpeeds1 = (T*) malloc(sizeof(T) * vlength);
+		fWaves0 = (T*) malloc(sizeof(T) * vlength);
+		fWaves1 = (T*) malloc(sizeof(T) * vlength);
+		sqrt_hLeft = (T*) malloc(sizeof(T) * vlength);
+		sqrt_hRight = (T*) malloc(sizeof(T) * vlength);
+		characteristicSpeed0 = (T*) malloc(sizeof(T) * vlength);
+		characteristicSpeed1 = (T*) malloc(sizeof(T) * vlength);
+		hRoe = (T*) malloc(sizeof(T) * vlength);
+		sqrt_hRoe = (T*) malloc(sizeof(T) * vlength);
+		uRoe = (T*) malloc(sizeof(T) * vlength);
+		roeSpeed0 = (T*) malloc(sizeof(T) * vlength);
+		roeSpeed1 = (T*) malloc(sizeof(T) * vlength);
+		fDif0 = (T*) malloc(sizeof(T) * vlength);
+		fDif1 = (T*) malloc(sizeof(T) * vlength);
+		inverseSpeedDiff = (T*) malloc(sizeof(T) * vlength);
+	}
+
+	~FWaveVec()
+	{
+		
+		free(uLeft);
+		free(uRight);
+		free(waveSpeeds0);
+		free(waveSpeeds1);
+		free(fWaves0);
+		free(fWaves1);
+		free(sqrt_hLeft);
+		free(sqrt_hRight);
+		free(characteristicSpeed0);
+		free(characteristicSpeed1);
+		free(hRoe);
+		free(sqrt_hRoe);
+		free(uRoe);
+		free(roeSpeed0);
+		free(roeSpeed1);
+		free(fDif0);
+		free(fDif1);
+		free(inverseSpeedDiff);
 	}

+
+/**
+	 * takes the water height, discharge and bathymatry in the left and right cell
+	 * and computes net updates (left and right going waves) according to the f-wave approach.
+	 * It also returns the maximum wave speed.
+	 */
 	void computeNetUpdates ( T i_hLeft,  T i_hRight,
                             T i_huLeft, T i_huRight,
                             T i_bLeft,  T i_bRight,
@@ -86,27 +165,33 @@ public:
                             T &o_maxWaveSpeed ) const
 	{
 		  // determine the wet dry state and corr. values, if necessary.
-		  if( i_hLeft < dryTol && i_hRight < dryTol ) {
-		      // Dry/Dry case
-		      // Set dummy values such that the result is zero
-		      i_hLeft = dryTol;
-		      i_huLeft = 0.; i_bLeft = 0.;
-		      i_hRight = dryTol;
-		      i_huRight = 0.; i_bRight = 0.;
-		  } else if ( i_hLeft < dryTol ) {
+		  if( i_hLeft >= dryTol ) {
+		     if ( i_hRight < dryTol ) {
+		      // Wet/Dry case
+		      // Set values according to wall boundary condition
 		      i_hLeft = i_hRight;
 		      i_huLeft = -i_huRight;
 		      i_bLeft = i_bRight;
-		  } else if ( i_hRight < dryTol ) {
+		     } 
+		  } else if ( i_hRight >= dryTol ) {
+		      // Dry/Wet case
+		      // Set values according to wall boundary condition
 		      i_hRight = i_hLeft;
 		      i_huRight = -i_huLeft;
 		      i_bRight = i_bLeft;
-		  }
+		  } else {
+		      // Dry/Dry case
+		      // Set dummy values such that the result is zero
+		      i_hLeft = dryTol;
+		      i_huLeft = 0.; i_bLeft = 0.;
+		      i_hRight = dryTol;
+		      i_huRight = 0.; i_bRight = 0.;
+		  };

 		  //! velocity on the left side of the edge
-		  T uLeft = i_huLeft / i_hLeft;
+		  T uLeft = i_huLeft / i_hLeft;                               // 1 FLOP (div)
 		  //! velocity on the right side of the edge
-		  T uRight = i_huRight / i_hRight;
+		  T uRight = i_huRight / i_hRight;                            // 1 FLOP (div)

 		  //! wave speeds of the f-waves
 		  T waveSpeeds0 = 0., waveSpeeds1 = 0.;
@@ -117,9 +202,9 @@ public:
 		                          uLeft, uRight,
 		                          i_bLeft, i_bRight,

-		                          waveSpeeds0, waveSpeeds1 );
+		                          waveSpeeds0, waveSpeeds1 );         // 20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)

-		  //! where to store the two f-waves
+		  //! variables to store the two f-waves
 		  T fWaves0 = 0., fWaves1 = 0.;

 		  //compute the decomposition into f-waves
@@ -129,56 +214,197 @@ public:
 		                                 i_bLeft, i_bRight,

 		                                 waveSpeeds0, waveSpeeds1,
-		                                 fWaves0, fWaves1);
+		                                 fWaves0, fWaves1);           // 23 FLOPs (incl. 1 div)

 		  //compute the net-updates
-		  T hUpdateLeft = 0.;
-		  T hUpdateRight = 0.;
-		  T huUpdateLeft = 0.;
-		  T huUpdateRight = 0.;
+		  o_hUpdateLeft = 0.;
+		  o_hUpdateRight = 0.;
+		  o_huUpdateLeft = 0.;
+		  o_huUpdateRight = 0.;

 		  //1st wave family
 		  if(waveSpeeds0 < -zeroTol) { //left going
-		    hUpdateLeft +=  fWaves0;
-		    huUpdateLeft += fWaves0 * waveSpeeds0;
+		    o_hUpdateLeft +=  fWaves0;
+		    o_huUpdateLeft += fWaves0 * waveSpeeds0;                    // 3 FLOPs (assume left going wave ...)
 		  }
 		  else if(waveSpeeds0 > zeroTol) { //right going
-		    hUpdateRight +=  fWaves0;
-		    huUpdateRight += fWaves0 * waveSpeeds0;
+		    o_hUpdateRight +=  fWaves0;
+		    o_huUpdateRight += fWaves0 * waveSpeeds0;
 		  }
-		  else { //split waves
-		    hUpdateLeft +=   (T).5*fWaves0;
-		    huUpdateLeft +=  (T).5*fWaves0 * waveSpeeds0;
-		    hUpdateRight +=  (T).5*fWaves0;
-		    huUpdateRight += (T).5*fWaves0 * waveSpeeds0;
+		  else { //split waves, if waveSpeeds0 close to 0
+		    o_hUpdateLeft +=   (T).5*fWaves0;
+		    o_huUpdateLeft +=  (T).5*fWaves0 * waveSpeeds0;
+		    o_hUpdateRight +=  (T).5*fWaves0;
+		    o_huUpdateRight += (T).5*fWaves0 * waveSpeeds0;
 		  }

 		  //2nd wave family
 		  if(waveSpeeds1 > zeroTol) { //right going
-		    hUpdateRight +=  fWaves1;
-		    huUpdateRight += fWaves1 * waveSpeeds1;
+		    o_hUpdateRight +=  fWaves1;
+		    o_huUpdateRight += fWaves1 * waveSpeeds1;                   // 3 FLOPs (assume right going wave ...)
 		  }
 		  else if(waveSpeeds1 < -zeroTol) { //left going
-			hUpdateLeft +=  fWaves1;
-			huUpdateLeft += fWaves1 * waveSpeeds1;
+			o_hUpdateLeft +=  fWaves1;
+			o_huUpdateLeft += fWaves1 * waveSpeeds1;
 		  }
 		  else { //split waves
-		    hUpdateLeft +=   (T).5*fWaves1;
-		    huUpdateLeft +=  (T).5*fWaves1 * waveSpeeds1;
-		    hUpdateRight +=  (T).5*fWaves1;
-		    huUpdateRight += (T).5*fWaves1 * waveSpeeds1;
+		    o_hUpdateLeft +=   (T).5*fWaves1;
+		    o_huUpdateLeft +=  (T).5*fWaves1 * waveSpeeds1;
+		    o_hUpdateRight +=  (T).5*fWaves1;
+		    o_huUpdateRight += (T).5*fWaves1 * waveSpeeds1;
+		  }
+
+		  //compute maximum wave speed (-> CFL-condition)
+		  o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) ); 
+		                                                              // 3 FLOPs (2 abs, 1 max)
+		                                                              //========================
+		                                                              // 54 FLOPs (3 sqrt, 4 div, 2 abs, 3 min/max)
+    }
+
+	//E.Drossos - vectorized code using Intel Cilk
+	//E.Drossos - all flop counts refer to one iteration, for total num flops has to be calculated by vlength
+	
+	/**
+	 * takes the water height, discharge and bathymatry in the left and right cell
+	 * and computes net updates (left and right going waves) according to the f-wave approach.
+	 * It also returns the maximum wave speed.
+	 */
+	void computeNetUpdates_cilk ( T* i_hLeft,  T* i_hRight,
+                             T* i_huLeft, T* i_huRight,
+                             T* i_bLeft,  T* i_bRight,
+
+                             T* o_hUpdateLeft,
+                             T* o_hUpdateRight,
+                             T* o_huUpdateLeft,
+                             T* o_huUpdateRight,
+                             T &o_maxWaveSpeed ) const
+	{
+		  // determine the wet dry state and corr. values, if necessary.
+		  if( i_hLeft[0:vlength] >= dryTol ) {
+		     if ( i_hRight[0:vlength] < dryTol ) {
+		      // Wet/Dry case
+		      // Set values according to wall boundary condition
+		      i_hLeft[0:vlength] = i_hRight[0:vlength];
+		      i_huLeft[0:vlength] = -i_huRight[0:vlength];
+		      i_bLeft[0:vlength] = i_bRight[0:vlength];
+		     } 
+		  } else if ( i_hRight[0:vlength] >= dryTol ) {
+		      // Dry/Wet case
+		      // Set values according to wall boundary condition
+		      i_hRight[0:vlength] = i_hLeft[0:vlength];
+		      i_huRight[0:vlength] = -i_huLeft[0:vlength];
+		      i_bRight[0:vlength] = i_bLeft[0:vlength];
+		  } else {
+		      // Dry/Dry case
+		      // Set dummy values such that the result is zero
+		      i_hLeft[0:vlength] = dryTol;
+		      i_huLeft[0:vlength] = 0.; i_bLeft[0:vlength] = 0.; //can use __sec_reduce_all_zero(a[0:vlength]) instead
+		      i_hRight[0:vlength] = dryTol;
+		      i_huRight[0:vlength] = 0.; i_bRight[0:vlength] = 0.;
+		  };
+
+		  //! velocity on the left side of the edge
+		  //TODO: Use intrinsics to allocate and assure alligned
+//		  T* uLeft = (T*)malloc(vlength*sizeof(T));
+		  uLeft[0:vlength] = (i_huLeft[0:vlength]) / (i_hLeft[0:vlength]);	// 1 FLOP (div)
+		  //! velocity on the right side of the edge
+		  //TODO: Use intrinsics to allocate
+//		  T* uRight = (T*)malloc(vlength*sizeof(T));
+		  uRight[0:vlength] = (i_huRight[0:vlength]) / (i_hRight[0:vlength]);                            // 1 FLOP (div)
+
+		  //! wave speeds of the f-waves
+		  //TODO: Use intrinsics to allocate
+//		  T* waveSpeeds0 = (T*)malloc(vlength*sizeof(T));
+//		  T* waveSpeeds1 = (T*)malloc(vlength*sizeof(T));
+		  waveSpeeds0[0:vlength] = 0.;
+		  waveSpeeds1[0:vlength] = 0.;
+
+		  //compute the wave speeds
+		  // E.Drossos - function vectorized
+		  fWaveComputeWaveSpeeds_cilk( &i_hLeft[0], &i_hRight[0],
+		                          &i_huLeft[0], &i_huRight[0],
+		                          &uLeft[0], &uRight[0],
+		                          &i_bLeft[0], &i_bRight[0],
+
+		                          &waveSpeeds0[0], &waveSpeeds1[0] );         // 20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
+
+		  //! variables to store the two f-waves
+//		  T* fWaves0 = (T*)malloc(vlength*sizeof(T));
+//		  T* fWaves1 = (T*)malloc(vlength*sizeof(T));
+		  fWaves0[0:vlength] = 0.;
+		  fWaves1[0:vlength] = 0.;
+
+		  //compute the decomposition into f-waves
+		  // E.Drossos - function vectorized
+		  fWaveComputeWaveDecomposition_cilk( &i_hLeft[0], &i_hRight[0],
+		                                 &i_huLeft[0], &i_huRight[0],
+		                                 &uLeft[0], &uRight[0],
+		                                 &i_bLeft[0], &i_bRight[0],
+
+		                                 &waveSpeeds0[0], &waveSpeeds1[0],
+		                                 &fWaves0[0], &fWaves1[0]);           // 23 FLOPs (incl. 1 div)
+
+		  //compute the net-updates
+		  //alternatively use __sec_reduce_all_zero(a[0:vlength])
+		  o_hUpdateLeft[0:vlength] = 0.;
+		  o_hUpdateRight[0:vlength] = 0.;
+		  o_huUpdateLeft[0:vlength] = 0.;
+		  o_huUpdateRight[0:vlength] = 0.;
+
+		  //1st wave family
+		  if(waveSpeeds0[0:vlength] < -zeroTol) { //left going
+		    o_hUpdateLeft[0:vlength] +=  fWaves0[0:vlength];
+		    o_huUpdateLeft[0:vlength] += (fWaves0[0:vlength]) * (waveSpeeds0[0:vlength]);                    // 3 FLOPs (assume left going wave ...)
+		  }
+		  else if(waveSpeeds0[0:vlength] > zeroTol) { //right going
+		    o_hUpdateRight[0:vlength] +=  fWaves0[0:vlength];
+		    o_huUpdateRight[0:vlength] += fWaves0[0:vlength] * waveSpeeds0[0:vlength];
+		  }
+		  else { //split waves, if waveSpeeds0 close to 0
+		    o_hUpdateLeft[0:vlength] +=   (T).5*(fWaves0[0:vlength]);
+		    o_huUpdateLeft[0:vlength] +=  (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
+		    o_hUpdateRight[0:vlength] +=  (T).5*(fWaves0[0:vlength]);
+		    o_huUpdateRight[0:vlength] += (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
 		  }

-		  // Set output variables
-		  o_hUpdateLeft = hUpdateLeft;
-		  o_hUpdateRight = hUpdateRight;
-		  o_huUpdateLeft = huUpdateLeft;
-		  o_huUpdateRight = huUpdateRight;
+		  //2nd wave family
+		  if(waveSpeeds1[0:vlength] > zeroTol) { //right going
+		    o_hUpdateRight[0:vlength] +=  fWaves1[0:vlength];
+		    o_huUpdateRight[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]);                   // 3 FLOPs (assume right going wave ...)
+		  }
+		  else if(waveSpeeds1[0:vlength] < -zeroTol) { //left going
+			o_hUpdateLeft[0:vlength] +=  fWaves1[0:vlength];
+			o_huUpdateLeft[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
+		  }
+		  else { //split waves
+		    o_hUpdateLeft[0:vlength] +=   (T).5*(fWaves1[0:vlength]);
+		    o_huUpdateLeft[0:vlength] +=  (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
+		    o_hUpdateRight[0:vlength] +=  (T).5*(fWaves1[0:vlength]);
+		    o_huUpdateRight[0:vlength] += (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
+		  }

 		  //compute maximum wave speed (-> CFL-condition)
-		  o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) );
+		  //o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) ); 
+		  		   
+		  //o_maxWaveSpeed = __sec_reduce_max(std::max( std::abs(waveSpeeds0[0:vlength]) , std::abs(waveSpeeds1[0:vlength]) )); 
+
+		  waveSpeeds0[0:vlength] = std::abs(waveSpeeds0[0:vlength]);
+		  waveSpeeds1[0:vlength] = std::abs(waveSpeeds1[0:vlength]);
+
+		  T o_maxWaveSpeed0;
+		  T o_maxWaveSpeed1;
+	
+                  o_maxWaveSpeed0 = __sec_reduce_max (waveSpeeds0[0:vlength]);
+		  o_maxWaveSpeed1 = __sec_reduce_max (waveSpeeds1[0:vlength]);
+
+		  o_maxWaveSpeed = std::max(o_maxWaveSpeed0, o_maxWaveSpeed1);
+		  
+		                                                              // 3 FLOPs (2 abs, 1 max)
+		                                                              //========================
+		                                                              // 54 FLOPs (3 sqrt, 4 div, 2 abs, 3 min/max)
    }

+
 	inline
 	void fWaveComputeWaveSpeeds(
            const T i_hLeft,  const T i_hRight,
@@ -188,25 +414,81 @@ public:

            T &o_waveSpeed0, T &o_waveSpeed1 ) const
 	{
-		//compute eigenvalues of the jacobian matrices in states Q_{i-1} and Q_{i}
-		T characteristicSpeed0 = 0., characteristicSpeed1 = 0.;
-		characteristicSpeed0 = i_uLeft - std::sqrt(gravity*i_hLeft);
-		characteristicSpeed1 = i_uRight + std::sqrt(gravity*i_hRight);
-
-		//compute "Roe speeds"
-		T hRoe = (T).5 * (i_hRight + i_hLeft);
-		T uRoe = i_uLeft * std::sqrt(i_hLeft) + i_uRight * std::sqrt(i_hRight);
-		uRoe /= std::sqrt(i_hLeft) + std::sqrt(i_hRight);
-
-		T roeSpeed0 = 0., roeSpeed1 = 0.;
-		roeSpeed0 = uRoe - std::sqrt(gravity*hRoe);
-		roeSpeed1 = uRoe + std::sqrt(gravity*hRoe);
-
-		//computer eindfeldt speeds
-		o_waveSpeed0 = std::min(characteristicSpeed0, roeSpeed0);
-		o_waveSpeed1 = std::max(characteristicSpeed1, roeSpeed1);
+		// helper variables for sqrt of h:
+		T sqrt_hLeft = std::sqrt(i_hLeft);                                // 1 FLOP (sqrt)
+		T sqrt_hRight = std::sqrt(i_hRight);                              // 1 FLOP (sqrt)
+
+		// compute eigenvalues of the jacobian matrices 
+		// in states Q_{i-1} and Q_{i}
+		T characteristicSpeed0 = i_uLeft - sqrt_gravity * sqrt_hLeft;     // 2 FLOPs
+		T characteristicSpeed1 = i_uRight + sqrt_gravity * sqrt_hRight;   // 2 FLOPs
+
+		// compute "Roe averages"
+		T hRoe = (T).5 * (i_hRight + i_hLeft);                            // 2 FLOPs
+		T sqrt_hRoe = std::sqrt(hRoe);                                    // 1 FLOP (sqrt)
+		T uRoe = i_uLeft * sqrt_hLeft + i_uRight * sqrt_hRight;           // 3 FLOPs
+		uRoe /= sqrt_hLeft + sqrt_hRight;                                 // 2 FLOPs (1 div)
+
+		// compute "Roe speeds" from Roe averages
+		T roeSpeed0 = uRoe - sqrt_gravity * sqrt_hRoe;                    // 2 FLOPs
+		T roeSpeed1 = uRoe + sqrt_gravity * sqrt_hRoe;                    // 2 FLOPs
+
+		// compute Eindfeldt speeds (returned as output parameters)
+		o_waveSpeed0 = std::min(characteristicSpeed0, roeSpeed0);         // 1 FLOP (min)
+		o_waveSpeed1 = std::max(characteristicSpeed1, roeSpeed1);         // 1 FLOP (max)
+		                                                                  //==============
+		                                                                  //20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
 	}

+
+	inline
+	void fWaveComputeWaveSpeeds_cilk (
+            const T* i_hLeft,  const T* i_hRight,
+            const T* i_huLeft, const T* i_huRight,
+            const T* i_uLeft,  const T* i_uRight,
+            const T* i_bLeft,  const T* i_bRight,
+
+            T *o_waveSpeed0, T *o_waveSpeed1 ) const
+	{
+		// helper variables for sqrt of h:
+		// define arrays for helpers
+//		T* sqrt_hLeft = (T*)malloc(vlength*sizeof(T));
+//		T* sqrt_hRight = (T*)malloc(vlength*sizeof(T));
+		sqrt_hLeft[0:vlength] = std::sqrt(i_hLeft[0:vlength]);                                // 1 FLOP (sqrt)
+		sqrt_hRight[0:vlength] = std::sqrt(i_hRight[0:vlength]);                              // 1 FLOP (sqrt)
+
+		// compute eigenvalues of the jacobian matrices 
+		// in states Q_{i-1} and Q_{i}
+//		T* characteristicSpeed0 = (T*)malloc(vlength*sizeof(T));
+//		T* characteristicSpeed1 = (T*)malloc(vlength*sizeof(T));
+		characteristicSpeed0[0:vlength] = i_uLeft[0:vlength] - sqrt_gravity * sqrt_hLeft[0:vlength];     // 2 FLOPs
+		characteristicSpeed1[0:vlength] = i_uRight[0:vlength] + sqrt_gravity * sqrt_hRight[0:vlength];   // 2 FLOPs
+
+		// compute "Roe averages"
+		// E.Drossos - Define arrays for hRoe, sqrt_hRoe, uRoe
+//		T* hRoe = (T*)malloc(vlength*sizeof(T));
+//		T* sqrt_hRoe = (T*)malloc(vlength*sizeof(T));
+//		T* uRoe = (T*)malloc(vlength*sizeof(T));
+		hRoe[0:vlength] = (T).5 * (i_hRight[0:vlength] + i_hLeft[0:vlength]);					// 2 FLOPs
+		sqrt_hRoe[0:vlength] = std::sqrt(hRoe[0:vlength]);										// 1 FLOP (sqrt)
+		uRoe[0:vlength] = i_uLeft[0:vlength] * sqrt_hLeft[0:vlength] + i_uRight[0:vlength] * sqrt_hRight[0:vlength];           // 3 FLOPs
+		uRoe[0:vlength] /= (sqrt_hLeft[0:vlength] + sqrt_hRight[0:vlength]);					// 2 FLOPs (1 div)
+
+		// compute "Roe speeds" from Roe averages
+		// define arrays for "Roe speeds"
+//		T* roeSpeed0 = (T*)malloc(vlength*sizeof(T));
+//		T* roeSpeed1 = (T*)malloc(vlength*sizeof(T));
+		roeSpeed0[0:vlength] = uRoe[0:vlength] - sqrt_gravity * sqrt_hRoe[0:vlength];                    // 2 FLOPs
+		roeSpeed1[0:vlength] = uRoe[0:vlength] + sqrt_gravity * sqrt_hRoe[0:vlength];                    // 2 FLOPs
+
+		// compute Eindfeldt speeds (returned as output parameters)
+                o_waveSpeed0[0:vlength] = std::min(characteristicSpeed0[0:vlength], roeSpeed0[0:vlength]);         // 1 FLOP (min)
+                o_waveSpeed1[0:vlength] = std::max(characteristicSpeed1[0:vlength], roeSpeed1[0:vlength]);         // 1 FLOP (max)
+
+		                                                                  //20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
+	}
+
+
 	inline
 	void fWaveComputeWaveDecomposition(
 			const T i_hLeft,  const T i_hRight,
@@ -217,37 +499,68 @@ public:

 			T &o_fWave0, T &o_fWave1 ) const
 	{
-	  T lambdaDif = i_waveSpeed1 - i_waveSpeed0;
-
-	  //compute the inverse matrix R^{-1}
-	  T Rinv00 = 0., Rinv01 = 0., Rinv10 = 0., Rinv11 = 0.;
-
-	  T oneDivLambdaDif = (T)1. /  lambdaDif;
-	  Rinv00 = oneDivLambdaDif *  i_waveSpeed1;
-	  Rinv01 = -oneDivLambdaDif;
-
-	  Rinv10 = oneDivLambdaDif * -i_waveSpeed0;
-	  Rinv11 = oneDivLambdaDif;
+	  //calculate modified (bathymetry!) flux difference
+	  // f(Q_i) - f(Q_{i-1}) -> serve as right hand sides
+	  T fDif0 = i_huRight - i_huLeft;                                        // 1 FLOP
+	  T fDif1 = i_huRight * i_uRight + half_gravity * i_hRight * i_hRight
+	          -(i_huLeft  * i_uLeft  + half_gravity * i_hLeft  * i_hLeft);   // 9 FLOPs

-	  //right hand side
-	  T fDif0 = 0., fDif1 = 0.;
+	  // \delta x \Psi[2]
+	  fDif1 += half_gravity * (i_hRight + i_hLeft)*(i_bRight - i_bLeft);     // 5 FLOPs
+
+	  // solve linear system of equations to obtain f-waves:
+	  // (       1            1      ) ( o_fWave0 ) = ( fDif0 )
+	  // ( i_waveSpeed0 i_waveSpeed1 ) ( o_fWave1 )   ( fDif1 )
+	  
+	  // compute the inverse of the wave speed difference:
+	  T inverseSpeedDiff = (T)1. / ( i_waveSpeed1 - i_waveSpeed0 );          // 2 FLOPs (1 div)
+	  // compute f-waves:
+	  o_fWave0 = (  i_waveSpeed1 * fDif0 - fDif1 ) * inverseSpeedDiff;       // 3 FLOPs
+	  o_fWave1 = ( -i_waveSpeed0 * fDif0 + fDif1 ) * inverseSpeedDiff;       // 3 FLOPs
+	                                                                         //=========
+	                                                                         //23 FLOPs in total (incl. 1 div)
+	}

+	inline
+	void fWaveComputeWaveDecomposition_cilk (
+			const T* i_hLeft,  const T* i_hRight,
+			const T* i_huLeft, const T* i_huRight,
+			const T* i_uLeft,  const T* i_uRight,
+			const T* i_bLeft,  const T* i_bRight,
+			const T* i_waveSpeed0, const T* i_waveSpeed1,
+
+			T *o_fWave0, T *o_fWave1 ) const
+	{
 	  //calculate modified (bathymetry!) flux difference
-	  // f(Q_i) - f(Q_{i-1})
-	  fDif0 = i_huRight - i_huLeft;
-	  fDif1 = i_huRight * i_uRight + (T).5 * gravity * i_hRight * i_hRight
-	          -(i_huLeft  * i_uLeft  + (T).5 * gravity * i_hLeft  * i_hLeft);
+	  // f(Q_i) - f(Q_{i-1}) -> serve as right hand sides
+	  
+	  //allocate mem for fDif0, fDif1 arrays
+//	  T* fDif0 = (T*)malloc(vlength*sizeof(T));
+//	  T* fDif1 = (T*)malloc(vlength*sizeof(T));
+	  
+	  fDif0[0:vlength] = i_huRight[0:vlength] - i_huLeft[0:vlength];                                        // 1 FLOP
+	  fDif1[0:vlength] = i_huRight[0:vlength] * i_uRight[0:vlength] + half_gravity * i_hRight[0:vlength] * i_hRight[0:vlength]
+	          -(i_huLeft[0:vlength]  * i_uLeft[0:vlength]  + half_gravity * i_hLeft[0:vlength]  * i_hLeft[0:vlength]);   // 9 FLOPs

 	  // \delta x \Psi[2]
-	  T psi = -gravity * (T).5 * (i_hRight + i_hLeft)*(i_bRight - i_bLeft);
-	  fDif1 -= psi;
-
-	  //solve linear equations
-	  o_fWave0 = Rinv00 * fDif0 + Rinv01 * fDif1;
-	  o_fWave1 = Rinv10 * fDif0 + Rinv11 * fDif1;
+	  fDif1[0:vlength] += half_gravity * (i_hRight[0:vlength] + i_hLeft[0:vlength])*(i_bRight[0:vlength] - i_bLeft[0:vlength]);     // 5 FLOPs
+
+	  // solve linear system of equations to obtain f-waves:
+	  // (       1            1      ) ( o_fWave0 ) = ( fDif0 )
+	  // ( i_waveSpeed0 i_waveSpeed1 ) ( o_fWave1 )   ( fDif1 )
+	  
+	  // compute the inverse of the wave speed difference:
+//	  T* inverseSpeedDiff = (T*)malloc(vlength*sizeof(T));
+	  inverseSpeedDiff[0:vlength] = (T)1. / ( i_waveSpeed1[0:vlength] - i_waveSpeed0[0:vlength] );          // 2 FLOPs (1 div)
+	  // compute f-waves:
+	  o_fWave0[0:vlength] = (  i_waveSpeed1[0:vlength] * fDif0[0:vlength] - fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength];       // 3 FLOPs
+	  o_fWave1[0:vlength] = ( -i_waveSpeed0[0:vlength] * fDif0[0:vlength] + fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength];       // 3 FLOPs
+	                                                                         //=========
+	                                                                         //23 FLOPs in total (incl. 1 div)
 	}
 };

 }

 #endif // FWAVEVEC_HPP_
+