Using Intel Cilk for Fwave

Signed-off-by: drossostum <drevangel@mytum.de>

Using Intel Cilk for Fwave
Signed-off-by: drossostum <drevangel@mytum.de>
c22e9dbc · drossostum · c81126f6 · c22e9dbc
Commit c22e9dbc authored Jan 19, 2014 by drossostum
Hide whitespace changes
Inline Side-by-side

Showing with 192 additions and 134 deletions

FWaveVec.hpp src/solver/FWaveVec.hpp +192 -134

No files found.
--- a/src/solver/FWaveVec.hpp
+++ b/src/solver/FWaveVec.hpp
 /**
- * FWaveCuda.h
+ * FWaveVec.h
 *
 ****
- **** This is a C++ wrapper for the Cuda implementation of the F-Wave solver (FWave.hpp).
+ **** This is a vectorizable C++ implementation of the F-Wave solver (FWave.hpp).
 ****
 *
 * Created on: Nov 13, 2012
- * Last Update: Nov 16, 2012
+ * Last Update: Dec 28, 2013
 *
 ****
 *
 *  Author: Sebastian Rettenberger
 *    Homepage: http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger,_M.Sc.
 *    E-Mail: rettenbs AT in.tum.de
+ *  Some optimzations: Michael Bader
+ *    Homepage: http://www5.in.tum.de/wiki/index.php/Michael_Bader
+ *    E-Mail: bader AT in.tum.de
 *
 ****
 *
@@ -25,13 +28,10 @@
 *            volume={24},
 *            number={3},
 *            pages={955--978},
- *            year={2002},
- *            publisher={Citeseer}}
+ *            year={2002}}
 *
 *   @book{leveque2002finite,
 *         Author = {LeVeque, R. J.},
- *         Date-Added = {2011-09-13 14:09:31 +0000},
- *         Date-Modified = {2011-10-31 09:46:40 +0000},
 *         Publisher = {Cambridge University Press},
 *         Title = {Finite Volume Methods for Hyperbolic Problems},
 *         Volume = {31},
@@ -49,202 +49,260 @@
 #ifndef FWAVEVEC_HPP_
 #define FWAVEVEC_HPP_

+//define vlength - test and then push to separate header file, common to solver and block classes
+#define vlength 4
+
 #include <cmath>
+#include <cilk/cilk.h>
+#include <malloc.h>

+#define T float
 namespace solver
 {

 /**
 *
 */
-template<typename T>
+//template<typename T>
 class FWaveVec
 {
 private:
 	const T dryTol;
-	const T gravity;
+	const T half_gravity; // 0.5 * gravity constant
+	const T sqrt_gravity; // square root of the gravity constant 
 	const T zeroTol;

 public:
-	FWaveVec(T i_dryTol = (T) 100,
+	/**
+	 * FWaveVec Constructor, takes three problem parameters
+	 * @param dryTol "dry tolerance": if the water height falls below dryTol, wall boundary conditions are applied (default value is 100)
+	 * @param gravity takes the value of the gravity constant (default value is 9.81 m/s^2)
+	 * @param zeroTol computed f-waves with an absolute value < zeroTol are treated as static waves (default value is 10^{-7})
+	 */
+	FWaveVec(T i_dryTol = (T) 1.0,
 			 T i_gravity = (T) 9.81,
 			 T i_zeroTol = (T) 0.0000001)
 		: dryTol(i_dryTol),
-		  gravity(i_gravity),
+		  half_gravity( (T).5 * i_gravity ),
+		  sqrt_gravity( std::sqrt(i_gravity) ),
 		  zeroTol(i_zeroTol)
 	{
 	}

-	void computeNetUpdates ( T i_hLeft,  T i_hRight,
-                             T i_huLeft, T i_huRight,
-                             T i_bLeft,  T i_bRight,
-
-                             T &o_hUpdateLeft,
-                             T &o_hUpdateRight,
-                             T &o_huUpdateLeft,
-                             T &o_huUpdateRight,
+	//E.Drossos - vectorized code using Intel Cilk
+	//E.Drossos - all flop counts refer to one iteration, for total num flops has to be calculated by vlength
+	
+	/**
+	 * takes the water height, discharge and bathymatry in the left and right cell
+	 * and computes net updates (left and right going waves) according to the f-wave approach.
+	 * It also returns the maximum wave speed.
+	 */
+	void computeNetUpdates ( T* i_hLeft,  T* i_hRight,
+                             T* i_huLeft, T* i_huRight,
+                             T* i_bLeft,  T* i_bRight,
+
+                             T* o_hUpdateLeft,
+                             T* o_hUpdateRight,
+                             T* o_huUpdateLeft,
+                             T* o_huUpdateRight,
                             T &o_maxWaveSpeed ) const
 	{
 		  // determine the wet dry state and corr. values, if necessary.
-		  if( i_hLeft < dryTol && i_hRight < dryTol ) {
+		  if( i_hLeft[0:vlength] >= dryTol ) {
+		     if ( i_hRight[0:vlength] < dryTol ) {
+		      // Wet/Dry case
+		      // Set values according to wall boundary condition
+		      i_hLeft[0:vlength] = i_hRight[0:vlength];
+		      i_huLeft[0:vlength] = -i_huRight[0:vlength];
+		      i_bLeft[0:vlength] = i_bRight[0:vlength];
+		     } 
+		  } else if ( i_hRight[0:vlength] >= dryTol ) {
+		      // Dry/Wet case
+		      // Set values according to wall boundary condition
+		      i_hRight[0:vlength] = i_hLeft[0:vlength];
+		      i_huRight[0:vlength] = -i_huLeft[0:vlength];
+		      i_bRight[0:vlength] = i_bLeft[0:vlength];
+		  } else {
 		      // Dry/Dry case
 		      // Set dummy values such that the result is zero
-		      i_hLeft = dryTol;
-		      i_huLeft = 0.; i_bLeft = 0.;
-		      i_hRight = dryTol;
-		      i_huRight = 0.; i_bRight = 0.;
-		  } else if ( i_hLeft < dryTol ) {
-		      i_hLeft = i_hRight;
-		      i_huLeft = -i_huRight;
-		      i_bLeft = i_bRight;
-		  } else if ( i_hRight < dryTol ) {
-		      i_hRight = i_hLeft;
-		      i_huRight = -i_huLeft;
-		      i_bRight = i_bLeft;
-		  }
+		      i_hLeft[0:vlength] = dryTol;
+		      i_huLeft[0:vlength] = 0.; i_bLeft[0:vlength] = 0.; //can use __sec_reduce_all_zero(a[0:vlength]) instead
+		      i_hRight[0:vlength] = dryTol;
+		      i_huRight[0:vlength] = 0.; i_bRight[0:vlength] = 0.;
+		  };

 		  //! velocity on the left side of the edge
-		  T uLeft = i_huLeft / i_hLeft;
+		  //TODO: Use intrinsics to allocate and assure alligned
+		  T* uLeft = (T*)malloc(vlength*sizeof(T));
+		  uLeft[0:vlength] = (i_huLeft[0:vlength]) / (i_hLeft[0:vlength]);	// 1 FLOP (div)
 		  //! velocity on the right side of the edge
-		  T uRight = i_huRight / i_hRight;
+		  //TODO: Use intrinsics to allocate
+		  T* uRight = (T*)malloc(vlength*sizeof(T));
+		  uRight[0:vlength] = (i_huRight[0:vlength]) / (i_hRight[0:vlength]);                            // 1 FLOP (div)

 		  //! wave speeds of the f-waves
-		  T waveSpeeds0 = 0., waveSpeeds1 = 0.;
+		  //TODO: Use intrinsics to allocate
+		  T* waveSpeeds0 = (T*)malloc(vlength*sizeof(T));
+		  T* waveSpeeds1 = (T*)malloc(vlength*sizeof(T));
+		  waveSpeeds0[0:vlength] = 0.;
+		  waveSpeeds1[0:vlength] = 0.;

 		  //compute the wave speeds
-		  fWaveComputeWaveSpeeds( i_hLeft, i_hRight,
-		                          i_huLeft, i_huRight,
-		                          uLeft, uRight,
-		                          i_bLeft, i_bRight,
+		  // E.Drossos - function vectorized
+		  fWaveComputeWaveSpeeds( &i_hLeft[0], &i_hRight[0],
+		                          &i_huLeft[0], &i_huRight[0],
+		                          &uLeft[0], &uRight[0],
+		                          &i_bLeft[0], &i_bRight[0],

-		                          waveSpeeds0, waveSpeeds1 );
+		                          &waveSpeeds0[0], &waveSpeeds1[0] );         // 20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)

-		  //! where to store the two f-waves
-		  T fWaves0 = 0., fWaves1 = 0.;
+		  //! variables to store the two f-waves
+		  T* fWaves0 = (T*)malloc(vlength*sizeof(T));
+		  T* fWaves1 = (T*)malloc(vlength*sizeof(T));
+		  fWaves0[0:vlength] = 0.;
+		  fWaves1[0:vlength] = 0.;

 		  //compute the decomposition into f-waves
-		  fWaveComputeWaveDecomposition( i_hLeft, i_hRight,
-		                                 i_huLeft, i_huRight,
-		                                 uLeft, uRight,
-		                                 i_bLeft, i_bRight,
+		  // E.Drossos - function vectorized
+		  fWaveComputeWaveDecomposition( &i_hLeft[0], &i_hRight[0],
+		                                 &i_huLeft[0], &i_huRight[0],
+		                                 &uLeft[0], &uRight[0],
+		                                 &i_bLeft[0], &i_bRight[0],

-		                                 waveSpeeds0, waveSpeeds1,
-		                                 fWaves0, fWaves1);
+		                                 &waveSpeeds0[0], &waveSpeeds1[0],
+		                                 &fWaves0[0], &fWaves1[0]);           // 23 FLOPs (incl. 1 div)

 		  //compute the net-updates
-		  T hUpdateLeft = 0.;
-		  T hUpdateRight = 0.;
-		  T huUpdateLeft = 0.;
-		  T huUpdateRight = 0.;
+		  //alternatively use __sec_reduce_all_zero(a[0:vlength])
+		  o_hUpdateLeft[0:vlength] = 0.;
+		  o_hUpdateRight[0:vlength] = 0.;
+		  o_huUpdateLeft[0:vlength] = 0.;
+		  o_huUpdateRight[0:vlength] = 0.;

 		  //1st wave family
-		  if(waveSpeeds0 < -zeroTol) { //left going
-		    hUpdateLeft +=  fWaves0;
-		    huUpdateLeft += fWaves0 * waveSpeeds0;
+		  if(waveSpeeds0[0:vlength] < -zeroTol) { //left going
+		    o_hUpdateLeft[0:vlength] +=  fWaves0[0:vlength];
+		    o_huUpdateLeft[0:vlength] += (fWaves0[0:vlength]) * (waveSpeeds0[0:vlength]);                    // 3 FLOPs (assume left going wave ...)
 		  }
-		  else if(waveSpeeds0 > zeroTol) { //right going
-		    hUpdateRight +=  fWaves0;
-		    huUpdateRight += fWaves0 * waveSpeeds0;
+		  else if(waveSpeeds0[0:vlength] > zeroTol) { //right going
+		    o_hUpdateRight[0:vlength] +=  fWaves0[0:vlength];
+		    o_huUpdateRight[0:vlength] += fWaves0[0:vlength] * waveSpeeds0[0:vlength];
 		  }
-		  else { //split waves
-		    hUpdateLeft +=   (T).5*fWaves0;
-		    huUpdateLeft +=  (T).5*fWaves0 * waveSpeeds0;
-		    hUpdateRight +=  (T).5*fWaves0;
-		    huUpdateRight += (T).5*fWaves0 * waveSpeeds0;
+		  else { //split waves, if waveSpeeds0 close to 0
+		    o_hUpdateLeft[0:vlength] +=   (T).5*(fWaves0[0:vlength]);
+		    o_huUpdateLeft[0:vlength] +=  (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
+		    o_hUpdateRight[0:vlength] +=  (T).5*(fWaves0[0:vlength]);
+		    o_huUpdateRight[0:vlength] += (T).5*(fWaves0[0:vlength] * waveSpeeds0[0:vlength]);
 		  }

 		  //2nd wave family
-		  if(waveSpeeds1 > zeroTol) { //right going
-		    hUpdateRight +=  fWaves1;
-		    huUpdateRight += fWaves1 * waveSpeeds1;
+		  if(waveSpeeds1[0:vlength] > zeroTol) { //right going
+		    o_hUpdateRight[0:vlength] +=  fWaves1[0:vlength];
+		    o_huUpdateRight[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]);                   // 3 FLOPs (assume right going wave ...)
 		  }
-		  else if(waveSpeeds1 < -zeroTol) { //left going
-			hUpdateLeft +=  fWaves1;
-			huUpdateLeft += fWaves1 * waveSpeeds1;
+		  else if(waveSpeeds1[0:vlength] < -zeroTol) { //left going
+			o_hUpdateLeft[0:vlength] +=  fWaves1[0:vlength];
+			o_huUpdateLeft[0:vlength] += (fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
 		  }
 		  else { //split waves
-		    hUpdateLeft +=   (T).5*fWaves1;
-		    huUpdateLeft +=  (T).5*fWaves1 * waveSpeeds1;
-		    hUpdateRight +=  (T).5*fWaves1;
-		    huUpdateRight += (T).5*fWaves1 * waveSpeeds1;
+		    o_hUpdateLeft[0:vlength] +=   (T).5*(fWaves1[0:vlength]);
+		    o_huUpdateLeft[0:vlength] +=  (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
+		    o_hUpdateRight[0:vlength] +=  (T).5*(fWaves1[0:vlength]);
+		    o_huUpdateRight[0:vlength] += (T).5*(fWaves1[0:vlength] * waveSpeeds1[0:vlength]);
 		  }

-		  // Set output variables
-		  o_hUpdateLeft = hUpdateLeft;
-		  o_hUpdateRight = hUpdateRight;
-		  o_huUpdateLeft = huUpdateLeft;
-		  o_huUpdateRight = huUpdateRight;
-
 		  //compute maximum wave speed (-> CFL-condition)
-		  o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) );
+		  //o_maxWaveSpeed = std::max( std::abs(waveSpeeds0) , std::abs(waveSpeeds1) ); 
+		  		   
+		  o_maxWaveSpeed = __sec_reduce_max(std::max( std::abs(waveSpeeds0[0:vlength]) , std::abs(waveSpeeds1[0:vlength]) )); 
+		                                                              // 3 FLOPs (2 abs, 1 max)
+		                                                              //========================
+		                                                              // 54 FLOPs (3 sqrt, 4 div, 2 abs, 3 min/max)
    }

 	inline
 	void fWaveComputeWaveSpeeds(
-            const T i_hLeft,  const T i_hRight,
-            const T i_huLeft, const T i_huRight,
-            const T i_uLeft,  const T i_uRight,
-            const T i_bLeft,  const T i_bRight,
+            const T* i_hLeft,  const T* i_hRight,
+            const T* i_huLeft, const T* i_huRight,
+            const T* i_uLeft,  const T* i_uRight,
+            const T* i_bLeft,  const T* i_bRight,

-            T &o_waveSpeed0, T &o_waveSpeed1 ) const
+            T* o_waveSpeed0, T* o_waveSpeed1 ) const
 	{
-		//compute eigenvalues of the jacobian matrices in states Q_{i-1} and Q_{i}
-		T characteristicSpeed0 = 0., characteristicSpeed1 = 0.;
-		characteristicSpeed0 = i_uLeft - std::sqrt(gravity*i_hLeft);
-		characteristicSpeed1 = i_uRight + std::sqrt(gravity*i_hRight);
-
-		//compute "Roe speeds"
-		T hRoe = (T).5 * (i_hRight + i_hLeft);
-		T uRoe = i_uLeft * std::sqrt(i_hLeft) + i_uRight * std::sqrt(i_hRight);
-		uRoe /= std::sqrt(i_hLeft) + std::sqrt(i_hRight);
-
-		T roeSpeed0 = 0., roeSpeed1 = 0.;
-		roeSpeed0 = uRoe - std::sqrt(gravity*hRoe);
-		roeSpeed1 = uRoe + std::sqrt(gravity*hRoe);
-
-		//computer eindfeldt speeds
-		o_waveSpeed0 = std::min(characteristicSpeed0, roeSpeed0);
-		o_waveSpeed1 = std::max(characteristicSpeed1, roeSpeed1);
+		// helper variables for sqrt of h:
+		// define arrays for helpers
+		T* sqrt_hLeft = (T*)malloc(vlength*sizeof(T));
+		T* sqrt_hRight = (T*)malloc(vlength*sizeof(T));
+		sqrt_hLeft[0:vlength] = std::sqrt(i_hLeft[0:vlength]);                                // 1 FLOP (sqrt)
+		sqrt_hRight[0:vlength] = std::sqrt(i_hRight[0:vlength]);                              // 1 FLOP (sqrt)
+
+		// compute eigenvalues of the jacobian matrices 
+		// in states Q_{i-1} and Q_{i}
+		T* characteristicSpeed0 = (T*)malloc(vlength*sizeof(T));
+		T* characteristicSpeed1 = (T*)malloc(vlength*sizeof(T));
+		characteristicSpeed0[0:vlength] = i_uLeft[0:vlength] - sqrt_gravity * sqrt_hLeft[0:vlength];     // 2 FLOPs
+		characteristicSpeed1[0:vlength] = i_uRight[0:vlength] + sqrt_gravity * sqrt_hRight[0:vlength];   // 2 FLOPs
+
+		// compute "Roe averages"
+		// E.Drossos - Define arrays for hRoe, sqrt_hRoe, uRoe
+		T* hRoe = (T*)malloc(vlength*sizeof(T));
+		T* sqrt_hRoe = (T*)malloc(vlength*sizeof(T));
+		T* uRoe = (T*)malloc(vlength*sizeof(T));
+		hRoe[0:vlength] = (T).5 * (i_hRight[0:vlength] + i_hLeft[0:vlength]);					// 2 FLOPs
+		sqrt_hRoe[0:vlength] = std::sqrt(hRoe[0:vlength]);										// 1 FLOP (sqrt)
+		uRoe[0:vlength] = i_uLeft[0:vlength] * sqrt_hLeft[0:vlength] + i_uRight[0:vlength] * sqrt_hRight[0:vlength];           // 3 FLOPs
+		uRoe[0:vlength] /= (sqrt_hLeft[0:vlength] + sqrt_hRight[0:vlength]);					// 2 FLOPs (1 div)
+
+		// compute "Roe speeds" from Roe averages
+		// define arrays for "Roe speeds"
+		T* roeSpeed0 = (T*)malloc(vlength*sizeof(T));
+		T* roeSpeed1 = (T*)malloc(vlength*sizeof(T));
+		roeSpeed0[0:vlength] = uRoe[0:vlength] - sqrt_gravity * sqrt_hRoe[0:vlength];                    // 2 FLOPs
+		roeSpeed1[0:vlength] = uRoe[0:vlength] + sqrt_gravity * sqrt_hRoe[0:vlength];                    // 2 FLOPs
+
+		// compute Eindfeldt speeds (returned as output parameters)
+		o_waveSpeed0[0:vlength] = std::min(characteristicSpeed0[0:vlength], roeSpeed0[0:vlength]);         // 1 FLOP (min)
+		o_waveSpeed1[0:vlength] = std::max(characteristicSpeed1[0:vlength], roeSpeed1[0:vlength]);         // 1 FLOP (max)
+		                                                                  //==============
+		                                                                  //20 FLOPs (incl. 3 sqrt, 1 div, 2 min/max)
 	}

 	inline
 	void fWaveComputeWaveDecomposition(
-			const T i_hLeft,  const T i_hRight,
-			const T i_huLeft, const T i_huRight,
-			const T i_uLeft,  const T i_uRight,
-			const T i_bLeft,  const T i_bRight,
-			const T i_waveSpeed0, const T i_waveSpeed1,
+			const T* i_hLeft,  const T* i_hRight,
+			const T* i_huLeft, const T* i_huRight,
+			const T* i_uLeft,  const T* i_uRight,
+			const T* i_bLeft,  const T* i_bRight,
+			const T* i_waveSpeed0, const T* i_waveSpeed1,

-			T &o_fWave0, T &o_fWave1 ) const
+			T *o_fWave0, T *o_fWave1 ) const
 	{
-	  T lambdaDif = i_waveSpeed1 - i_waveSpeed0;
-
-	  //compute the inverse matrix R^{-1}
-	  T Rinv00 = 0., Rinv01 = 0., Rinv10 = 0., Rinv11 = 0.;
-
-	  T oneDivLambdaDif = (T)1. /  lambdaDif;
-	  Rinv00 = oneDivLambdaDif *  i_waveSpeed1;
-	  Rinv01 = -oneDivLambdaDif;
-
-	  Rinv10 = oneDivLambdaDif * -i_waveSpeed0;
-	  Rinv11 = oneDivLambdaDif;
-
-	  //right hand side
-	  T fDif0 = 0., fDif1 = 0.;
-
 	  //calculate modified (bathymetry!) flux difference
-	  // f(Q_i) - f(Q_{i-1})
-	  fDif0 = i_huRight - i_huLeft;
-	  fDif1 = i_huRight * i_uRight + (T).5 * gravity * i_hRight * i_hRight
-	          -(i_huLeft  * i_uLeft  + (T).5 * gravity * i_hLeft  * i_hLeft);
+	  // f(Q_i) - f(Q_{i-1}) -> serve as right hand sides
+	  
+	  //allocate mem for fDif0, fDif1 arrays
+	  T* fDif0 = (T*)malloc(vlength*sizeof(T));
+	  T* fDif1 = (T*)malloc(vlength*sizeof(T));
+	  
+	  fDif0[0:vlength] = i_huRight[0:vlength] - i_huLeft[0:vlength];                                        // 1 FLOP
+	  fDif1[0:vlength] = i_huRight[0:vlength] * i_uRight[0:vlength] + half_gravity * i_hRight[0:vlength] * i_hRight[0:vlength]
+	          -(i_huLeft[0:vlength]  * i_uLeft[0:vlength]  + half_gravity * i_hLeft[0:vlength]  * i_hLeft[0:vlength]);   // 9 FLOPs

 	  // \delta x \Psi[2]
-	  T psi = -gravity * (T).5 * (i_hRight + i_hLeft)*(i_bRight - i_bLeft);
-	  fDif1 -= psi;
-
-	  //solve linear equations
-	  o_fWave0 = Rinv00 * fDif0 + Rinv01 * fDif1;
-	  o_fWave1 = Rinv10 * fDif0 + Rinv11 * fDif1;
+	  fDif1[0:vlength] += half_gravity * (i_hRight[0:vlength] + i_hLeft[0:vlength])*(i_bRight[0:vlength] - i_bLeft[0:vlength]);     // 5 FLOPs
+
+	  // solve linear system of equations to obtain f-waves:
+	  // (       1            1      ) ( o_fWave0 ) = ( fDif0 )
+	  // ( i_waveSpeed0 i_waveSpeed1 ) ( o_fWave1 )   ( fDif1 )
+	  
+	  // compute the inverse of the wave speed difference:
+	  T* inverseSpeedDiff = (T*)malloc(vlength*sizeof(T));
+	  inverseSpeedDiff[0:vlength] = (T)1. / ( i_waveSpeed1[0:vlength] - i_waveSpeed0[0:vlength] );          // 2 FLOPs (1 div)
+	  // compute f-waves:
+	  o_fWave0[0:vlength] = (  i_waveSpeed1[0:vlength] * fDif0[0:vlength] - fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength];       // 3 FLOPs
+	  o_fWave1[0:vlength] = ( -i_waveSpeed0[0:vlength] * fDif0[0:vlength] + fDif1[0:vlength] ) * inverseSpeedDiff[0:vlength];       // 3 FLOPs
+	                                                                         //=========
+	                                                                         //23 FLOPs in total (incl. 1 div)
 	}
 };