Added SIMD files

304d4698 · hoelzlw · a79ebaa7 · 304d4698 · 304d4698 · 304d4698
Commit 304d4698 authored Jul 14, 2013 by hoelzlw
Showing with 2555 additions and 0 deletions

AugRie_SIMD.hpp src/solver/AugRie_SIMD.hpp +2056 -0

SIMD_COSTS.hpp src/solver/SIMD_COSTS.hpp +123 -0

SIMD_DEFINITIONS.hpp src/solver/SIMD_DEFINITIONS.hpp +239 -0

SIMD_TYPES.hpp src/solver/SIMD_TYPES.hpp +137 -0

No files found.
--- a/src/solver/AugRie_SIMD.hpp
+++ b/src/solver/AugRie_SIMD.hpp
+/**
+ * AugRie_SIMD.hpp
+ *
+ ****
+ **** Approximate Augmented Riemann Solver for the Shallow Water Equations
+ **** vectorized using SIMD-Extensions
+ ****
+ *
+ *  Created on: Apr 17, 2013
+ *  Last Update: Jul 14, 2013
+ *
+ ****
+ *
+ *  Author: Alexander Breuer
+ *    Homepage: http://www5.in.tum.de/wiki/index.php/Dipl.-Math._Alexander_Breuer
+ *    E-Mail: breuera AT in.tum.de
+ *
+ *  Some optimizations: Martin Schreiber
+ *    Homepage: http://www5.in.tum.de/wiki/index.php/Martin_Schreiber
+ *    E-Mail: schreibm AT in.tum.de
+ * 
+ *  Vectorization: Wolfgang Hölzl
+ *    E-Mail: hoelzlw AT in.tum.de
+ *
+ ****
+ *
+ * (Main) Literature:
+ *
+ *   @phdthesis{george2006finite,
+ *              Author = {George, D.L.},
+ *              Title = {Finite volume methods and adaptive refinement for tsunami propagation and inundation},
+ *              Year = {2006}}
+ *
+ *   @article{george2008augmented,
+ *            Author = {George, D.L.},
+ *            Journal = {Journal of Computational Physics},
+ *            Number = {6},
+ *            Pages = {3089--3113},
+ *            Publisher = {Elsevier},
+ *            Title = {Augmented Riemann solvers for the shallow water equations over variable topography with steady states and inundation},
+ *            Volume = {227},
+ *            Year = {2008}}
+ *
+ *   @book{leveque2002finite,
+ *         Author = {LeVeque, R. J.},
+ *         Date-Added = {2011-09-13 14:09:31 +0000},
+ *         Date-Modified = {2011-10-31 09:46:40 +0000},
+ *         Publisher = {Cambridge University Press},
+ *         Title = {Finite Volume Methods for Hyperbolic Problems},
+ *         Volume = {31},
+ *         Year = {2002}}
+ *
+ *   @webpage{levequeclawpack,
+ *            Author = {LeVeque, R. J.},
+ *            Lastchecked = {January, 05, 2011},
+ *            Title = {Clawpack Sofware},
+ *            Url = {https://github.com/clawpack/clawpack-4.x/blob/master/geoclaw/2d/lib}}
+ *
+ ****
+ *
+ * Acknowledgments:
+ *   Special thanks go to R.J. LeVeque and D.L. George for publishing their code
+ *   and the corresponding documentation (-> Literature).
+ */
+/*
+ * TODO: store nLow/nHigh variables in array[2]
+ */
+#ifndef AUGRIE_SIMD_HPP_
+#define AUGRIE_SIMD_HPP_
+#pragma message "Choosing single precision by default. Remove this if the solver is able to run with double precision"
+#undef FLOAT64
+#define FLOAT32
+#include "SIMD_TYPES.hpp"
+#include "WavePropagation.hpp"
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <vector>
+namespace solver {
+	class AugRie_SIMD;
+}
+/**
+ * Approximate Augmented Riemann Solver for the Shallow Water Equations.
+ *
+ * T should be double or float.
+ */
+class solver::AugRie_SIMD : public WavePropagation<real> {
+public:
+	mutable size_t flops;
+private:
+	//explicit for unit tests
+	//use nondependent names (template base class)
+	using solver::WavePropagation<real>::zeroTol;
+	using solver::WavePropagation<real>::g;
+	using solver::WavePropagation<real>::dryTol;
+	using solver::WavePropagation<real>::hLeft;
+	using solver::WavePropagation<real>::hRight;
+	using solver::WavePropagation<real>::huLeft;
+	using solver::WavePropagation<real>::huRight;
+	using solver::WavePropagation<real>::bLeft;
+	using solver::WavePropagation<real>::bRight;
+	using solver::WavePropagation<real>::uLeft;
+	using solver::WavePropagation<real>::uRight;
+	using solver::WavePropagation<real>::storeParameters;
+	integer wetDryState;
+	static const integer DryDry = 0;
+	static const integer WetWet = SHIFT_SIGN_RIGHT(1);
+	static const integer WetDryInundation = SHIFT_SIGN_RIGHT(2);
+	static const integer WetDryWall = SHIFT_SIGN_RIGHT(3);
+	static const integer WetDryWallInundation = SHIFT_SIGN_RIGHT(4);
+	static const integer DryWetInundation = SHIFT_SIGN_RIGHT(5);
+	static const integer DryWetWall = SHIFT_SIGN_RIGHT(6);
+	static const integer DryWetWallInundation = SHIFT_SIGN_RIGHT(7);
+#ifndef VECTOR_NOVEC
+	// Vector components
+	const integer_vector DryDry_V;
+	const integer_vector WetWet_V;
+	const integer_vector DryWetInundation_V;
+	const integer_vector WetDryInundation_V;
+	const integer_vector DryWetWallInundation_V;
+	const integer_vector DryWetWall_V;
+	const integer_vector WetDryWallInundation_V;
+	const integer_vector WetDryWall_V;
+	integer_vector wetDryState_v;
+#endif /* VECTOR_NOVEC */
+	//! Newton-tolerance (exit Newton-Raphson-method, if we are close enough to the root)
+	const real newtonTol;
+	//! maximum number of Newton-Raphson-Iterations
+	const int maxNumberOfNewtonIterations;
+#ifndef VECTOR_NOVEC
+	// Vector members
+	real_vector hLeft_v;
+	real_vector hRight_v;
+	real_vector huLeft_v;
+	real_vector huRight_v;
+	real_vector bLeft_v;
+	real_vector bRight_v;
+	real_vector uLeft_v;
+	real_vector uRight_v;
+#endif /* VECTOR_NOVEC */
+	//! height of our homogeneous Riemann-problem at middle state (computed by determineMiddleState)
+	real hMiddle;
+#ifndef  VECTOR_NOVEC
+	real_vector hMiddle_v;
+#endif /* VECTOR_NOVEC */
+	//! shock or inner rarefaction speeds of our homogeneous Riemann-problem (computed by computeMiddleState)
+	real middleStateSpeeds[2];
+#ifndef  VECTOR_NOVEC
+	real_vector middleStateSpeeds_v[2];
+#endif /* VECTOR_NOVEC */
+	/**
+	 * the Riemann-struture of the homogeneous Riemann-problem.
+	 */
+	static const integer ShockShock = SHIFT_SIGN_RIGHT(1);
+	static const integer RarefactionRarefaction = SHIFT_SIGN_RIGHT(2);
+	static const integer ShockRarefaction = SHIFT_SIGN_RIGHT(3);
+	static const integer RarefactionShock = SHIFT_SIGN_RIGHT(4);
+	static const integer DrySingleRarefaction = SHIFT_SIGN_RIGHT(5);
+	static const integer SingleRarefactionDry = SHIFT_SIGN_RIGHT(6);
+#ifndef  VECTOR_NOVEC
+	integer_vector riemannStructure_v;
+	const integer_vector ShockShock_V;
+	const integer_vector RarefactionRarefaction_V;
+	const integer_vector ShockRarefaction_V;
+	const integer_vector RarefactionShock_V;
+	const integer_vector DrySingleRarefaction_V;
+	const integer_vector SingleRarefactionDry_V;
+#endif /* VECTOR_NOVEC */
+	//! Riemann-structure of our homogeneous Riemann-problem (determined by determineRiemannStructure)
+	integer riemannStructure;
+public:
+	/**
+	 * Constructor of the Augmented Riemann solver with optional parameters.
+	 * 
+	 * @param i_dryTolerance numerical definition of "dry".
+	 * @param i_gravity gravity constant.
+	 * @param i_newtonTolerance numerical definition of "convergence" (used in the AugRie solver only).
+	 * @param i_maxNumberOfNewtonIterations maximum steps for the Newton-Raphson method (used in the AugRie solver only).
+	 * @param i_zeroTolerance numerical definition of zero.
+	 */
+	AugRie_SIMD (
+		real i_dryTolerance			= static_cast<real>(0.01),
+		real i_gravity				= static_cast<real>(9.81),
+		real i_newtonTolerance			= static_cast<real>(0.000001),
+		int i_maxNumberOfNewtonIterations	= 10,
+		real i_zeroTolerance			= static_cast<real>(0.00001)
+	) :
+		WavePropagation<real>(i_dryTolerance, i_gravity, i_zeroTolerance),
+#ifndef  VECTOR_NOVEC
+		DryDry_V(SETV_I(DryDry)), WetWet_V(SETV_I(WetWet)), DryWetInundation_V(SETV_I(DryWetInundation)), WetDryInundation_V(SETV_I(WetDryInundation)),
+		DryWetWallInundation_V(SETV_I(DryWetWallInundation)), DryWetWall_V(SETV_I(DryWetWall)), WetDryWallInundation_V(SETV_I(WetDryWallInundation)), WetDryWall_V(SETV_I(WetDryWall)),
+#endif /* VECTOR_NOVEC */
+		newtonTol (i_newtonTolerance),
+		maxNumberOfNewtonIterations (i_maxNumberOfNewtonIterations),
+#ifndef  VECTOR_NOVEC
+		ShockShock_V(SETV_I(ShockShock)), RarefactionRarefaction_V(SETV_I(RarefactionRarefaction)), ShockRarefaction_V(SETV_I(ShockRarefaction)), RarefactionShock_V(SETV_I(RarefactionShock)), DrySingleRarefaction_V(SETV_I(DrySingleRarefaction)), SingleRarefactionDry_V(SETV_I(SingleRarefactionDry)),
+#endif /* VECTOR_NOVEC */
+		sqrt_g(std :: sqrt(g))
+#ifndef  VECTOR_NOVEC
+		,sqrt_g_v(SETV_R(sqrt_g)),
+		zeroTol_v(SETV_R(zeroTol)),
+		zeroTol_v_neg(SETV_R(-zeroTol)),
+		dryTol_v(SETV_R(dryTol)),
+		dryTol_v_neg(SETV_R(-dryTol)),
+		g_v(SETV_R(g))
+#endif /* VECTOR_NOVEC */
+	{
+#ifndef NDEBUG
+#ifndef SUPPRESS_SOLVER_DEBUG_OUTPUT
+		//print some information about the used solver
+		std::cout << "  *** solver::AugRie_SIMD created" << std::endl
+				<< "    zeroTolerance=" << zeroTol << std::endl
+				<< "    gravity=" << g << std::endl
+				<< "    dryTolerance=" << dryTol << std::endl
+				<< "    newtonTolerance=" << newtonTol << std::endl
+				<< "    maxNumberOfNewtonIterations=" << maxNumberOfNewtonIterations << std::endl
+				<< "\n  ***\n\n";
+#endif
+#endif
+		flops = 0;
+	}
+	~AugRie_SIMD ()
+	{
+	}
+	//declare variables which are used over and over again
+	real sqrt_g_hLeft;
+	real sqrt_g_hRight;
+	real sqrt_hLeft;
+	real sqrt_hRight;
+#ifndef  VECTOR_NOVEC
+	real_vector sqrt_g_hLeft_v;
+	real_vector sqrt_g_hRight_v;
+	real_vector sqrt_hLeft_v;
+	real_vector sqrt_hRight_v;
+#endif /* VECTOR_NOVEC */
+	const real sqrt_g;
+#ifndef  VECTOR_NOVEC
+	const real_vector sqrt_g_v;
+	const real_vector zeroTol_v;
+	const real_vector zeroTol_v_neg;
+	const real_vector dryTol_v;
+	const real_vector dryTol_v_neg;
+	const real_vector g_v;
+#endif /* VECTOR_NOVEC */
+	/**
+	 * Compute net updates for the left/right cell of the edge.
+	 *
+	 * maxWaveSpeed will be set to the maximum (linearized) wave speed => CFL
+	 */
+	void
+	computeNetUpdates (
+		const real &i_hLeft, const real &i_hRight,
+		const real &i_huLeft, const real &i_huRight,
+		const real &i_bLeft, const real &i_bRight,
+		real &o_hUpdateLeft, real &o_hUpdateRight,
+		real &o_huUpdateLeft, real &o_huUpdateRight,
+		real &o_maxWaveSpeed
+	)
+	{
+		// store parameters to member variables
+		storeParameters(
+			i_hLeft, i_hRight,
+			i_huLeft, i_huRight,
+			i_bLeft, i_bRight
+		);
+		//set speeds to zero (will be determined later)
+		uLeft = uRight = 0.;
+		//reset net updates and the maximum wave speed
+		o_hUpdateLeft = o_hUpdateRight = o_huUpdateLeft = o_huUpdateRight = static_cast<real>(0);
+		o_maxWaveSpeed = static_cast<real>(0);
+		//determine the wet/dry state and compute local variables correspondingly
+		determineWetDryState();
+		if (wetDryState != DryDry) {
+			//precompute some terms which are fixed during
+			//the computation after some specific point
+			sqrt_hLeft = std::sqrt(hLeft);
+			sqrt_hRight = std::sqrt(hRight);
+			sqrt_g_hLeft = sqrt_g*sqrt_hLeft;
+			sqrt_g_hRight = sqrt_g*sqrt_hRight;
+			//where to store the three waves
+			real fWaves[3][2];
+			//and their speeds
+			real waveSpeeds[3];
+			//compute the augmented decomposition
+			//  (thats the place where the computational work is done..)
+			computeWaveDecomposition(fWaves, waveSpeeds);
+			//compute the updates from the three propagating waves
+			//A^-\delta Q = \sum{s[i]<0} \beta[i] * r[i] = A^-\delta Q = \sum{s[i]<0} Z^i
+			//A^+\delta Q = \sum{s[i]>0} \beta[i] * r[i] = A^-\delta Q = \sum{s[i]<0} Z^i
+			for (int waveNumber = 0; waveNumber < 3; waveNumber++) {
+				if (waveSpeeds[waveNumber] < -zeroTol) {
+					//left going
+					o_hUpdateLeft += fWaves[waveNumber][0];
+					o_huUpdateLeft += fWaves[waveNumber][1];
+				} else if (waveSpeeds[waveNumber] > zeroTol) {
+					//right going
+					o_hUpdateRight += fWaves[waveNumber][0];
+					o_huUpdateRight += fWaves[waveNumber][1];
+				} else {
+					//TODO: this case should not happen mathematically, but it does. Where is the bug? Machine accuracy only?
+					o_hUpdateLeft += static_cast<real>(0.5) * fWaves[waveNumber][0];
+					o_huUpdateLeft += static_cast<real>(0.5) * fWaves[waveNumber][1];
+					o_hUpdateRight += static_cast<real>(0.5) * fWaves[waveNumber][0];
+					o_huUpdateRight += static_cast<real>(0.5) * fWaves[waveNumber][1];
+#if defined COUNTFLOPS
+					flops += 2 * COSTS_ADDS; 
+#endif
+				}
+#if defined COUNTFLOPS
+				flops += 2 * COSTS_ADDS;
+#endif
+			}
+			//compute maximum wave speed (-> CFL-condition)
+			waveSpeeds[0] = std::fabs(waveSpeeds[0]);
+			waveSpeeds[1] = std::fabs(waveSpeeds[1]);
+			waveSpeeds[2] = std::fabs(waveSpeeds[2]);
+			o_maxWaveSpeed = std::max(waveSpeeds[0], waveSpeeds[1]);
+			o_maxWaveSpeed = std::max(o_maxWaveSpeed, waveSpeeds[2]);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_SQRTS + 2 * COSTS_MULS + 3 * COSTS_FABSS + 2 * COSTS_MAXS;
+#endif
+		}
+	}
+#ifndef  VECTOR_NOVEC
+	void
+	computeNetUpdates_SIMD (
+			const real* const i_hLeft, const real* const i_hRight,
+			const real* const i_huLeft, const real* const i_huRight,
+			const real* const i_bLeft, const real* const i_bRight,
+			real* const o_hUpdateLeft, real* const o_hUpdateRight,
+			real* const o_huUpdateLeft, real* const o_huUpdateRight,
+			real& o_maxWaveSpeed
+	)
+	{
+		// store parameters
+		hLeft_v = LOADU(i_hLeft);	hRight_v = LOADU(i_hRight);
+		huLeft_v = LOADU(i_huLeft);	huRight_v = LOADU(i_huRight);
+		bLeft_v = LOADU(i_bLeft);	bRight_v = LOADU(i_bRight);
+		uLeft_v = ZEROV_R();		uRight_v = ZEROV_R();
+		riemannStructure_v = ZEROV_I();
+		wetDryState_v = ZEROV_I();
+		// reset net updates
+		real_vector hUpdateLeft_v = ZEROV_R();			real_vector hUpdateRight_v = ZEROV_R();
+		real_vector huUpdateLeft_v = ZEROV_R();			real_vector huUpdateRight_v = ZEROV_R();
+		// determine the wet/dry state and compute local variables correspondingly
+		determineWetDryState_SIMD();
+		const real_vector wetDryMask = CAST_INT_TO_REAL_V(NOTV_I(CMP_EQ_I(wetDryState_v, DryDry_V)));
+#if defined COUNTFLOPS
+		flops += 6 * COSTS_LOADU + 6 * COSTS_ZEROV_R + COSTS_NOTV_I + COSTS_CMP_EQ_I + COSTS_MOVEMASK;
+#endif
+		if (MOVEMASK(wetDryMask) != 0) {
+			// calculate values, that are used over and over again
+			sqrt_hLeft_v = SQRTV(hLeft_v);			sqrt_hRight_v = SQRTV(hRight_v);
+			sqrt_g_hLeft_v = MULV(sqrt_g_v, sqrt_hLeft_v);	sqrt_g_hRight_v = MULV(sqrt_g_v, sqrt_hRight_v);
+			// where to store the three waves
+			real_vector fWaves[3][2];
+			// and their speeds
+			real_vector waveSpeeds[3];
+			computeWaveDecomposition_SIMD(fWaves, waveSpeeds, wetDryMask);
+			for (int waveNumber = 0; waveNumber < 3; ++waveNumber) {
+				const real_vector leftMask = ANDV_R(CMP_LT(waveSpeeds[waveNumber], zeroTol_v_neg), wetDryMask);
+				const real_vector rightMask = ANDV_R(ANDNOTV_R(leftMask, CMP_GT(waveSpeeds[waveNumber], zeroTol_v)), wetDryMask);
+				const real_vector middleMask = ANDV_R(ANDV_R(NOTV_R(leftMask), NOTV_R(rightMask)), wetDryMask);
+				hUpdateLeft_v = BLENDV(hUpdateLeft_v, ADDV(hUpdateLeft_v, fWaves[waveNumber][0]), leftMask);
+				huUpdateLeft_v = BLENDV(huUpdateLeft_v, ADDV(huUpdateLeft_v, fWaves[waveNumber][1]), leftMask);
+				hUpdateRight_v = BLENDV(hUpdateRight_v, ADDV(hUpdateRight_v, fWaves[waveNumber][0]), rightMask);
+				huUpdateRight_v = BLENDV(huUpdateRight_v, ADDV(huUpdateRight_v, fWaves[waveNumber][1]), rightMask);
+#if defined COUNTFLOPS
+				flops += COSTS_CMP_LT + COSTS_CMP_GT + COSTS_ORV_R + 4 * COSTS_ADDV + 4 * COSTS_BLENDV + COSTS_MOVEMASK;
+#endif
+				if (MOVEMASK(middleMask) != 0) {
+					// this should not happen mathematically
+					static const real_vector half = SETV_R(static_cast<real>(0.5));
+					const real_vector half_hUpdate = MULV(half, fWaves[waveNumber][0]);
+					const real_vector half_huUpdate = MULV(half, fWaves[waveNumber][1]);
+					hUpdateLeft_v = BLENDV(hUpdateLeft_v, ADDV(hUpdateLeft_v, half_hUpdate), middleMask);
+					huUpdateLeft_v = BLENDV(huUpdateLeft_v, ADDV(huUpdateLeft_v, half_huUpdate), middleMask);
+					hUpdateRight_v = BLENDV(hUpdateRight_v, ADDV(hUpdateRight_v, half_hUpdate), middleMask);
+					huUpdateRight_v = BLENDV(huUpdateRight_v, ADDV(huUpdateRight_v, half_huUpdate), middleMask);
+#if defined COUNTFLOPS
+					flops += 4 * (COSTS_BLENDV + COSTS_ADDV + COSTS_MULV);
+#endif
+				}
+				assert(MOVEMASK(ORV_R(NOTV_R(wetDryMask), ORV_R(leftMask, ORV_R(rightMask, middleMask)))) == VECTOR_FULL_MASK);
+			}
+			assert(checkVector(hUpdateLeft_v));	assert(checkVector(hUpdateRight_v));
+			assert(checkVector(huUpdateLeft_v));	assert(checkVector(huUpdateRight_v));
+			STOREU(o_hUpdateLeft, BLENDV(ZEROV_R(), hUpdateLeft_v, wetDryMask));
+			STOREU(o_huUpdateLeft, BLENDV(ZEROV_R(), huUpdateLeft_v, wetDryMask));
+			STOREU(o_hUpdateRight, BLENDV(ZEROV_R(), hUpdateRight_v, wetDryMask));
+			STOREU(o_huUpdateRight, BLENDV(ZEROV_R(), huUpdateRight_v, wetDryMask));
+			//compute maximum wave speed (-> CFL-condition)
+			const real_vector maxEdgeSpeed_v = MAXV(
+				MAXV(
+					FABS(ANDV_R(waveSpeeds[0], wetDryMask)),
+					FABS(ANDV_R(waveSpeeds[1], wetDryMask))
+				),
+				FABS(ANDV_R(waveSpeeds[2], wetDryMask))
+			);
+			assert(checkVector(maxEdgeSpeed_v));
+			// This may be erroneously "optimized out", when compiled with -fstrict-aliasing
+			// but there is simply no other possibility to get the single components of a vector
+			const real* const p_speed = reinterpret_cast<const real*>(&maxEdgeSpeed_v);
+			const integer* const p_wetDry = reinterpret_cast<const integer*>(&wetDryState_v);
+			o_maxWaveSpeed = static_cast<real>(0);
+			for (size_t i = 0; i < VECTOR_LENGTH; ++i) {
+				if (p_wetDry[i] != 0) {
+					o_maxWaveSpeed = std :: max(o_maxWaveSpeed, p_speed[i]);
+				}
+			}
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_SQRTV + 2 * COSTS_MULV + 4 * (COSTS_STOREU + COSTS_BLENDV) + 2 * COSTS_MAXV + 3 * COSTS_FABS + VECTOR_LENGTH * COSTS_MAXS;
+#endif
+		} else {
+			STOREU(o_hUpdateLeft, ZEROV_R());		STOREU(o_huUpdateLeft, ZEROV_R());
+			STOREU(o_hUpdateRight, ZEROV_R());		STOREU(o_huUpdateRight, ZEROV_R());
+			o_maxWaveSpeed = static_cast<real>(0);
+#if defined COUNTFLOPS
+			flops += 4 * (COSTS_STOREU + COSTS_ZEROV_R);
+#endif
+		}
+	}
+#endif /* VECTOR_NOVEC */
+private:
+	/**
+	 * Determine the wet/dry state and set member variables accordingly.
+	 */
+	void
+	determineWetDryState ()
+	{
+		//compute speeds or set them to zero (dry cells)
+		if (hLeft > dryTol) {
+			uLeft = huLeft / hLeft;
+#if defined COUNTFLOPS
+			flops += COSTS_DIVS + COSTS_CMPS;
+#endif
+		} else {
+			bLeft += hLeft;
+			hLeft = huLeft = uLeft = 0;
+#if defined COUNTFLOPS
+			flops += COSTS_ADDS;
+#endif
+		}
+		if (hRight > dryTol) {
+			uRight = huRight / hRight;
+#if defined COUNTFLOPS
+			flops += COSTS_DIVS + COSTS_CMPS;
+#endif
+		} else {
+			bRight += hRight;
+			hRight = huRight = uRight = 0;
+#if defined COUNTFLOPS
+			flops += COSTS_ADDS;
+#endif
+		}
+#if defined COUNTFLOPS
+		flops += 2 * COSTS_ADDS;
+#endif
+		if (hLeft >= dryTol and hRight >= dryTol) {
+			//test for simple wet/wet case since this is most probably the
+			//most frequently executed branch
+			wetDryState = WetWet;
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_CMPS + COSTS_ANDS;
+#endif
+		} else if (hLeft < dryTol and hRight < dryTol) {
+			//check for the dry/dry-case
+			wetDryState = DryDry;
+#if defined COUNTFLOPS
+			flops += 4 * COSTS_CMPS + 2 * COSTS_ANDS;
+#endif
+		} else if (hLeft < dryTol and hRight + bRight > bLeft) {
+			//we have a shoreline: one cell dry, one cell wet
+			//check for simple inundation problems
+			// (=> dry cell lies lower than the wet cell)
+			wetDryState = DryWetInundation;
+#if defined COUNTFLOPS
+			flops += 6 * COSTS_CMPS + COSTS_ADDS + 3 * COSTS_ANDS;
+#endif
+		} else if (hRight < dryTol and hLeft + bLeft > bRight) {
+			wetDryState = WetDryInundation;
+#if defined COUNTFLOPS
+			flops += 8 * COSTS_CMPS + 2 * COSTS_ADDS + 4 * COSTS_ANDS;
+#endif
+		} else if (hLeft < dryTol) {
+			//dry cell lies higher than the wet cell
+			//lets check if the momentum is able to overcome the difference in height
+			//  => solve homogeneous Riemann-problem to determine the middle state height
+			//     which would arise if there is a wall (wall-boundary-condition)
+			//       \cite[ch. 6.8.2]{george2006finite})
+			//       \cite[ch. 5.2]{george2008augmented}
+			computeMiddleState(
+				hRight, hRight,
+				-uRight, uRight,
+				maxNumberOfNewtonIterations
+			);
+			if (hMiddle + bRight > bLeft) {
+				//momentum is large enough, continue with the original values
+				//          bLeft = hMiddle + bRight;
+				wetDryState = DryWetWallInundation;
+			} else {
+				//momentum is not large enough, use wall-boundary-values
+				hLeft = hRight;
+				uLeft = -uRight;
+				huLeft = -huRight;
+				bLeft = bRight = static_cast<real>(0);
+				wetDryState = DryWetWall;
+			}
+#if defined COUNTFLOPS
+			flops += 3 * COSTS_ADDS + 10 * COSTS_CMPS + 4 * COSTS_ANDS;
+#endif
+		} else if (hRight < dryTol) {
+			//lets check if the momentum is able to overcome the difference in height
+			//  => solve homogeneous Riemann-problem to determine the middle state height
+			//     which would arise if there is a wall (wall-boundary-condition)
+			//       \cite[ch. 6.8.2]{george2006finite})
+			//       \cite[ch. 5.2]{george2008augmented}
+			computeMiddleState(
+				hLeft, hLeft,
+				uLeft, -uLeft,
+				maxNumberOfNewtonIterations
+			);
+			if (hMiddle + bLeft > bRight) {
+				//momentum is large enough, continue with the original values
+				//          bRight = hMiddle + bLeft;
+				wetDryState = WetDryWallInundation;
+			} else {
+				hRight = hLeft;
+				uRight = -uLeft;
+				huRight = -huLeft;
+				bRight = bLeft = static_cast<real>(0);
+				wetDryState = WetDryWall;
+			}
+#if defined COUNTFLOPS
+			flops += 4 * COSTS_ADDS + 12 * COSTS_CMPS + 4 * COSTS_ANDS;
+#endif
+		} else {
+			//done with all cases
+			assert(false);
+		}
+		//limit the effect of the source term if there is a "wall"
+		//\cite[end of ch. 5.2?]{george2008augmented}
+		//\cite[rpn2ez_fast_geo.f][levequeclawpack]
+		if (wetDryState == DryWetWallInundation) {
+			bLeft = hRight + bRight;
+#if defined COUNTFLOPS
+			flops += COSTS_ADDS + COSTS_CMPS;
+#endif
+		} else if (wetDryState == WetDryWallInundation) {
+			bRight = hLeft + bLeft;
+#if defined COUNTFLOPS
+			flops += COSTS_ADDS + 2 * COSTS_CMPS;
+#endif
+		}
+	}
+#ifndef  VECTOR_NOVEC
+	void
+	determineWetDryState_SIMD ()
+	{
+		//compute speeds or set them to zero (dry cells)
+		const real_vector mask_left = CMP_GT(hLeft_v, dryTol_v);
+		uLeft_v = BLENDV(ZEROV_R(), DIVV(huLeft_v, hLeft_v), mask_left);
+		bLeft_v = BLENDV(ADDV(bLeft_v, hLeft_v), bLeft_v, mask_left);
+		hLeft_v = BLENDV(ZEROV_R(), hLeft_v, mask_left);
+		huLeft_v = BLENDV(ZEROV_R(), huLeft_v, mask_left);
+		const real_vector mask_right = CMP_GT(hRight_v, dryTol_v);
+		uRight_v = BLENDV(ZEROV_R(), DIVV(huRight_v, hRight_v), mask_right);
+		bRight_v = BLENDV(ADDV(bRight_v, hRight_v), bRight_v, mask_right);
+		hRight_v = BLENDV(ZEROV_R(), hRight_v, mask_right);
+		huRight_v = BLENDV(ZEROV_R(), huRight_v, mask_right);
+		const real_vector wetWetMask = ANDV_R(
+			CMP_GE(hLeft_v, dryTol_v),
+			CMP_GE(hRight_v, dryTol_v)
+		);
+		wetDryState_v = BLENDV_I(wetDryState_v, WetWet_V, wetWetMask);
+#if defined COUNTFLOPS
+		flops += 8 * COSTS_BLENDV + COSTS_BLENDV_I + 2 * COSTS_CMP_GT + 2 * COSTS_CMP_GE + 2 * COSTS_DIVV + 2 * COSTS_ADDV + 4 * COSTS_ZEROV_R + COSTS_ZEROV_I + COSTS_ANDV_R + COSTS_MOVEMASK;
+#endif
+		if (MOVEMASK(wetWetMask) == VECTOR_FULL_MASK) {
+			return;
+		}
+		real_vector mask_accumulator = wetWetMask;
+		{
+			const real_vector dryDryMask = ANDNOTV_R(
+				mask_accumulator,
+				ANDV_R(
+					CMP_LT(hLeft_v, dryTol_v),
+					CMP_LT(hRight_v, dryTol_v)
+				)
+			);
+			wetDryState_v = BLENDV_I(wetDryState_v, DryDry_V, dryDryMask);
+			mask_accumulator = ORV_R(mask_accumulator, dryDryMask);
+		}
+		{
+			const real_vector dryWetInMask = ANDNOTV_R(
+				mask_accumulator,
+				ANDV_R(
+					CMP_LT(hLeft_v, dryTol_v),
+					CMP_GT(ADDV(hRight_v, bRight_v), bLeft_v)
+				)
+			);
+			wetDryState_v = BLENDV_I(wetDryState_v, DryWetInundation_V, dryWetInMask);
+			mask_accumulator = ORV_R(mask_accumulator, dryWetInMask);
+		}
+		{
+			const real_vector wetDryInMask = ANDNOTV_R(
+				mask_accumulator,
+				ANDV_R(
+					CMP_LT(hRight_v, dryTol_v),
+					CMP_GT(ADDV(hLeft_v, bLeft_v), bRight_v)
+				)
+			);
+			wetDryState_v = BLENDV_I(wetDryState_v, WetDryInundation_V, wetDryInMask);
+			mask_accumulator = ORV_R(mask_accumulator, wetDryInMask);
+		}
+		const real_vector wallMask_left = ANDNOTV_R(
+			mask_accumulator,
+			CMP_LT(hLeft_v, dryTol_v)
+		);
+		static const real_vector minus_one = SETV_R(static_cast<real>(-1));
+		if (MOVEMASK(wallMask_left)) {
+			const real_vector uRight_v_neg = MULV(uRight_v, minus_one);
+			computeMiddleState_SIMD(
+				hRight_v, hRight_v,
+				uRight_v_neg, uRight_v,
+				wallMask_left,
+				maxNumberOfNewtonIterations
+			);
+			const real_vector middleMask = CMP_GT(ADDV(hMiddle_v, bRight_v), bLeft_v);
+			{
+				const real_vector dryWetWallInMask = ANDV_R(
+					wallMask_left,
+					middleMask
+				);
+				wetDryState_v = BLENDV_I(wetDryState_v, DryWetWallInundation_V, dryWetWallInMask);
+			}
+			{
+				const real_vector dryWetWallMask = ANDV_R(
+					wallMask_left,
+					NOTV_R(middleMask)
+				);
+				wetDryState_v = BLENDV_I(wetDryState_v, DryWetWall_V, dryWetWallMask);
+				hLeft_v = BLENDV(hLeft_v, hRight_v, dryWetWallMask);
+				uLeft_v = BLENDV(uLeft_v, uRight_v_neg, dryWetWallMask);
+				huLeft_v = BLENDV(huLeft_v, MULV(huRight_v, minus_one), dryWetWallMask);
+				bLeft_v = BLENDV(bLeft_v, ZEROV_R(), dryWetWallMask);
+				bRight_v = BLENDV(bRight_v, ZEROV_R(), dryWetWallMask);
+			}
+#if defined COUNTFLOPS
+			flops +=  2 * COSTS_MULV + COSTS_CMP_GT + COSTS_ADDV + 2 * COSTS_ANDV_R + COSTS_NOTV_R + 2 * COSTS_BLENDV_I + 5 * COSTS_BLENDV + 2 * COSTS_ZEROV_R;
+#endif
+		}
+		mask_accumulator = ORV_R(mask_accumulator, wallMask_left);
+		const real_vector wallMask_right = ANDNOTV_R(
+			mask_accumulator,
+			CMP_LT(hRight_v, dryTol_v)
+		);		
+		if (MOVEMASK(wallMask_right)) {
+			const real_vector uLeft_v_neg = MULV(uLeft_v, minus_one);
+			computeMiddleState_SIMD(
+				hLeft_v, hLeft_v,
+				uLeft_v, uLeft_v_neg,
+				wallMask_right,
+				maxNumberOfNewtonIterations
+			);
+			const real_vector middleMask = CMP_GT(ADDV(hMiddle_v, bLeft_v), bRight_v);
+			{
+				const real_vector wetDryWallInMask = ANDV_R(
+					wallMask_right,
+					middleMask
+				);
+				wetDryState_v = BLENDV_I(wetDryState_v, WetDryWallInundation_V, wetDryWallInMask);
+			}
+			{
+				const real_vector wetDryWallMask = ANDV_R(
+					wallMask_right,
+					NOTV_R(middleMask)
+				);
+				wetDryState_v = BLENDV_I(wetDryState_v, WetDryWall_V, wetDryWallMask);
+				hRight_v = BLENDV(hRight_v, hLeft_v, wetDryWallMask);
+				uRight_v = BLENDV(uRight_v, uLeft_v_neg, wetDryWallMask);
+				huRight_v = BLENDV(huRight_v, MULV(huLeft_v, minus_one), wetDryWallMask);
+				bRight_v = BLENDV(bRight_v, ZEROV_R(), wetDryWallMask);
+				bLeft_v = BLENDV(bLeft_v, ZEROV_R(), wetDryWallMask);
+			}
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MULV + COSTS_CMP_GT + COSTS_ADDV + 2 * COSTS_ANDV_R + COSTS_NOTV_R + 2 * COSTS_BLENDV_I + 5 * COSTS_BLENDV + 2 * COSTS_ZEROV_R;
+#endif
+		}
+		mask_accumulator = ORV_R(mask_accumulator, wallMask_right);
+		assert(MOVEMASK(mask_accumulator) == VECTOR_FULL_MASK);
+		const real_vector dryWetWallInMask = CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, DryWetWallInundation_V));
+		const real_vector wetDryWallInMask = CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, WetDryWallInundation_V));
+		bLeft_v = BLENDV(bLeft_v, ADDV(hRight_v, bRight_v), dryWetWallInMask);
+		bRight_v = BLENDV(bRight_v, ADDV(hLeft_v, bLeft_v), wetDryWallInMask);
+#if defined COUNTFLOPS
+		flops += 5 * COSTS_ANDNOTV_R + 3 * COSTS_ANDV_R + 6 * COSTS_CMP_LT + 2 * COSTS_CMP_GT + 3 * COSTS_BLENDV_I + 5 * COSTS_ORV_R + 2 * COSTS_MOVEMASK + 2 * COSTS_CMP_EQ_I + 2 * COSTS_BLENDV + 2 * COSTS_ADDV;
+#endif
+	}
+#endif /* VECTOR_NOVEC */
+	/**
+	 * Compute the augmented wave decomposition.
+	 *
+	 * @param o_fWaves will be set to: Decomposition into f-Waves.
+	 * @param o_waveSpeeds will be set to: speeds of the linearized waves (eigenvalues).
+	 */
+	inline void
+	computeWaveDecomposition (real o_fWaves[3][2], real o_waveSpeeds[3])
+	{
+		//compute eigenvalues of the jacobian matrices in states Q_{i-1} and Q_{i} (char. speeds)
+		real characteristicSpeeds[2];
+		characteristicSpeeds[0] = uLeft - sqrt_g_hLeft;
+		characteristicSpeeds[1] = uRight + sqrt_g_hRight;
+		//compute "Roe speeds"
+		real hRoe = static_cast<real>(0.5) * (hRight + hLeft);
+		real uRoe = uLeft * sqrt_hLeft + uRight * sqrt_hRight;
+		uRoe /= sqrt_hLeft + sqrt_hRight;
+		real roeSpeeds[2];
+		//optimization for dumb compilers
+		real sqrt_g_hRoe = std::sqrt(g * hRoe);
+		roeSpeeds[0] = uRoe - sqrt_g_hRoe;
+		roeSpeeds[1] = uRoe + sqrt_g_hRoe;
+		//compute the middle state of the homogeneous Riemann-Problem
+		if (wetDryState != WetDryWall and wetDryState != DryWetWall) {
+			//case WDW and DWW was computed in determineWetDryState already
+			computeMiddleState(
+				hLeft, hRight,
+				uLeft, uRight
+			);
+		}
+		//compute extended eindfeldt speeds (einfeldt speeds + middle state speeds)
+		//  \cite[ch. 5.2]{george2008augmented}, \cite[ch. 6.8]{george2006finite}
+		real extEinfeldtSpeeds[2] = {static_cast<real>(0), static_cast<real>(0)};
+		if (wetDryState == WetWet or wetDryState == WetDryWall or wetDryState == DryWetWall) {
+			extEinfeldtSpeeds[0] = std::min(characteristicSpeeds[0], roeSpeeds[0]);
+			extEinfeldtSpeeds[0] = std::min(extEinfeldtSpeeds[0], middleStateSpeeds[1]);
+			extEinfeldtSpeeds[1] = std::max(characteristicSpeeds[1], roeSpeeds[1]);
+			extEinfeldtSpeeds[1] = std::max(extEinfeldtSpeeds[1], middleStateSpeeds[0]);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MINS + 2 * COSTS_MAXS + 2 * COSTS_ORS + 3 * COSTS_CMPS;
+#endif
+		} else if (hLeft < dryTol) {
+			//ignore undefined speeds
+			extEinfeldtSpeeds[0] = std::min(roeSpeeds[0], middleStateSpeeds[1]);
+			extEinfeldtSpeeds[1] = std::max(characteristicSpeeds[1], roeSpeeds[1]);
+#if defined COUNTFLOPS
+			flops += 3 * COSTS_MINS + 3 * COSTS_MAXS + 2 * COSTS_ORS + 4 * COSTS_CMPS;
+#endif
+			assert(middleStateSpeeds[0] < extEinfeldtSpeeds[1]);
+		} else if (hRight < dryTol) {
+			//ignore undefined speeds
+			extEinfeldtSpeeds[0] = std::min(characteristicSpeeds[0], roeSpeeds[0]);
+			extEinfeldtSpeeds[1] = std::max(roeSpeeds[1], middleStateSpeeds[0]);
+#if defined COUNTFLOPS
+			flops += 4 * COSTS_MINS + 4 * COSTS_MAXS + 2 * COSTS_ORS + 5 * COSTS_CMPS;
+#endif
+			assert(middleStateSpeeds[1] > extEinfeldtSpeeds[0]);
+		} else {
+			assert(false);
+		}
+		//HLL middle state
+		//  \cite[theorem 3.1]{george2006finite}, \cite[ch. 4.1]{george2008augmented}
+		real hLLMiddleHeight = huLeft - huRight + extEinfeldtSpeeds[1] * hRight - extEinfeldtSpeeds[0] * hLeft;
+		hLLMiddleHeight /= extEinfeldtSpeeds[1] - extEinfeldtSpeeds[0];
+		hLLMiddleHeight = std::max(hLLMiddleHeight, static_cast<real>(0));
+		//define eigenvalues
+		real eigenValues[3];
+		eigenValues[0] = extEinfeldtSpeeds[0];
+		//eigenValues[1] --> corrector wave
+		eigenValues[2] = extEinfeldtSpeeds[1];
+		//define eigenvectors
+		real eigenVectors[3][3];
+		//set first and third eigenvector
+		eigenVectors[0][0] = static_cast<real>(1);
+		eigenVectors[0][2] = static_cast<real>(1);
+		eigenVectors[1][0] = eigenValues[0];
+		eigenVectors[1][2] = eigenValues[2];
+		eigenVectors[2][0] = eigenValues[0] * eigenValues[0];
+		eigenVectors[2][2] = eigenValues[2] * eigenValues[2];
+		//compute rarefaction corrector wave
+		//  \cite[ch. 6.7.2]{george2006finite}, \cite[ch. 5.1]{george2008augmented}
+//		bool strongRarefaction = false;
+		//set 2nd eigenvector
+		eigenValues[1] = static_cast<real>(0.5) * (eigenValues[0] + eigenValues[2]);
+		eigenVectors[0][1] = 0.;
+		eigenVectors[1][1] = 0.;
+		eigenVectors[2][1] = 1.;
+		//compute the jump in state
+		real rightHandSide[3];
+		rightHandSide[0] = hRight - hLeft;
+		rightHandSide[1] = huRight - huLeft;
+		rightHandSide[2] = huRight * uRight + static_cast<real>(0.5) * g * hRight * hRight
+				- (huLeft * uLeft + static_cast<real>(0.5) * g * hLeft * hLeft);
+		//compute steady state wave
+		//  \cite[ch. 4.2.1 \& app. A]{george2008augmented}, \cite[ch. 6.2 \& ch. 4.4]{george2006finite}
+		real steadyStateWave[2];
+		real hBar = (hLeft + hRight) * static_cast<real>(0.5);
+		steadyStateWave[0] = -(bRight - bLeft);
+		steadyStateWave[1] = -g * hBar * (bRight - bLeft);
+		//preserve depth-positivity
+		//  \cite[ch. 6.5.2]{george2006finite}, \cite[ch. 4.2.3]{george2008augmented}
+		if (eigenValues[0] < -zeroTol and eigenValues[2] > zeroTol) {
+			//subsonic
+			steadyStateWave[0] = std::max(steadyStateWave[0], hLLMiddleHeight * (eigenValues[2] - eigenValues[0]) / eigenValues[0]);
+			steadyStateWave[0] = std::min(steadyStateWave[0], hLLMiddleHeight * (eigenValues[2] - eigenValues[0]) / eigenValues[2]);
+#if defined COUNTFLOPS
+			flops += COSTS_MAXS + COSTS_MINS + 2 * COSTS_CMPS + COSTS_ANDS;
+#endif
+		} else if (eigenValues[0] > zeroTol) {
+			//supersonic right TODO: motivation?
+			steadyStateWave[0] = std::max(steadyStateWave[0], -hLeft);
+			steadyStateWave[0] = std::min(steadyStateWave[0], hLLMiddleHeight * (eigenValues[2] - eigenValues[0]) / eigenValues[0]);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MAXS + 2 * COSTS_MINS + 3 * COSTS_CMPS + COSTS_ANDS;
+#endif
+		} else if (eigenValues[2] < -zeroTol) {
+			//supersonic left TODO: motivation?
+			steadyStateWave[0] = std::max(steadyStateWave[0], hLLMiddleHeight * (eigenValues[2] - eigenValues[0]) / eigenValues[2]);
+			steadyStateWave[0] = std::min(steadyStateWave[0], hRight);
+#if defined COUNTFLOPS
+			flops += 3 * COSTS_MAXS + 3 * COSTS_MINS + 4 * COSTS_CMPS + COSTS_ANDS;
+#endif
+		}
+		//Limit the effect of the source term
+		//  \cite[ch. 6.4.2]{george2006finite}
+		steadyStateWave[1] = std::min(steadyStateWave[1], g * std::max(-hLeft * (bRight - bLeft), -hRight * (bRight - bLeft)));
+		steadyStateWave[1] = std::max(steadyStateWave[1], g * std::min(-hLeft * (bRight - bLeft), -hRight * (bRight - bLeft)));
+		rightHandSide[0] -= steadyStateWave[0];
+		//rightHandSide[1]: no source term
+		rightHandSide[2] -= steadyStateWave[1];
+		//everything is ready, solve the equations!
+		real beta[3];
+		solveLinearEquation(eigenVectors, rightHandSide, beta);
+		//compute f-waves and wave-speeds
+		if (wetDryState == WetDryWall) {
+			//zero ghost updates (wall boundary)
+			//care about the left going wave (0) only
+			o_fWaves[0][0] = beta[0] * eigenVectors[1][0];
+			o_fWaves[0][1] = beta[0] * eigenVectors[2][0];
+			//set the rest to zero
+			o_fWaves[1][0] = o_fWaves[1][1] = static_cast<real>(0);
+			o_fWaves[2][0] = o_fWaves[2][1] = static_cast<real>(0);
+			o_waveSpeeds[0] = eigenValues[0];
+			o_waveSpeeds[1] = o_waveSpeeds[2] = static_cast<real>(0);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MULS;
+#endif
+			assert(eigenValues[0] < zeroTol);
+		} else if (wetDryState == DryWetWall) {
+			//zero ghost updates (wall boundary)
+			//care about the right going wave (2) only
+			o_fWaves[2][0] = beta[2] * eigenVectors[1][2];
+			o_fWaves[2][1] = beta[2] * eigenVectors[2][2];
+			//set the rest to zero
+			o_fWaves[0][0] = o_fWaves[0][1] = static_cast<real>(0);
+			o_fWaves[1][0] = o_fWaves[1][1] = static_cast<real>(0);
+			o_waveSpeeds[2] = eigenValues[2];
+			o_waveSpeeds[0] = o_waveSpeeds[1] = static_cast<real>(0);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MULS;
+#endif
+			assert(eigenValues[2] > -zeroTol);
+		} else {
+			//compute f-waves (default)
+			for (int waveNumber = 0; waveNumber < 3; waveNumber++) {
+				o_fWaves[waveNumber][0] = beta[waveNumber] * eigenVectors[1][waveNumber]; //select 2nd and
+				o_fWaves[waveNumber][1] = beta[waveNumber] * eigenVectors[2][waveNumber]; //3rd component of the augmented decomposition
+#if defined COUNTFLOPS
+				flops += 2 * COSTS_MULS;
+#endif
+			}
+			o_waveSpeeds[0] = eigenValues[0];
+			o_waveSpeeds[1] = eigenValues[1];
+			o_waveSpeeds[2] = eigenValues[2];
+		}
+#if defined COUNTFLOPS
+		flops += 15 * COSTS_SUBS + 9 * COSTS_ADDS + 28 * COSTS_MULS + 2 * COSTS_DIVS + COSTS_SQRTS + 2 * COSTS_CMPS + COSTS_ANDS + 3 * COSTS_MAXS + 2 * COSTS_MINS;
+#endif
+	}
+#ifndef  VECTOR_NOVEC
+	inline void
+	computeWaveDecomposition_SIMD (real_vector o_fWaves[3][2], real_vector o_waveSpeeds[3], const real_vector wetDryMask)
+	{
+		static const real_vector half = SETV_R(static_cast<real>(0.5));
+		//compute eigenvalues of the jacobian matrices in states Q_{i-1} and Q_{i} (char. speeds)
+		const real_vector characteristicSpeeds[2] = {
+			SUBV(uLeft_v, sqrt_g_hLeft_v),
+			ADDV(uRight_v, sqrt_g_hRight_v)
+		};
+		//compute "Roe speeds"
+		const real_vector hRoe = MULV(ADDV(hRight_v, hLeft_v), half);
+		const real_vector uRoe = BLENDV(
+			ZEROV_R(),
+			DIVV(
+				ADDV(
+					MULV(uLeft_v, sqrt_hLeft_v),
+					MULV(uRight_v, sqrt_hRight_v)
+				),
+				ADDV(
+					sqrt_hLeft_v,
+					sqrt_hRight_v
+				)
+			),
+			wetDryMask
+		);
+		const real_vector sqrt_g_hRoe = SQRTV(MULV(g_v, hRoe));
+		const real_vector roeSpeeds[2] = {
+			SUBV(uRoe, sqrt_g_hRoe),
+			ADDV(uRoe, sqrt_g_hRoe)
+		};
+		const real_vector middleState_blendMask = ANDV_R(
+			NOTV_R(
+				ORV_R(
+					CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, WetDryWall_V)),
+					CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, DryWetWall_V))
+				)
+			),
+			wetDryMask
+		);
+		//compute the middle state of the homogeneous Riemann-Problem
+		computeMiddleState_SIMD(
+			hLeft_v, hRight_v,
+			uLeft_v, uRight_v,
+			middleState_blendMask
+		);
+		//compute extended eindfeldt speeds (einfeldt speeds + middle state speeds)
+		//  \cite[ch. 5.2]{george2008augmented}, \cite[ch. 6.8]{george2006finite}
+		real_vector extEinfeldtSpeeds[2] = {
+			ZEROV_R(),
+			ZEROV_R()
+		};
+		extEinfeldtSpeeds[0] = MINV(characteristicSpeeds[0], roeSpeeds[0]);
+		extEinfeldtSpeeds[1] = MAXV(roeSpeeds[1], middleStateSpeeds_v[0]);
+		const real_vector speeds_t0[2] = {
+			MINV(extEinfeldtSpeeds[0], middleStateSpeeds_v[1]),
+			MAXV(extEinfeldtSpeeds[1], characteristicSpeeds[1])
+		};
+		const real_vector speeds_t1[2] = {
+			MINV(roeSpeeds[0], middleStateSpeeds_v[1]),
+			MAXV(characteristicSpeeds[1], roeSpeeds[1])
+		};
+		const real_vector wetMask = ANDV_R(
+			CAST_INT_TO_REAL_V(
+				ORV_I(
+					ORV_I(
+						CMP_EQ_I(wetDryState_v, WetWet_V),
+						CMP_EQ_I(wetDryState_v, WetDryWall_V)
+					),
+					CMP_EQ_I(wetDryState_v, DryWetWall_V)
+				)
+			),
+			wetDryMask
+		);
+		real_vector mask_accumulator = wetMask;
+		extEinfeldtSpeeds[0] = BLENDV(extEinfeldtSpeeds[0], speeds_t0[0], wetMask);
+		extEinfeldtSpeeds[1] = BLENDV(extEinfeldtSpeeds[1], speeds_t0[1], wetMask);
+		const real_vector leftDryMask = ANDV_R(
+			ANDNOTV_R(
+				mask_accumulator,
+				CMP_LT(hLeft_v, dryTol_v)
+			),
+			wetDryMask
+		);
+		extEinfeldtSpeeds[0] = BLENDV(extEinfeldtSpeeds[0], speeds_t1[0], leftDryMask);
+		extEinfeldtSpeeds[1] = BLENDV(extEinfeldtSpeeds[1], speeds_t1[1], leftDryMask);
+		assert(MOVEMASK(ANDV_R(leftDryMask, CMP_GE(middleStateSpeeds_v[0], extEinfeldtSpeeds[1]))) == 0);
+#ifndef NDEBUG
+		const real_vector rightDryMask = ANDV_R(wetDryMask, ANDNOTV_R(ORV_R(wetMask, leftDryMask), CMP_LT(hRight_v, dryTol_v)));
+		assert(MOVEMASK(ANDV_R(rightDryMask, CMP_LE(middleStateSpeeds_v[1], extEinfeldtSpeeds[0]))) == 0);
+		assert(MOVEMASK(ORV_R(NOTV_R(wetDryMask), ORV_R(ORV_R(wetMask, leftDryMask), rightDryMask))) == VECTOR_FULL_MASK);
+#endif
+		//HLL middle state
+		//  \cite[theorem 3.1]{george2006finite}, \cite[ch. 4.1]{george2008augmented}
+		const real_vector hLLMiddleHeight = MAXV(
+			DIVV(
+				SUBV(
+					ADDV(
+						SUBV(huLeft_v, huRight_v),
+						MULV(extEinfeldtSpeeds[1], hRight_v)
+					),
+					MULV(extEinfeldtSpeeds[0], hLeft_v)
+				),
+				SUBV(extEinfeldtSpeeds[1], extEinfeldtSpeeds[0])
+			),
+			ZEROV_R()
+		);
+#pragma message "strongRarefaction set to false; further investigation needed, about senseless initializations of eigenValues[1]"
+//		static const bool strongRarefaction = false;
+		static const real_vector one = SETV_R(static_cast<real>(1));
+		const real_vector eigenValues[3] = {
+			extEinfeldtSpeeds[0],
+			MULV(half, ADDV(extEinfeldtSpeeds[0], extEinfeldtSpeeds[1])),
+			extEinfeldtSpeeds[1]
+		};
+		assert(checkVector(eigenValues[0]));	assert(checkVector(eigenValues[1]));	assert(checkVector(eigenValues[2]));
+		const real_vector eigenVectors[3][3] = {
+			{
+				one,
+				ZEROV_R(),
+				one
+			},
+			{
+				eigenValues[0],
+				ZEROV_R(), 
+				eigenValues[2]
+			},
+			{
+				MULV(eigenValues[0], eigenValues[0]),
+				one,
+				MULV(eigenValues[2], eigenValues[2])
+			}
+		};
+		assert(checkVector(eigenVectors[0][0]));	assert(checkVector(eigenVectors[0][1]));	assert(checkVector(eigenVectors[0][2]));
+		assert(checkVector(eigenVectors[1][0]));	assert(checkVector(eigenVectors[1][1]));	assert(checkVector(eigenVectors[1][2]));
+		assert(checkVector(eigenVectors[2][0]));	assert(checkVector(eigenVectors[2][1]));	assert(checkVector(eigenVectors[2][2]));
+		//compute steady state wave
+		//  \cite[ch. 4.2.1 \& app. A]{george2008augmented}, \cite[ch. 6.2 \& ch. 4.4]{george2006finite}
+		const real_vector hBar = MULV(ADDV(hLeft_v, hRight_v), half);
+		const real_vector difB = SUBV(bLeft_v, bRight_v);
+		real_vector steadyStateWave[2] = {
+			difB,
+			MULV(difB, MULV(hBar, g_v))
+		};
+		//preserve depth-positivity
+		//  \cite[ch. 6.5.2]{george2006finite}, \cite[ch. 4.2.3]{george2008augmented}
+		mask_accumulator = ZEROV_R();
+		const real_vector subsonic_mask = ANDV_R(ANDV_R(CMP_LT(eigenValues[0], zeroTol_v_neg), CMP_GT(eigenValues[2], zeroTol_v)), wetDryMask);
+		mask_accumulator = ORV_R(mask_accumulator, subsonic_mask);
+		const real_vector supersonic_right_mask = ANDV_R(ANDNOTV_R(mask_accumulator, CMP_GT(eigenValues[0], zeroTol_v)), wetDryMask);
+		mask_accumulator = ORV_R(mask_accumulator, supersonic_right_mask);
+		const real_vector supersonic_left_mask = ANDV_R(ANDNOTV_R(mask_accumulator, CMP_LT(eigenValues[2], zeroTol_v_neg)), wetDryMask);
+		mask_accumulator = ORV_R(mask_accumulator, supersonic_left_mask);
+		if (MOVEMASK(mask_accumulator)) {
+			static const real_vector minus_one = SETV_R(static_cast<real>(-1));
+			const real_vector hLeft_neg = MULV(hLeft_v, minus_one);
+			const real_vector depth_positivity_t0 = MULV(hLLMiddleHeight, SUBV(eigenValues[2], eigenValues[0]));
+			const real_vector depth_positivity_t1 = DIVV(depth_positivity_t0, eigenValues[0]);
+			const real_vector depth_positivity_t2 = DIVV(depth_positivity_t0, eigenValues[2]);
+			const real_vector depth_positivity_max1 = MAXV(steadyStateWave[0], depth_positivity_t1);
+			const real_vector depth_positivity_max2 = MAXV(steadyStateWave[0], hLeft_neg);
+			const real_vector depth_positivity_max3 = MAXV(steadyStateWave[0], depth_positivity_t2);
+			const real_vector depth_positivity_min1 = MINV(depth_positivity_max1, depth_positivity_t2);
+			const real_vector depth_positivity_min2 = MINV(depth_positivity_max2, depth_positivity_t1);
+			const real_vector depth_positivity_min3 = MINV(depth_positivity_max3, hRight_v);
+			steadyStateWave[0] = BLENDV(steadyStateWave[0], depth_positivity_min1, subsonic_mask);
+			steadyStateWave[0] = BLENDV(steadyStateWave[0], depth_positivity_min2, supersonic_right_mask);
+			steadyStateWave[0] = BLENDV(steadyStateWave[0], depth_positivity_min3, supersonic_left_mask);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MULV + COSTS_SUBV + 2 * COSTS_DIVV + 3 * (COSTS_MAXV + COSTS_MINV + COSTS_BLENDV);
+#endif
+		}
+		const real_vector difB_left = MULV(hLeft_v, difB);
+		const real_vector difB_right = MULV(hRight_v, difB);
+		const real_vector difB_max = MULV(
+			g_v,
+			MAXV(difB_left, difB_right)
+		);
+		const real_vector difB_min = MULV(
+			g_v,
+			MINV(difB_left, difB_right)
+		);
+		//Limit the effect of the source term
+		//  \cite[ch. 6.4.2]{george2006finite}
+		steadyStateWave[1] = MINV(steadyStateWave[1], difB_max);
+		steadyStateWave[1] = MAXV(steadyStateWave[1], difB_min);
+		// TODO: Optimize
+		const real_vector rightHandSide[3] = {
+			SUBV(
+				SUBV(hRight_v, hLeft_v),
+				steadyStateWave[0]
+			),
+			SUBV(huRight_v, huLeft_v),
+			SUBV(
+				SUBV(
+					ADDV(
+						MULV(huRight_v, uRight_v),
+						MULV(
+							MULV(
+								MULV(half, g_v),
+								hRight_v
+							),
+							hRight_v
+						)
+					),
+					ADDV(
+						MULV(huLeft_v, uLeft_v),
+						MULV(
+							MULV(
+								MULV(half, g_v),
+								hLeft_v
+							),
+							hLeft_v
+						)
+					)
+				),
+				steadyStateWave[1]
+			)
+		};
+		assert(checkVector(rightHandSide[0]));		assert(checkVector(rightHandSide[1]));		assert(checkVector(rightHandSide[2]));
+		real_vector beta[3];
+		//everything is ready, solve the equations!
+		solveLinearEquation_SIMD(eigenVectors, rightHandSide, beta);
+#if defined COUNTFLOPS
+		flops += 30 * COSTS_MULS + 9 * COSTS_SUBS + 8 * COSTS_ADDS + COSTS_DIVS;
+#endif
+		beta[0] = BLENDV(ZEROV_R(), beta[0], wetDryMask);
+		beta[1] = BLENDV(ZEROV_R(), beta[1], wetDryMask);
+		beta[2] = BLENDV(ZEROV_R(), beta[2], wetDryMask);
+		assert(checkVector(beta[0]));			assert(checkVector(beta[1]));			assert(checkVector(beta[2]));
+		//compute f-waves and wave-speeds
+		const real_vector DryWetWall_Mask = ANDV_R(CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, DryWetWall_V)), wetDryMask);
+		const integer DryWetWall_State = MOVEMASK(DryWetWall_Mask);
+		const real_vector WetDryWall_Mask = ANDV_R(ANDNOTV_R(DryWetWall_Mask, CAST_INT_TO_REAL_V(CMP_EQ_I(wetDryState_v, WetDryWall_V))), wetDryMask);
+		const integer WetDryWall_State = MOVEMASK(WetDryWall_Mask);
+		const real_vector noWall_Mask = ANDNOTV_R(ORV_R(DryWetWall_Mask, WetDryWall_Mask), wetDryMask);
+		const integer noWall_State = MOVEMASK(noWall_Mask);
+		assert(MOVEMASK(ORV_R(NOTV_R(wetDryMask), ORV_R(DryWetWall_Mask, ORV_R(WetDryWall_Mask, noWall_Mask)))) == VECTOR_FULL_MASK);
+#if defined COUNTFLOPS
+		flops += 6 * COSTS_ZEROV_R + 10 * COSTS_ADDV + 23 * COSTS_MULV + 2 * COSTS_DIVV + COSTS_SQRTV + 11 * COSTS_SUBV + 5 * COSTS_MINV + 6 * COSTS_MAXV + 3 * COSTS_ORV_I + 5 * COSTS_CMP_EQ_I + 4 * COSTS_BLENDV + 5 * COSTS_ANDNOTV_R + 3 * COSTS_CMP_LT + 2 * COSTS_CMP_GT + 4 * COSTS_ORV_I + COSTS_ANDV_R + COSTS_NOTV_I + 4 * COSTS_MOVEMASK;
+#endif
+		if (noWall_State) {
+			//compute f-waves (default)
+			for (int waveNumber = 0; waveNumber < 3; ++waveNumber) {
+				o_fWaves[waveNumber][0] = BLENDV(o_fWaves[waveNumber][0], MULV(beta[waveNumber], eigenVectors[1][waveNumber]), noWall_Mask);
+				o_fWaves[waveNumber][1] = BLENDV(o_fWaves[waveNumber][1], MULV(beta[waveNumber], eigenVectors[2][waveNumber]), noWall_Mask);
+			}
+			o_waveSpeeds[0] = BLENDV(o_waveSpeeds[0], eigenValues[0], noWall_Mask);
+			o_waveSpeeds[1] = BLENDV(o_waveSpeeds[1], eigenValues[1], noWall_Mask);
+			o_waveSpeeds[2] = BLENDV(o_waveSpeeds[2], eigenValues[2], noWall_Mask);
+#if defined COUNTFLOPS
+			flops += 6 * COSTS_MULV + 5 * COSTS_BLENDV;
+#endif
+		}
+		if (WetDryWall_State) {
+			//zero ghost updates (wall boundary)
+			//care about the left going wave (0) only
+			o_fWaves[0][0] = BLENDV(o_fWaves[0][0], MULV(beta[0], eigenVectors[1][0]), WetDryWall_Mask);
+			o_fWaves[0][1] = BLENDV(o_fWaves[0][1], MULV(beta[0], eigenVectors[2][0]), WetDryWall_Mask);
+			o_waveSpeeds[0] = BLENDV(o_waveSpeeds[0], eigenValues[0], WetDryWall_Mask);
+			// set the rest to zero
+			o_fWaves[1][0] = BLENDV(o_fWaves[1][0], ZEROV_R(), WetDryWall_Mask);
+			o_fWaves[1][1] = BLENDV(o_fWaves[1][1], ZEROV_R(), WetDryWall_Mask);
+			o_fWaves[2][0] = BLENDV(o_fWaves[2][0], ZEROV_R(), WetDryWall_Mask);
+			o_fWaves[2][1] = BLENDV(o_fWaves[2][1], ZEROV_R(), WetDryWall_Mask);
+			o_waveSpeeds[1] = BLENDV(o_waveSpeeds[1], ZEROV_R(), WetDryWall_Mask);
+			o_waveSpeeds[2] = BLENDV(o_waveSpeeds[2], ZEROV_R(), WetDryWall_Mask);
+#if defined COUNTFLOPS
+			flops += 9 * COSTS_BLENDV + 2 * COSTS_MULV + 6 * COSTS_ZEROV_R;
+#endif
+			assert(MOVEMASK(ANDV_R(WetDryWall_Mask, CMP_GE(eigenValues[0], zeroTol_v))) == 0);
+		}
+		if (DryWetWall_State) {
+			//zero ghost updates (wall boundary)
+			//care about the right going wave (2) only
+			o_fWaves[2][0] = BLENDV(o_fWaves[2][0], MULV(beta[2], eigenVectors[1][2]), DryWetWall_Mask);
+			o_fWaves[2][1] = BLENDV(o_fWaves[2][1], MULV(beta[2], eigenVectors[2][2]), DryWetWall_Mask);
+			o_waveSpeeds[2] = BLENDV(o_waveSpeeds[2], eigenValues[2], DryWetWall_Mask);
+			// set the rest to zero
+			o_fWaves[0][0] = BLENDV(o_fWaves[0][0], ZEROV_R(), DryWetWall_Mask);
+			o_fWaves[0][1] = BLENDV(o_fWaves[0][1], ZEROV_R(), DryWetWall_Mask);
+			o_fWaves[1][0] = BLENDV(o_fWaves[1][0], ZEROV_R(), DryWetWall_Mask);
+			o_fWaves[1][1] = BLENDV(o_fWaves[1][1], ZEROV_R(), DryWetWall_Mask);
+			o_waveSpeeds[0] = BLENDV(o_waveSpeeds[0], ZEROV_R(), DryWetWall_Mask);
+			o_waveSpeeds[1] = BLENDV(o_waveSpeeds[1], ZEROV_R(), DryWetWall_Mask);
+#if defined COUNTFLOPS
+			flops += 9 * COSTS_BLENDV + 6 * COSTS_ZEROV_R + 2 * COSTS_MULV;
+#endif
+			assert(MOVEMASK(ANDV_R(DryWetWall_Mask, CMP_LE(eigenValues[2], zeroTol_v_neg))) == 0);
+		}
+	}
+#endif /* VECTOR_NOVEC */
+	/**
+	 * Computes the middle state of the homogeneous Riemann-problem.
+	 *   -> (\cite[ch. 13]{leveque2002finite})
+	 *
+	 * @param i_hLeft height on the left side of the edge.
+	 * @param i_hRight height on the right side of the edge.
+	 * @param i_uLeft velocity on the left side of the edge.
+	 * @param i_uRight velocity on the right side of the edge.
+	 * @param i_huLeft momentum on the left side of the edge.
+	 * @param i_huRight momentum on the right side of the edge.
+	 * @param i_maxNumberOfNewtonIterations maximum number of Newton iterations.
+	 */
+	inline void
+	computeMiddleState (
+		const real &i_hLeft, const real &i_hRight,
+		const real &i_uLeft, const real &i_uRight,
+		const int i_maxNumberOfNewtonIterations = 1
+	)
+	{
+		//set everything to zero
+		hMiddle = static_cast<real>(0);
+		middleStateSpeeds[0] = static_cast<real>(0);
+		middleStateSpeeds[1] = static_cast<real>(0);
+		//compute local square roots
+		//(not necessarily the same ones as in computeNetUpdates!)
+		real l_sqrt_g_hRight = std::sqrt(g * i_hRight);
+		real l_sqrt_g_hLeft = std::sqrt(g * i_hLeft);
+		//single rarefaction in the case of a wet/dry interface
+		if (i_hLeft < dryTol) {
+			middleStateSpeeds[1] = middleStateSpeeds[0] = i_uRight - static_cast<real>(2) * l_sqrt_g_hRight;
+			riemannStructure = DrySingleRarefaction;
+#if defined COUNTFLOPS
+			flops += COSTS_CMPS + COSTS_SUBS + COSTS_MULS;
+#endif
+			return;
+		} else if (i_hRight < dryTol) {
+			middleStateSpeeds[0] = middleStateSpeeds[1] = i_uLeft + static_cast<real>(2) * l_sqrt_g_hLeft;
+			riemannStructure = SingleRarefactionDry;
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_CMPS + COSTS_SUBS + COSTS_MULS;
+#endif
+			return;
+		}
+		//determine the wave structure of the Riemann-problem
+		riemannStructure = determineRiemannStructure(i_hLeft, i_hRight, i_uLeft, i_uRight);
+		//will be computed later
+		real sqrt_g_hMiddle = static_cast<real>(0);
+		if (riemannStructure == ShockShock) {
+			hMiddle = std::min(i_hLeft, i_hRight);
+			real l_sqrtTermH[2] = {0, 0};
+			for (int i = 0; i < i_maxNumberOfNewtonIterations; i++) {
+				l_sqrtTermH[0] = std::sqrt(static_cast<real>(0.5) * g * ((hMiddle + i_hLeft) / (hMiddle * i_hLeft)));
+				l_sqrtTermH[1] = std::sqrt(static_cast<real>(0.5) * g * ((hMiddle + i_hRight) / (hMiddle * i_hRight)));
+				real phi = i_uRight - i_uLeft + (hMiddle - i_hLeft) * l_sqrtTermH[0] + (hMiddle - i_hRight) * l_sqrtTermH[1];
+#if defined COUNTFLOPS
+				flops += 2 * COSTS_SQRTS + 8 * COSTS_MULS + 4 * COSTS_ADDS + 2 * COSTS_DIVS + 3 * COSTS_SUBS + COSTS_CMPS;
+#endif
+				if (std::fabs(phi) < newtonTol) {
+					break;
+				}
+				real derivativePhi = l_sqrtTermH[0] + l_sqrtTermH[1]
+						- static_cast<real>(0.25) * g * (hMiddle - i_hLeft) / (l_sqrtTermH[0] * hMiddle * hMiddle)
+						- static_cast<real>(0.25) * g * (hMiddle - i_hRight) / (l_sqrtTermH[1] * hMiddle * hMiddle);
+				hMiddle = hMiddle - phi / derivativePhi; //Newton step
+				assert(hMiddle >= dryTol);
+#if defined COUNTFLOPS
+				flops += COSTS_ADDS + 5 * COSTS_SUBS + 8 * COSTS_MULS + COSTS_DIVS;
+#endif
+			}
+			sqrt_g_hMiddle = std::sqrt(g * hMiddle);
+#if defined COUNTFLOPS
+			flops += COSTS_MINS + COSTS_SQRTS + COSTS_MULS;
+#endif
+		}
+		if (riemannStructure == RarefactionRarefaction) {
+			hMiddle = std::max(static_cast<real>(0), i_uLeft - i_uRight + static_cast<real>(2) * (l_sqrt_g_hLeft + l_sqrt_g_hRight));
+			hMiddle = static_cast<real>(1) / (static_cast<real>(16) * g) * hMiddle * hMiddle;
+			sqrt_g_hMiddle = std::sqrt(g * hMiddle);
+#if defined COUNTFLOPS
+			flops += COSTS_MAXS + COSTS_SQRTS + COSTS_SUBS + 2 * COSTS_ADDS + 5 * COSTS_MULS + COSTS_DIVS;
+#endif
+		}
+		if (riemannStructure == ShockRarefaction or riemannStructure == RarefactionShock) {
+			real hMin, hMax;
+			if (riemannStructure == ShockRarefaction) {
+				hMin = i_hLeft;
+				hMax = i_hRight;
+			} else {
+				hMin = i_hRight;
+				hMax = i_hLeft;
+			}
+			hMiddle = hMin;
+			sqrt_g_hMiddle = std::sqrt(g * hMiddle);
+			real sqrt_g_hMax = std::sqrt(g * hMax);
+			for (int i = 0; i < i_maxNumberOfNewtonIterations; i++) {
+				real sqrtTermHMin = std::sqrt(static_cast<real>(0.5) * g * ((hMiddle + hMin) / (hMiddle * hMin)));
+				real phi = i_uRight - i_uLeft + (hMiddle - hMin) * sqrtTermHMin + static_cast<real>(2) * (sqrt_g_hMiddle - sqrt_g_hMax);
+#if defined COUNTFLOPS
+				flops += COSTS_SQRTS + 5 * COSTS_MULS + 3 * COSTS_ADDS + 3 * COSTS_SUBS + COSTS_DIVS + COSTS_CMPS;
+#endif
+				if (std::fabs(phi) < newtonTol) {
+					break;
+				}
+				real derivativePhi = sqrtTermHMin - static_cast<real>(0.25) * g * (hMiddle - hMin) / (hMiddle * hMiddle * sqrtTermHMin) + sqrt_g / sqrt_g_hMiddle;
+				hMiddle = hMiddle - phi / derivativePhi; //Newton step
+				sqrt_g_hMiddle = std::sqrt(g * hMiddle);
+#if defined COUNTFLOPS
+				flops += 3 * COSTS_SUBS + 5 * COSTS_MULS + 3 * COSTS_DIVS + COSTS_SQRTS;
+#endif
+			}
+#if defined COUNTFLOPS
+			flops += COSTS_CMPS + 2 * COSTS_SQRTS + 2 * COSTS_MULS;
+#endif
+		}
+		middleStateSpeeds[0] = i_uLeft + static_cast<real>(2) * l_sqrt_g_hLeft - static_cast<real>(3) * sqrt_g_hMiddle;
+		middleStateSpeeds[1] = i_uRight - static_cast<real>(2) * l_sqrt_g_hRight + static_cast<real>(3) * sqrt_g_hMiddle;
+#if defined COUNTFLOPS
+		flops += 6 * COSTS_MULS + 2 * COSTS_SQRTS + 2 * COSTS_CMPS + 2 * COSTS_ADDS + 2 * COSTS_SUBS + 4 * COSTS_CMPS + COSTS_ORS;
+#endif
+		assert(hMiddle >= 0);
+	}
+#ifndef  VECTOR_NOVEC
+	inline void
+	computeMiddleState_SIMD (
+		const real_vector i_hLeft, const real_vector i_hRight,
+		const real_vector i_uLeft, const real_vector i_uRight,
+		const real_vector blendMask,
+		const int i_maxNumberOfNewtonIterations = 1
+	)
+	{
+		static const real_vector two = SETV_R(static_cast<real>(2));
+		static const real_vector three = SETV_R(static_cast<real>(3));
+		static const real_vector half_g = MULV(SETV_R(static_cast<real>(0.5)), g_v);
+		static const real_vector quarter_g = MULV(SETV_R(static_cast<real>(0.25)), g_v);
+		const real_vector newtonTol_v = SETV_R(newtonTol);
+		//set everything to zero
+		real_vector l_hMiddle = ZEROV_R();
+		real_vector l_middleStateSpeeds[2] = { ZEROV_R(), ZEROV_R() };
+		//compute local square roots
+		//(not necessarily the same ones as in computeNetUpdates!)
+		const real_vector l_sqrt_g_hRight = SQRTV(MULV(g_v, i_hRight));
+		const real_vector l_sqrt_g_hLeft = SQRTV(MULV(g_v, i_hLeft));
+		//single rarefaction in the case of a wet/dry interface
+		const real_vector DrySingleMask = BLENDV(ZEROV_R(), CMP_LT(i_hLeft, dryTol_v), blendMask);
+		const real_vector SingleDryMask = BLENDV(ZEROV_R(), ANDNOTV_R(DrySingleMask, CMP_LT(i_hRight, dryTol_v)), blendMask);
+		const real_vector Riemann_blendMask = BLENDV(ZEROV_R(), NOTV_R(ORV_R(DrySingleMask, SingleDryMask)), blendMask);
+		const real_vector speeds_dry_single = SUBV(i_uRight, MULV(two, l_sqrt_g_hRight));
+		riemannStructure_v = BLENDV_I(riemannStructure_v, DrySingleRarefaction_V, DrySingleMask);
+		const real_vector speeds_single_dry = ADDV(i_uLeft, MULV(two, l_sqrt_g_hLeft));
+		riemannStructure_v = BLENDV_I(riemannStructure_v, SingleRarefactionDry_V, SingleDryMask);
+		if (MOVEMASK(Riemann_blendMask) != 0) {
+			//determine the wave structure of the Riemann-problem
+			riemannStructure_v = determineRiemannStructure_SIMD(
+				i_hLeft, i_hRight,
+				i_uLeft, i_uRight,
+				Riemann_blendMask
+			);
+			//will be computed later
+			real_vector sqrt_g_hMiddle = ZEROV_R();
+			const real_vector ShockShockMask = CAST_INT_TO_REAL_V(riemannStructure_v);
+			if (MOVEMASK(ShockShockMask)) {
+				/* Compute All-Shock Riemann Solution
+				 * \cite[ch. 13.7]{leveque2002finite}
+				 */
+				real_vector iteration_variable = MINV(i_hLeft, i_hRight);
+				for (int i = 0; i < i_maxNumberOfNewtonIterations; ++i) {
+					const real_vector l_sqrtTermH[2] = {
+						SQRTV(
+							MULV(
+								half_g,
+								DIVV(
+									ADDV(iteration_variable, i_hLeft),
+									MULV(iteration_variable, i_hLeft)
+								)
+							)
+						),
+						SQRTV(
+							MULV(
+								half_g,
+								DIVV(
+									ADDV(iteration_variable, i_hRight),
+									MULV(iteration_variable, i_hRight)
+								)
+							)
+						)
+					};
+					const real_vector delta_h_left = SUBV(iteration_variable, i_hLeft);
+					const real_vector delta_h_right = SUBV(iteration_variable, i_hRight);
+					const real_vector phi = ADDV(
+						SUBV(i_uRight, i_uLeft),
+						ADDV(
+							MULV(delta_h_left, l_sqrtTermH[0]),
+							MULV(delta_h_right, l_sqrtTermH[1])
+						)
+					);
+#if defined COUNTFLOPS
+					flops += 2 * COSTS_SQRTV + 3 * COSTS_SUBV + COSTS_DIVV + 2 * COSTS_DIVV + 7 * COSTS_MULV + 4 * COSTS_ADDV + COSTS_FABS + COSTS_CMP_GT + COSTS_ANDV_R + COSTS_MOVEMASK;
+#endif
+					if (MOVEMASK(ANDV_R(ShockShockMask, CMP_GT(FABS(phi), newtonTol_v))) == 0) {
+						break;
+					}
+					const real_vector quarter_g_div_hMiddle_squared = DIVV(
+						quarter_g,
+						MULV(iteration_variable, iteration_variable)
+					);
+					const real_vector derivativePhi = SUBV(
+						ADDV(l_sqrtTermH[0], l_sqrtTermH[1]),
+						MULV(
+							quarter_g_div_hMiddle_squared,
+							ADDV(
+								DIVV(delta_h_left, l_sqrtTermH[0]),
+								DIVV(delta_h_right, l_sqrtTermH[1])
+							)
+						)
+					);
+					iteration_variable = SUBV(iteration_variable, DIVV(phi, derivativePhi));
+					assert(MOVEMASK(ORV_R(NOTV_R(ShockShockMask), CMP_GE(iteration_variable, dryTol_v))) == VECTOR_FULL_MASK);
+#if defined COUNTFLOPS
+					flops += 2 * COSTS_SUBV + 2 * COSTS_ADDV + COSTS_MULV + 3 * COSTS_DIVV;
+#endif
+				}
+				l_hMiddle = BLENDV(l_hMiddle, iteration_variable, ShockShockMask);
+#if defined COUNTFLOPS
+				flops += COSTS_MINV + 2 * COSTS_ZEROV_R + COSTS_BLENDV;
+#endif
+			}
+			const real_vector RarefactionRarefactionMask = CAST_INT_TO_REAL_V(SHIFT_LEFT(riemannStructure_v, 1));
+			if (MOVEMASK(RarefactionRarefactionMask)) {
+				/* Compute All-Rarefaction Riemann Solution
+				 * \cite[ch. 13.8.6]{leveque2002finite}
+				 */
+				static const real_vector one = SETV_R(static_cast<real>(1));
+				static const real_vector sixteen = SETV_R(static_cast<real>(16));
+				real_vector iteration_variable = MAXV(
+					ZEROV_R(),
+					ADDV(
+						SUBV(i_uLeft, i_uRight),
+						MULV(
+							two,
+							ADDV(l_sqrt_g_hLeft, l_sqrt_g_hRight)
+						)
+					)
+				);
+				iteration_variable = MULV(
+					DIVV(
+						one,
+						MULV(sixteen, g_v)
+					),
+					MULV(iteration_variable, iteration_variable)
+				);
+				l_hMiddle = BLENDV(l_hMiddle, iteration_variable, RarefactionRarefactionMask);
+#if defined COUNTFLOPS
+				flops += COSTS_MAXV + COSTS_ZEROV_R + 2 * COSTS_ADDV + COSTS_SUBV + 4 * COSTS_MULV + COSTS_DIVV + COSTS_BLENDV;
+#endif
+			}
+			sqrt_g_hMiddle = BLENDV(sqrt_g_hMiddle, SQRTV(MULV(g_v, l_hMiddle)), BLENDV(ZEROV_R(), ORV_R(ShockShockMask, RarefactionRarefactionMask), blendMask));
+			const real_vector ShockRarefactionMask = BLENDV(ZEROV_R(), CAST_INT_TO_REAL_V(SHIFT_LEFT(riemannStructure_v, 2)), blendMask);
+			const real_vector RarefactionShockMask = BLENDV(ZEROV_R(), CAST_INT_TO_REAL_V(SHIFT_LEFT(riemannStructure_v, 3)), blendMask);
+			const real_vector MixShockRarefactionMask = ORV_R(ShockRarefactionMask, RarefactionShockMask);
+			if (MOVEMASK(MixShockRarefactionMask)) {
+				/* Compute dam-break Riemann-solution
+				 * TODO: reference
+				 */
+				const real_vector hMin = BLENDV(i_hRight, i_hLeft, ShockRarefactionMask);
+				const real_vector hMax = BLENDV(i_hLeft, i_hRight, ShockRarefactionMask);
+				real_vector iteration_variable = hMin;
+				real_vector iteration_variable_sqrt = SQRTV(MULV(g_v, iteration_variable));
+				const real_vector sqrt_g_hMax = SQRTV(MULV(g_v, hMax));
+				for (int i = 0; i < i_maxNumberOfNewtonIterations; ++i) {
+					const real_vector sqrtTermHMin = SQRTV(
+						MULV(
+							half_g,
+							DIVV(
+								ADDV(iteration_variable, hMin),
+								MULV(iteration_variable, hMin)
+							)
+						)
+					);
+					const real_vector delta_h_min = SUBV(iteration_variable, hMin);
+					const real_vector phi = ADDV(
+						SUBV(i_uRight, i_uLeft),
+						ADDV(
+							MULV(delta_h_min, sqrtTermHMin),
+							MULV(
+								two,
+								SUBV(iteration_variable_sqrt, sqrt_g_hMax)
+							)
+						)
+					);
+					if (MOVEMASK(ANDV_R(MixShockRarefactionMask, CMP_GT(FABS(phi), newtonTol_v))) == 0) {
+						break;
+					}
+					const real_vector derivativePhi = ADDV(
+						SUBV(
+							sqrtTermHMin,
+							MULV(
+								quarter_g,
+								DIVV(
+									delta_h_min,
+									MULV(
+										MULV(iteration_variable, iteration_variable),
+										sqrtTermHMin
+									)
+								)
+							)
+						),
+						DIVV(sqrt_g_v, iteration_variable_sqrt)
+					);
+					iteration_variable = SUBV(iteration_variable, DIVV(phi, derivativePhi));
+#ifndef NDEBUG
+					assert(checkVector(BLENDV(ZEROV_R(), iteration_variable, MixShockRarefactionMask)));
+					assert(MOVEMASK(ANDV_R(MixShockRarefactionMask, CMP_LT(iteration_variable, hMin))) == 0);
+#endif
+					iteration_variable_sqrt = SQRTV(MULV(g_v, iteration_variable));
+#if defined COUNTFLOPS
+					flops += 4 * COSTS_ADDV + 5 * COSTS_SUBV + 8 * COSTS_MULV + 4 * COSTS_DIVV + 2 * COSTS_SQRTV;
+#endif
+				}
+				l_hMiddle = BLENDV(l_hMiddle, iteration_variable, MixShockRarefactionMask);
+				sqrt_g_hMiddle = BLENDV(sqrt_g_hMiddle, iteration_variable_sqrt, MixShockRarefactionMask);
+				assert(checkVector(BLENDV(ZEROV_R(), sqrt_g_hMiddle, blendMask)));
+#if defined COUNTFLOPS
+				flops += 4 * COSTS_BLENDV + 2 * COSTS_SQRTV + 2 * COSTS_MULV;
+#endif
+			}
+			const real_vector three_sqrt_g_hMiddle = MULV(three, sqrt_g_hMiddle);
+			l_middleStateSpeeds[0] = ADDV(
+				i_uLeft,
+				SUBV(
+					MULV(two, l_sqrt_g_hLeft),
+					three_sqrt_g_hMiddle
+				)
+			);
+			l_middleStateSpeeds[1] = ADDV(
+				SUBV(
+					i_uRight,
+					MULV(two, l_sqrt_g_hRight)
+				),
+				three_sqrt_g_hMiddle
+			);
+#if defined COUNTFLOPS
+			flops += COSTS_BLENDV + COSTS_NOTV_R + COSTS_ZEROV_R + COSTS_SQRTV + 4 * COSTS_MULV + COSTS_ADDV + 3 * COSTS_SUBV + 3 * COSTS_SHIFT_LEFT + COSTS_ANDV_R + COSTS_ORV_R + 3 * COSTS_MOVEMASK;
+#endif
+			assert(MOVEMASK(ORV_R(NOTV_R(blendMask), ORV_R(DrySingleMask, ORV_R(SingleDryMask, ORV_R(ShockShockMask, ORV_R(RarefactionRarefactionMask, MixShockRarefactionMask)))))) == VECTOR_FULL_MASK);
+		}
+		l_middleStateSpeeds[0] = BLENDV(l_middleStateSpeeds[0], speeds_dry_single, DrySingleMask);
+		l_middleStateSpeeds[1] = BLENDV(l_middleStateSpeeds[1], speeds_dry_single, DrySingleMask);
+		l_middleStateSpeeds[0] = BLENDV(l_middleStateSpeeds[0], speeds_single_dry, SingleDryMask);
+		l_middleStateSpeeds[1] = BLENDV(l_middleStateSpeeds[1], speeds_single_dry, SingleDryMask);
+		assert(checkVector(BLENDV(ZEROV_R(), l_middleStateSpeeds[0], blendMask)));	assert(checkVector(BLENDV(ZEROV_R(), l_middleStateSpeeds[1], blendMask)));
+		hMiddle_v = BLENDV(hMiddle_v, l_hMiddle, blendMask);
+		middleStateSpeeds_v[0] = BLENDV(middleStateSpeeds_v[0], l_middleStateSpeeds[0], blendMask);
+		middleStateSpeeds_v[1] = BLENDV(middleStateSpeeds_v[1], l_middleStateSpeeds[1], blendMask);
+		assert(MOVEMASK(BLENDV(ZEROV_R(), CMP_LT(hMiddle_v, ZEROV_R()), blendMask)) == 0);
+		assert(checkVector(hMiddle_v));
+		assert(checkVector(middleStateSpeeds_v[0]));	assert(checkVector(middleStateSpeeds_v[1]));
+#if defined COUNTFLOPS
+		flops += COSTS_SETV_R + 3 * COSTS_ZEROV_R + 2 * COSTS_SQRTV + 4 * COSTS_MULV + 2 * COSTS_CMP_LT + COSTS_ANDNOTV_R + COSTS_ORV_R + COSTS_NOTV_R + COSTS_MOVEMASK + COSTS_ADDV + COSTS_SUBV + 2 * COSTS_BLENDV_I + 7 * COSTS_BLENDV;
+#endif
+	}
+#endif /* VECTOR_NOVEC */
+	/**
+	 * Determine the Riemann-structure of a given problem.
+	 *   -> \cite[theorem 4.2]{george2006finite}, \cite[appendix B]{george2008augmented}
+	 *
+	 * @param i_hLeft height on the left side of the edge.
+	 * @param i_hRight height on the right side of the edge.
+	 * @param i_uLeft velocity on the left side of the edge.
+	 * @param i_uRight velocity on the right side of the edge.
+	 *
+	 * @return Riemann-structure of a given problem.
+	 */
+	inline integer
+	determineRiemannStructure (const real &i_hLeft, const real &i_hRight,
+			const real &i_uLeft, const real &i_uRight) const
+	{
+		real hMin = std::min(i_hLeft, i_hRight);
+		real hMax = std::max(i_hLeft, i_hRight);
+		real uDif = i_uRight - i_uLeft;
+#if defined COUNTFLOPS
+		flops += COSTS_MINS + COSTS_MAXS + COSTS_SUBS;
+#endif
+		if (0 <= static_cast<real>(2) * (std::sqrt(g * hMin) - std::sqrt(g * hMax)) + uDif) {
+#if defined COUNTFLOPS
+			flops += 3 * COSTS_MULS + 2 * COSTS_SQRTS + COSTS_ADDS + COSTS_SUBS + COSTS_CMPS;
+#endif
+			return RarefactionRarefaction;
+		} else if ((hMax - hMin) * std::sqrt(g * static_cast<real>(0.5) * (1 / hMax + 1 / hMin)) + uDif <= 0) {
+#if defined COUNTFLOPS
+			flops += 6 * COSTS_MULS + 3 * COSTS_SQRTS + 2 * COSTS_ADDS + 2 * COSTS_SUBS + 2 * COSTS_CMPS + COSTS_DIVS;
+#endif
+			return ShockShock;
+		} else if (i_hLeft < i_hRight) {
+#if defined COUNTFLOPS
+			flops += 6 * COSTS_MULS + 3 * COSTS_SQRTS + 2 * COSTS_ADDS + 2 * COSTS_SUBS + 3 * COSTS_CMPS + COSTS_DIVS;
+#endif
+			return ShockRarefaction;
+		} else {
+#if defined COUNTFLOPS
+			flops += 6 * COSTS_MULS + 3 * COSTS_SQRTS + 2 * COSTS_ADDS + 2 * COSTS_SUBS + 3 * COSTS_CMPS + COSTS_DIVS;
+#endif
+			return RarefactionShock;
+		}
+	}
+#ifndef  VECTOR_NOVEC
+	inline integer_vector
+	determineRiemannStructure_SIMD (
+		const real_vector i_hLeft, const real_vector i_hRight,
+		const real_vector i_uLeft, const real_vector i_uRight,
+		const real_vector blendMask
+	) const
+	{
+		static const real_vector two = SETV_R(static_cast<real>(2));
+		integer_vector l_riemannStructure = RarefactionShock_V;
+		const real_vector hMin = MINV(i_hLeft, i_hRight);
+		const real_vector hMax = MAXV(i_hLeft, i_hRight);
+		const real_vector uDif = SUBV(i_uLeft, i_uRight);
+		const real_vector condition_value_first = MULV(
+			two,
+			SUBV(
+				SQRTV(MULV(g_v, hMin)),
+				SQRTV(MULV(g_v, hMax))
+			)
+		);
+		const real_vector RarefactionRarefactionMask = BLENDV(ZEROV_R(), CMP_GE(condition_value_first, uDif), blendMask);
+		l_riemannStructure = BLENDV_I(l_riemannStructure, RarefactionRarefaction_V, RarefactionRarefactionMask);
+		if (MOVEMASK(ORV_R(NOTV_R(blendMask), RarefactionRarefactionMask)) != VECTOR_FULL_MASK) {
+			real_vector mask_accumulator = RarefactionRarefactionMask;
+			static const real_vector one = SETV_R(static_cast<real>(1));
+			// Note that the following line will cause errors, if the gravitational constant g changes during the simulation
+			static const real_vector half_g = MULV(g_v, SETV_R(static_cast<real>(0.5)));
+			const real_vector condition_value_second = MULV(
+				SUBV(hMax, hMin),
+				SQRTV(
+					MULV(
+						half_g,
+						ADDV(
+							DIVV(one, hMax),
+							DIVV(one, hMin)
+						)
+					)
+				)
+			);
+			const real_vector ShockShockMask = BLENDV(ZEROV_R(), ANDNOTV_R(mask_accumulator, CMP_LE(condition_value_second, uDif)), blendMask);
+			l_riemannStructure = BLENDV_I(l_riemannStructure, ShockShock_V, ShockShockMask);
+			mask_accumulator = ORV_R(mask_accumulator, ShockShockMask);
+			const real_vector ShockRarefactionMask = BLENDV(ZEROV_R(), ANDNOTV_R(mask_accumulator, CMP_LT(i_hLeft, i_hRight)), blendMask);
+			l_riemannStructure = BLENDV_I(l_riemannStructure, ShockRarefaction_V, ShockRarefactionMask);
+#if defined COUNTFLOPS
+			flops += 2 * COSTS_MULV + COSTS_SUBV + COSTS_SQRTV + COSTS_ADDV + 2 * COSTS_DIVV + 2 * COSTS_ANDNOTV_R + COSTS_CMP_LE + COSTS_CMP_LT + 2 * COSTS_BLENDV_I + COSTS_ORV_R + COSTS_ZEROV_R;
+#endif
+		}
+#if defined COUNTFLOPS
+		flops += COSTS_MINV + COSTS_MAXV + 2 * COSTS_SUBV + 3 * COSTS_MULV + 2 * COSTS_SQRTV + COSTS_CMP_GE + 2 * COSTS_BLENDV + COSTS_MOVEMASK;
+#endif
+		return BLENDV_I(riemannStructure_v, l_riemannStructure, blendMask);
+	}
+#endif /* VECTOR_NOVEC */
+	/**
+	 * Solve the linear equation:
+	 * A * x = b with A \in \mathbb{R}^{3\times3}, x,b \in \mathbb{R}^3
+	 *
+	 * @param i_matrix the matrix
+	 * @param i_b right hand side
+	 * @param o_x solution
+	 */
+	static inline void
+	solveLinearEquation (const real i_matrix[3][3],
+			const real i_b[3],
+			real o_x[3])
+	{
+		// compute inverse of 3x3 matrix
+		const real m[3][3] = {
+			{
+				i_matrix[1][1] * i_matrix[2][2] - i_matrix[1][2] * i_matrix[2][1],
+				i_matrix[0][2] * i_matrix[2][1] - i_matrix[0][1] * i_matrix[2][2],
+				i_matrix[0][1] * i_matrix[1][2] - i_matrix[0][2] * i_matrix[1][1]
+			},
+			{
+				i_matrix[1][2] * i_matrix[2][0] - i_matrix[1][0] * i_matrix[2][2],
+				i_matrix[0][0] * i_matrix[2][2] - i_matrix[0][2] * i_matrix[2][0],
+				i_matrix[0][2] * i_matrix[1][0] - i_matrix[0][0] * i_matrix[1][2]
+			},
+			{
+				i_matrix[1][0] * i_matrix[2][1] - i_matrix[1][1] * i_matrix[2][0],
+				i_matrix[0][1] * i_matrix[2][0] - i_matrix[0][0] * i_matrix[2][1],
+				i_matrix[0][0] * i_matrix[1][1] - i_matrix[0][1] * i_matrix[1][0]
+			}
+		};
+		real d = i_matrix[0][0] * m[0][0] + (i_matrix[0][1] * m[1][0] + i_matrix[0][2] * m[2][0]);
+		// m stores not really the inverse matrix, but the inverse multiplied by d
+		real s = 1 / d;
+		// compute m*i_b
+		o_x[0] = (m[0][0] * i_b[0] + (m[0][1] * i_b[1] + m[0][2] * i_b[2])) * s;
+		o_x[1] = (m[1][0] * i_b[0] + (m[1][1] * i_b[1] + m[1][2] * i_b[2])) * s;
+		o_x[2] = (m[2][0] * i_b[0] + (m[2][1] * i_b[1] + m[2][2] * i_b[2])) * s;
+	}
+#ifndef  VECTOR_NOVEC
+	inline void
+	solveLinearEquation_SIMD (
+		const real_vector i_matrix[3][3],
+		const real_vector i_b[3],
+		real_vector o_x[3]
+	) const
+	{
+		// Vectorization works, but results are different from file 5 up to the end
+		const real_vector m[3][3] = {
+			{
+				SUBV(MULV(i_matrix[1][1], i_matrix[2][2]), MULV(i_matrix[1][2], i_matrix[2][1])),
+				SUBV(MULV(i_matrix[0][2], i_matrix[2][1]), MULV(i_matrix[0][1], i_matrix[2][2])),
+				SUBV(MULV(i_matrix[0][1], i_matrix[1][2]), MULV(i_matrix[0][2], i_matrix[1][1]))
+			},
+			{
+				SUBV(MULV(i_matrix[1][2], i_matrix[2][0]), MULV(i_matrix[1][0], i_matrix[2][2])),
+				SUBV(MULV(i_matrix[0][0], i_matrix[2][2]), MULV(i_matrix[0][2], i_matrix[2][0])),
+				SUBV(MULV(i_matrix[0][2], i_matrix[1][0]), MULV(i_matrix[0][0], i_matrix[1][2]))
+			},
+			{
+				SUBV(MULV(i_matrix[1][0], i_matrix[2][1]), MULV(i_matrix[1][1], i_matrix[2][0])),
+				SUBV(MULV(i_matrix[0][1], i_matrix[2][0]), MULV(i_matrix[0][0], i_matrix[2][1])),
+				SUBV(MULV(i_matrix[0][0], i_matrix[1][1]), MULV(i_matrix[0][1], i_matrix[1][0]))
+			}
+		};
+/*
+ * The results of the following two versions of the same (!) statement, differ up to 20% in the momentum-updates
+ */
+#if 0
+		/*
+		 * original version -- not tested !!!
+		 */ 
+		const real_vector d = ADDV(
+			ADDV(
+				MULV(i_matrix[0][0], m[0][0]),
+				MULV(i_matrix[0][1], m[1][0]),
+			),
+			MULV(i_matrix[0][2], m[2][0])
+		);
+#else
+		/*
+		 * this version is tested, even though nobody knows whether it is the right one
+		 */ 
+		const real_vector d = ADDV(
+			MULV(i_matrix[0][0], m[0][0]),
+			ADDV(
+				MULV(i_matrix[0][1], m[1][0]),
+				MULV(i_matrix[0][2], m[2][0])
+			)
+		);
+#endif
+		static const real_vector one = SETV_R(static_cast<real>(1));
+		const real_vector s = DIVV(one, d);
+/*
+ * same as above
+ */ 
+#if 0
+		o_x[0] = MULV(
+			ADDV(
+				ADDV(
+					MULV(m[0][0], i_b[0]),
+					MULV(m[0][1], i_b[1]),
+				),
+				MULV(m[0][2], i_b[2])
+			),
+			s
+		);
+		o_x[1] = MULV(
+			ADDV(
+				ADDV(
+					MULV(m[1][0], i_b[0]),
+					MULV(m[1][1], i_b[1]),
+				),
+				MULV(m[1][2], i_b[2])
+			),
+			s
+		);
+		o_x[2] = MULV(
+			ADDV(
+				ADDV(
+					MULV(m[2][0], i_b[0]),
+					MULV(m[2][1], i_b[1]),
+				),
+				MULV(m[2][2], i_b[2])
+			),
+			s
+		);
+#else
+		o_x[0] = MULV(
+			ADDV(
+				MULV(m[0][0], i_b[0]),
+				ADDV(
+					MULV(m[0][1], i_b[1]),
+					MULV(m[0][2], i_b[2])
+				)
+			),
+			s
+		);
+		o_x[1] = MULV(
+			ADDV(
+				MULV(m[1][0], i_b[0]),
+				ADDV(
+					MULV(m[1][1], i_b[1]),
+					MULV(m[1][2], i_b[2])
+				)
+			),
+			s
+		);
+		o_x[2] = MULV(
+			ADDV(
+				MULV(m[2][0], i_b[0]),
+				ADDV(
+					MULV(m[2][1], i_b[1]),
+					MULV(m[2][2], i_b[2])
+				)
+			),
+			s
+		);
+#endif
+#if defined COUNTFLOPS
+			flops += 9 * COSTS_SUBV + COSTS_DIVV + 33 * COSTS_MULV + 8 * COSTS_ADDV;
+#endif
+	}
+#endif /* VECTOR_NOVEC */
+};
+#endif /* AUGRIE_SIMD_HPP_ */
--- a/src/solver/SIMD_COSTS.hpp
+++ b/src/solver/SIMD_COSTS.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_COSTS.hpp
+ *
+ * \brief Contains latency costs for the intrinsics used for the vectorization
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+#pragma once
+#ifndef  SIMD_COSTS_H
+#define  SIMD_COSTS_H
+#ifdef COUNTFLOPS
+#ifndef  SIMD_TYPES_H
+	#error "SIMD_COSTS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
+#endif /* defined SIMD_TYPES_H */
+#define COSTS_ADDS		1
+#define COSTS_SUBS		1
+#define COSTS_MULS		1
+#define COSTS_DIVS		1
+#define COSTS_SQRTS		1
+#define COSTS_MAXS		1
+#define COSTS_MINS		1
+#define COSTS_CMPS		0
+#define COSTS_ANDS		0
+#define COSTS_ORS		0
+#define COSTS_FABSS		1
+#if defined VECTOR_SSE4_FLOAT32
+	#define COSTS_ADDV		4
+	#define COSTS_SUBV		4
+	#define COSTS_MULV		4
+	#define COSTS_DIVV		4
+	#define COSTS_SQRTV		4
+	#define COSTS_LOADU		0
+	#define COSTS_STOREU		0
+	#define COSTS_SETV_R		0
+	#define COSTS_SETV_I		0
+	#define COSTS_ZEROV_R		0
+	#define COSTS_ZEROV_I		0
+	#define COSTS_MAXV		4
+	#define COSTS_MINV		4
+	#define COSTS_CMP_LT		0
+	#define COSTS_CMP_LE		0
+	#define COSTS_CMP_GT		0
+	#define COSTS_CMP_GE		0
+	#define COSTS_CMP_EQ_I		0
+	#define COSTS_ANDV_R		0
+	#define COSTS_ORV_R		0
+	#define COSTS_XORV_R		0
+	#define COSTS_ORV_I		0
+	#define COSTS_ANDNOTV_R		0
+	#define COSTS_BLENDV		0
+	#define COSTS_BLENDV_I		0
+	#define COSTS_MOVEMASK		0
+	#define COSTS_SHIFT_LEFT	0
+	#define COSTS_FABS		4
+	#define COSTS_NOTV_R		0
+	#define COSTS_NOTV_I		0
+#elif defined VECTOR_SSE4_FLOAT64
+	#error "SSE4 with double precision not implemented at the moment"
+#elif defined VECTOR_AVX_FLOAT32
+	#define COSTS_ADDV		8
+	#define COSTS_SUBV		8
+	#define COSTS_MULV		8
+	#define COSTS_DIVV		8
+	#define COSTS_SQRTV		8
+	#define COSTS_LOADU		0
+	#define COSTS_STOREU		0
+	#define COSTS_SETV_R		0
+	#define COSTS_SETV_I		0
+	#define COSTS_ZEROV_R		0
+	#define COSTS_ZEROV_I		0
+	#define COSTS_MAXV		8
+	#define COSTS_MINV		8
+	#define COSTS_CMP_LT		0
+	#define COSTS_CMP_LE		0
+	#define COSTS_CMP_GT		0
+	#define COSTS_CMP_GE		0
+	#define COSTS_CMP_EQ_I		0
+	#define COSTS_ANDV_R		0
+	#define COSTS_ORV_R		0
+	#define COSTS_XORV_R		0
+	#define COSTS_ORV_I		0
+	#define COSTS_ANDNOTV_R		0
+	#define COSTS_BLENDV		0
+	#define COSTS_BLENDV_I		0
+	#define COSTS_MOVEMASK		0
+	#define COSTS_SHIFT_LEFT	0
+	#define COSTS_FABS		8
+	#define COSTS_NOTV_R		0
+	#define COSTS_NOTV_I		0
+#elif defined VECTOR_AVX_FLOAT64
+	#error "AVX with double precision not implemented at the moment"
+#else /* no vectorization type defined */
+	#pragma message "SIMD-Costs included, but no Vector-Type defined."
+#endif
+#endif /* not defined COUNTFLOPS */
+#endif /* #ifndef SIMD_COSTS_H */ 
--- a/src/solver/SIMD_DEFINITIONS.hpp
+++ b/src/solver/SIMD_DEFINITIONS.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_DEFINITIONS.hpp
+ *
+ * \brief Contains macro definitions for the intrinsics used for the vectorization
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+#pragma once
+#ifndef  SIMD_DEFINITIONS_H
+#define  SIMD_DEFINITIONS_H
+#include <cmath>
+#include <limits>
+/*
+ * Check whether the file SIMD_TYPES.hpp has been included.
+ * This file (SIMD_DEFINITIONS.hpp) needs some macros to be set properly by that file (SIMD_TYPES.hpp).
+ */ 
+#ifndef  SIMD_TYPES_H
+	#error "SIMD_DEFINITIONS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
+#endif /* defined SIMD_TYPES_H */
+#if defined VECTOR_SSE4_FLOAT32
+/*
+ * Map single precision SSE intrinsics
+ */ 
+	#define ADDV _mm_add_ps
+	#define SUBV _mm_sub_ps
+	#define MULV _mm_mul_ps
+	#define DIVV _mm_div_ps
+	#define SQRTV _mm_sqrt_ps
+	#define LOADU _mm_loadu_ps
+	#define STOREU _mm_storeu_ps
+	#define SETV_R _mm_set1_ps
+	#define SETV_I _mm_set1_epi32
+	#define ZEROV_R _mm_setzero_ps
+	#define ZEROV_I _mm_setzero_si128
+	#define MAXV _mm_max_ps
+	#define MINV _mm_min_ps
+	#define CMP_LT _mm_cmplt_ps
+	#define CMP_LE _mm_cmple_ps
+	#define CMP_GT _mm_cmpgt_ps
+	#define CMP_GE _mm_cmpge_ps
+	#define CMP_EQ_I _mm_cmpeq_epi32
+	#define CMP_EQ_R _mm_cmpeq_ps
+	#define ANDV_R _mm_and_ps
+	#define ORV_R _mm_or_ps
+	#define XORV_R _mm_xor_ps
+	#define ORV_I _mm_or_si128
+	#define NOTV_R not_ps
+	#define NOTV_I not_si128
+	#define ANDNOTV_R _mm_andnot_ps
+	#define BLENDV _mm_blendv_ps
+	#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
+	#define MOVEMASK _mm_movemask_ps
+	#define SHIFT_LEFT _mm_slli_epi32
+	#define CAST_INT_TO_REAL_V _mm_castsi128_ps
+	#define CAST_REAL_TO_INT_V _mm_castps_si128
+	#define FABS fabs_ps
+	/*
+	 * Compute the absolute value of a vector by forcing the sign bit to be zero
+	 */ 
+	inline __m128 fabs_ps(const __m128 x) {
+		static const __m128 sign_mask = CAST_INT_TO_REAL_V(_mm_set1_epi32(1 << 31));
+		return _mm_andnot_ps(sign_mask, x);
+	}
+	/*
+	 * Bitwise NOT operation for integers
+	 */ 
+	inline __m128i not_si128(const __m128i x) {
+		static const __m128i mask = _mm_set1_epi32(~0);
+		return CAST_REAL_TO_INT_V(_mm_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
+	}
+	/*
+	 * Bitwise NOT operation for reals
+	 */ 
+	inline __m128 not_ps(const __m128 x) {
+		static const __m128i mask = _mm_set1_epi32(~0);
+		return _mm_xor_ps(CAST_INT_TO_REAL_V(mask), x);
+	}
+	/*
+	 * Check, whether a real_vector contains infinity or NaN
+	 */ 
+	inline bool checkVector (const __m128 x) {
+		static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
+		return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
+	}
+#elif defined VECTOR_SSE4_FLOAT64
+/*
+ * Map double precision SSE intrinsics
+ */ 
+	#error "SSE4 with double precision not implemented at the moment"
+#elif defined VECTOR_AVX_FLOAT32
+/*
+ * Map single precision AVX intrinsics
+ */ 
+	#define ADDV _mm256_add_ps
+	#define SUBV _mm256_sub_ps
+	#define MULV _mm256_mul_ps
+	#define DIVV _mm256_div_ps
+	#define SQRTV _mm256_sqrt_ps
+	#define LOADU _mm256_loadu_ps
+	#define STOREU _mm256_storeu_ps
+	#define SETV_R _mm256_set1_ps
+	#define SETV_I _mm256_set1_epi32
+	#define ZEROV_R _mm256_setzero_ps
+	#define ZEROV_I _mm256_setzero_si256
+	#define MAXV _mm256_max_ps
+	#define MINV _mm256_min_ps
+	#define CMP_LT(x, y) _mm256_cmp_ps((x), (y), _CMP_LT_OS)
+	#define CMP_LE(x, y) _mm256_cmp_ps((x), (y), _CMP_LE_OS)
+	#define CMP_GT(x, y) _mm256_cmp_ps((x), (y), _CMP_GT_OS)
+	#define CMP_GE(x, y) _mm256_cmp_ps((x), (y), _CMP_GE_OS)
+	#define CMP_EQ_R(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OS)
+	/*
+	 * Define test for equality of integers
+	 * Replace with
+	 * 
+	 * #define CMP_EQ_I(x, y) _mm256_cmpeq_epi32((x), (y))
+	 * 
+	 * when running with AVX2
+	 */ 
+	static inline __m256i CMP_EQ_I(const __m256i a, const __m256i b)
+	{
+		__m256i out = ZEROV_I();
+		const integer* const p = reinterpret_cast<const integer*>(&a);
+		const integer* const q = reinterpret_cast<const integer*>(&b);
+		integer* const r = reinterpret_cast<integer*>(&out);
+		for (int i = 0; i < VECTOR_LENGTH; ++i) {
+			r[i] = p[i] == q[i] ? 0xFFFFFFFF : 0;
+		}
+		return out;
+	}
+	#define ANDV_R _mm256_and_ps
+	#define ORV_R _mm256_or_ps
+	#define XORV_R _mm256_xor_ps
+	#define ORV_I(x, y) CAST_REAL_TO_INT_V(_mm256_or_ps(CAST_INT_TO_REAL_V(x), CAST_INT_TO_REAL_V(y)))
+	#define NOTV_R not_ps
+	#define NOTV_I not_si256
+	#define ANDNOTV_R _mm256_andnot_ps
+	#define BLENDV _mm256_blendv_ps
+	#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm256_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
+	#define MOVEMASK _mm256_movemask_ps
+	/*
+	 * Define left shifting for integers
+	 * Replace with
+	 * 
+	 * #define SHIFT_LEFT(x, y) _mm256_slli_epi32((x), (y))
+	 * 
+	 * when running with AVX2
+	 */ 
+	static inline __m256i SHIFT_LEFT(const __m256i x, const integer y)
+	{
+		__m256i out = ZEROV_I();
+		const integer* const p = reinterpret_cast<const integer*>(&x);
+		integer* const q = reinterpret_cast<integer*>(&out);
+		for (int i = 0; i < VECTOR_LENGTH; ++i) {
+			q[i] = p[i] << y;
+		}
+		return out;
+	}
+	#define CAST_INT_TO_REAL_V _mm256_castsi256_ps
+	#define CAST_REAL_TO_INT_V _mm256_castps_si256
+	#define FABS fabs_ps
+	/*
+	 * Compute the absolute value of a vector by forcing the sign bit to be zero
+	 */
+	inline __m256 fabs_ps(const __m256 x) {
+		static const __m256 sign_mask = CAST_INT_TO_REAL_V(_mm256_set1_epi32(1 << 31));
+		return _mm256_andnot_ps(sign_mask, x);
+	}
+	/*
+	 * Bitwise NOT operation for integers
+	 */
+	inline __m256i not_si256(const __m256i x) {
+		static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
+		return CAST_REAL_TO_INT_V(_mm256_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
+	}
+	/*
+	 * Bitwise NOT operation for reals
+	 */
+	inline __m256 not_ps(const __m256 x) {
+		static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
+		return _mm256_xor_ps(CAST_INT_TO_REAL_V(mask), x);
+	}
+	/*
+	 * Check, whether a real_vector contains infinity or NaN
+	 */ 
+	inline bool checkVector (const __m256 x) {
+		static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
+		return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
+	}
+#elif defined VECTOR_AVX_FLOAT64
+/*
+ * Map double precision AVX intrinsics
+ */ 
+	#error "AVX with double precision not implemented at the moment"
+#else /* no vectorization type defined */
+/*
+ * No vectorization demanded.
+ * Do nothing, but inform the user
+ */ 
+	#pragma message "SIMD-Definitions included, but no Vector-Type defined."
+#endif
+#endif /* #ifndef SIMD_DEFINITIONS_H */ 
--- a/src/solver/SIMD_TYPES.hpp
+++ b/src/solver/SIMD_TYPES.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_TYPES.hpp
+ *
+ * \brief Defines the length of the vectors and the corresponding functions
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+#pragma once
+#ifndef  SIMD_TYPES_H
+#define  SIMD_TYPES_H
+/*
+ * Check, whether the function definitions are already included.
+ * If yes, this denotes an error.
+ * 
+ * The SIMD_DEFINITIONS.hpp-file needs the control macros set in this file to work properly.
+ */ 
+#ifdef  SIMD_DEFINITIONS_H
+	#error "SIMD Definitions already included! Never include that file directly! Include it only via including SIMD_TYPES (this file!)"
+#endif /* defined SIMD_DEFINITIONS_H */
+/*
+ * Check, whether a solver is chosen, that uses the macros in this file.
+ */ 
+#if not WAVE_PROPAGATION_SOLVER == 5
+	#pragma message "SIMD macros included but non-vectorized solver specified"
+#endif /* not WAVE_PROPAGATION_SOLVER == 5  */
+/*
+ * Care about precision.
+ * Use single precision as default.
+ * 
+ * Additionally, declare the macro SHIFT_SIGN_RIGHT to work properly with 32 and 64 bit.
+ * Note the INTENTIONALLY FORGOTTEN semicolon at the end of the SHIFT_SIGN_RIGHT-definitions.
+ * This forces the user to write the semicolon himself!
+ */ 
+#if defined FLOAT64
+	#pragma message "Using double as type for real numbers"
+	typedef double real;
+	#pragma message "Using unsigned long long as type for integer numbers"
+	typedef unsigned long long integer;
+	#define SHIFT_SIGN_RIGHT(x) static_cast<integer>(static_cast<integer>(1) << (static_cast<integer>(64) - static_cast<integer>(x)))
+#else /* not defined FLOAT64  */
+	#pragma message "Using float as type for real numbers"
+	typedef float real;
+	#pragma message "Using unsigned int as type for integer numbers"
+	typedef unsigned int integer;
+	#define SHIFT_SIGN_RIGHT(x) (1 << (32 - static_cast<integer>(x)))
+#endif /* not defined FLOAT64  */
+/*
+ * Set control macros
+ * 
+ * Declare the vector length
+ * Additionally declare a configuration specific macro of the form
+ * 	VECTOR_extension_precision
+ * 
+ * Moreover, declare an integer, representing the number,
+ * which is returned by the instruction MOVEMASK, if the instruction is called on a vector with ALL SIGN BITS set
+ * Use is as
+ * 
+ * 	const real_vector vector = CONDITION(operand_a, operand_b);
+ * 
+ * 	if (MOVEMASK(vector) == VECTOR_FULL_MASK) {
+ *		// all components fullfill the condition
+ *	} else {
+ *		// some components do not fullfill the condition
+ *	} 
+ */ 
+#if  (defined __SSE4_1__ and not defined __AVX__) or (defined __AVX__ and defined AVX128)
+	#pragma message "Using SSE4.1 for vectorization"
+	#include <smmintrin.h>
+	typedef __m128i integer_vector;
+	#if  defined FLOAT64
+		#define VECTOR_LENGTH 2
+		#define VECTOR_SSE4_FLOAT64
+		#define VECTOR_FULL_MASK 0x00000003
+		typedef __m128d real_vector;
+		#pragma message "Using vectors of 2 doubles"
+	#else /* not defined FLOAT64  */
+		#define VECTOR_LENGTH 4
+		#define VECTOR_SSE4_FLOAT32
+		#define VECTOR_FULL_MASK 0x0000000F
+		typedef __m128 real_vector;
+		#pragma message "Using vectors of 4 floats"
+	#endif /* not defined FLOAT64 */
+#elif  defined __AVX__
+	#pragma message "Using AVX for vectorization"
+	#include <immintrin.h>
+	typedef __m256i integer_vector;
+	#if  defined FLOAT64
+		#define VECTOR_LENGTH 4
+		#define VECTOR_AVX_FLOAT64
+		#define VECTOR_FULL_MASK 0x0000000F
+		typedef __m256d real_vector;
+		#pragma message "Using vectors of 4 doubles"
+	#else /* not defined FLOAT64  */
+		#define VECTOR_LENGTH 8
+		#define VECTOR_AVX_FLOAT32
+		#define VECTOR_FULL_MASK 0x000000FF
+		typedef __m256 real_vector;
+		#pragma message "Using vectors of 8 floats"
+	#endif /* not defined FLOAT64  */
+#else /* not defined __SSE4__ and not defined __AVX__  */
+	#pragma message "Using no vectorization at all"
+	#define VECTOR_LENGTH 1
+	#define VECTOR_NOVEC
+#endif /* not defined __SSE4__ and not defined __AVX__ */
+/*
+ * Control macros are set
+ * 
+ * Include the function macros
+ */ 
+#include "SIMD_DEFINITIONS.hpp"
+/*
+ * Include the cost macros if flop counting is demanded
+ */		
+#if defined COUNTFLOPS
+	#include "SIMD_COSTS.hpp"
+#endif /* defined COUNTFLOPS  */
+#endif /* #ifndef SIMD_TYPES_H */