Added SIMD files

304d4698 · hoelzlw · a79ebaa7 · 304d4698 · 304d4698 · 304d4698
Commit 304d4698 authored Jul 14, 2013 by hoelzlw
Showing with 2555 additions and 0 deletions

AugRie_SIMD.hpp src/solver/AugRie_SIMD.hpp +2056 -0

SIMD_COSTS.hpp src/solver/SIMD_COSTS.hpp +123 -0

SIMD_DEFINITIONS.hpp src/solver/SIMD_DEFINITIONS.hpp +239 -0

SIMD_TYPES.hpp src/solver/SIMD_TYPES.hpp +137 -0

No files found.
--- a/src/solver/AugRie_SIMD.hpp
+++ b/src/solver/AugRie_SIMD.hpp
--- a/src/solver/SIMD_COSTS.hpp
+++ b/src/solver/SIMD_COSTS.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_COSTS.hpp
+ *
+ * \brief Contains latency costs for the intrinsics used for the vectorization
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+
+#pragma once
+
+#ifndef  SIMD_COSTS_H
+#define  SIMD_COSTS_H
+
+#ifdef COUNTFLOPS
+
+#ifndef  SIMD_TYPES_H
+	#error "SIMD_COSTS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
+#endif /* defined SIMD_TYPES_H */
+
+#define COSTS_ADDS		1
+#define COSTS_SUBS		1
+#define COSTS_MULS		1
+#define COSTS_DIVS		1
+#define COSTS_SQRTS		1
+
+#define COSTS_MAXS		1
+#define COSTS_MINS		1
+
+#define COSTS_CMPS		0
+
+#define COSTS_ANDS		0
+#define COSTS_ORS		0
+
+#define COSTS_FABSS		1
+
+#if defined VECTOR_SSE4_FLOAT32
+	#define COSTS_ADDV		4
+	#define COSTS_SUBV		4
+	#define COSTS_MULV		4
+	#define COSTS_DIVV		4
+	#define COSTS_SQRTV		4
+
+	#define COSTS_LOADU		0
+	#define COSTS_STOREU		0
+	#define COSTS_SETV_R		0
+	#define COSTS_SETV_I		0
+	#define COSTS_ZEROV_R		0
+	#define COSTS_ZEROV_I		0
+
+	#define COSTS_MAXV		4
+	#define COSTS_MINV		4
+
+	#define COSTS_CMP_LT		0
+	#define COSTS_CMP_LE		0
+	#define COSTS_CMP_GT		0
+	#define COSTS_CMP_GE		0
+
+	#define COSTS_CMP_EQ_I		0
+
+	#define COSTS_ANDV_R		0
+	#define COSTS_ORV_R		0
+	#define COSTS_XORV_R		0
+	#define COSTS_ORV_I		0
+	#define COSTS_ANDNOTV_R		0
+
+	#define COSTS_BLENDV		0
+	#define COSTS_BLENDV_I		0
+	#define COSTS_MOVEMASK		0
+	#define COSTS_SHIFT_LEFT	0
+
+	#define COSTS_FABS		4
+	#define COSTS_NOTV_R		0
+	#define COSTS_NOTV_I		0
+#elif defined VECTOR_SSE4_FLOAT64
+	#error "SSE4 with double precision not implemented at the moment"
+#elif defined VECTOR_AVX_FLOAT32
+	#define COSTS_ADDV		8
+	#define COSTS_SUBV		8
+	#define COSTS_MULV		8
+	#define COSTS_DIVV		8
+	#define COSTS_SQRTV		8
+
+	#define COSTS_LOADU		0
+	#define COSTS_STOREU		0
+	#define COSTS_SETV_R		0
+	#define COSTS_SETV_I		0
+	#define COSTS_ZEROV_R		0
+	#define COSTS_ZEROV_I		0
+
+	#define COSTS_MAXV		8
+	#define COSTS_MINV		8
+
+	#define COSTS_CMP_LT		0
+	#define COSTS_CMP_LE		0
+	#define COSTS_CMP_GT		0
+	#define COSTS_CMP_GE		0
+	#define COSTS_CMP_EQ_I		0
+
+	#define COSTS_ANDV_R		0
+	#define COSTS_ORV_R		0
+	#define COSTS_XORV_R		0
+	#define COSTS_ORV_I		0
+	#define COSTS_ANDNOTV_R		0
+
+	#define COSTS_BLENDV		0
+	#define COSTS_BLENDV_I		0
+	#define COSTS_MOVEMASK		0
+	#define COSTS_SHIFT_LEFT	0
+
+	#define COSTS_FABS		8
+	#define COSTS_NOTV_R		0
+	#define COSTS_NOTV_I		0
+#elif defined VECTOR_AVX_FLOAT64
+	#error "AVX with double precision not implemented at the moment"
+#else /* no vectorization type defined */
+	#pragma message "SIMD-Costs included, but no Vector-Type defined."
+#endif
+
+#endif /* not defined COUNTFLOPS */
+
+#endif /* #ifndef SIMD_COSTS_H */ 
--- a/src/solver/SIMD_DEFINITIONS.hpp
+++ b/src/solver/SIMD_DEFINITIONS.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_DEFINITIONS.hpp
+ *
+ * \brief Contains macro definitions for the intrinsics used for the vectorization
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+
+#pragma once
+
+#ifndef  SIMD_DEFINITIONS_H
+#define  SIMD_DEFINITIONS_H
+
+#include <cmath>
+#include <limits>
+
+/*
+ * Check whether the file SIMD_TYPES.hpp has been included.
+ * This file (SIMD_DEFINITIONS.hpp) needs some macros to be set properly by that file (SIMD_TYPES.hpp).
+ */ 
+#ifndef  SIMD_TYPES_H
+	#error "SIMD_DEFINITIONS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
+#endif /* defined SIMD_TYPES_H */
+
+#if defined VECTOR_SSE4_FLOAT32
+/*
+ * Map single precision SSE intrinsics
+ */ 
+	#define ADDV _mm_add_ps
+	#define SUBV _mm_sub_ps
+	#define MULV _mm_mul_ps
+	#define DIVV _mm_div_ps
+	#define SQRTV _mm_sqrt_ps
+
+	#define LOADU _mm_loadu_ps
+	#define STOREU _mm_storeu_ps
+	#define SETV_R _mm_set1_ps
+	#define SETV_I _mm_set1_epi32
+	#define ZEROV_R _mm_setzero_ps
+	#define ZEROV_I _mm_setzero_si128
+
+	#define MAXV _mm_max_ps
+	#define MINV _mm_min_ps
+
+	#define CMP_LT _mm_cmplt_ps
+	#define CMP_LE _mm_cmple_ps
+	#define CMP_GT _mm_cmpgt_ps
+	#define CMP_GE _mm_cmpge_ps
+
+	#define CMP_EQ_I _mm_cmpeq_epi32
+	#define CMP_EQ_R _mm_cmpeq_ps
+
+	#define ANDV_R _mm_and_ps
+	#define ORV_R _mm_or_ps
+	#define XORV_R _mm_xor_ps
+	#define ORV_I _mm_or_si128
+	#define NOTV_R not_ps
+	#define NOTV_I not_si128
+	#define ANDNOTV_R _mm_andnot_ps
+
+	#define BLENDV _mm_blendv_ps
+	#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
+	#define MOVEMASK _mm_movemask_ps
+	#define SHIFT_LEFT _mm_slli_epi32
+
+	#define CAST_INT_TO_REAL_V _mm_castsi128_ps
+	#define CAST_REAL_TO_INT_V _mm_castps_si128
+	#define FABS fabs_ps
+
+	/*
+	 * Compute the absolute value of a vector by forcing the sign bit to be zero
+	 */ 
+	inline __m128 fabs_ps(const __m128 x) {
+		static const __m128 sign_mask = CAST_INT_TO_REAL_V(_mm_set1_epi32(1 << 31));
+		return _mm_andnot_ps(sign_mask, x);
+	}
+	
+	/*
+	 * Bitwise NOT operation for integers
+	 */ 
+	inline __m128i not_si128(const __m128i x) {
+		static const __m128i mask = _mm_set1_epi32(~0);
+		return CAST_REAL_TO_INT_V(_mm_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
+	}
+	
+	/*
+	 * Bitwise NOT operation for reals
+	 */ 
+	inline __m128 not_ps(const __m128 x) {
+		static const __m128i mask = _mm_set1_epi32(~0);
+		return _mm_xor_ps(CAST_INT_TO_REAL_V(mask), x);
+	}
+	
+	/*
+	 * Check, whether a real_vector contains infinity or NaN
+	 */ 
+	inline bool checkVector (const __m128 x) {
+		static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
+		return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
+	}
+#elif defined VECTOR_SSE4_FLOAT64
+/*
+ * Map double precision SSE intrinsics
+ */ 
+	#error "SSE4 with double precision not implemented at the moment"
+#elif defined VECTOR_AVX_FLOAT32
+/*
+ * Map single precision AVX intrinsics
+ */ 
+	#define ADDV _mm256_add_ps
+	#define SUBV _mm256_sub_ps
+	#define MULV _mm256_mul_ps
+	#define DIVV _mm256_div_ps
+	#define SQRTV _mm256_sqrt_ps
+
+	#define LOADU _mm256_loadu_ps
+	#define STOREU _mm256_storeu_ps
+	#define SETV_R _mm256_set1_ps
+	#define SETV_I _mm256_set1_epi32
+	#define ZEROV_R _mm256_setzero_ps
+	#define ZEROV_I _mm256_setzero_si256
+
+	#define MAXV _mm256_max_ps
+	#define MINV _mm256_min_ps
+
+	#define CMP_LT(x, y) _mm256_cmp_ps((x), (y), _CMP_LT_OS)
+	#define CMP_LE(x, y) _mm256_cmp_ps((x), (y), _CMP_LE_OS)
+	#define CMP_GT(x, y) _mm256_cmp_ps((x), (y), _CMP_GT_OS)
+	#define CMP_GE(x, y) _mm256_cmp_ps((x), (y), _CMP_GE_OS)
+
+	#define CMP_EQ_R(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OS)
+	
+	/*
+	 * Define test for equality of integers
+	 * Replace with
+	 * 
+	 * #define CMP_EQ_I(x, y) _mm256_cmpeq_epi32((x), (y))
+	 * 
+	 * when running with AVX2
+	 */ 
+	static inline __m256i CMP_EQ_I(const __m256i a, const __m256i b)
+	{
+		__m256i out = ZEROV_I();
+		const integer* const p = reinterpret_cast<const integer*>(&a);
+		const integer* const q = reinterpret_cast<const integer*>(&b);
+		integer* const r = reinterpret_cast<integer*>(&out);
+
+		for (int i = 0; i < VECTOR_LENGTH; ++i) {
+			r[i] = p[i] == q[i] ? 0xFFFFFFFF : 0;
+		}
+
+		return out;
+	}
+
+	#define ANDV_R _mm256_and_ps
+	#define ORV_R _mm256_or_ps
+	#define XORV_R _mm256_xor_ps
+	#define ORV_I(x, y) CAST_REAL_TO_INT_V(_mm256_or_ps(CAST_INT_TO_REAL_V(x), CAST_INT_TO_REAL_V(y)))
+
+	#define NOTV_R not_ps
+	#define NOTV_I not_si256
+	#define ANDNOTV_R _mm256_andnot_ps
+
+	#define BLENDV _mm256_blendv_ps
+	#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm256_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
+	#define MOVEMASK _mm256_movemask_ps
+	
+	/*
+	 * Define left shifting for integers
+	 * Replace with
+	 * 
+	 * #define SHIFT_LEFT(x, y) _mm256_slli_epi32((x), (y))
+	 * 
+	 * when running with AVX2
+	 */ 
+	static inline __m256i SHIFT_LEFT(const __m256i x, const integer y)
+	{
+		__m256i out = ZEROV_I();
+		const integer* const p = reinterpret_cast<const integer*>(&x);
+		integer* const q = reinterpret_cast<integer*>(&out);
+
+		for (int i = 0; i < VECTOR_LENGTH; ++i) {
+			q[i] = p[i] << y;
+		}
+
+		return out;
+	}
+
+	#define CAST_INT_TO_REAL_V _mm256_castsi256_ps
+	#define CAST_REAL_TO_INT_V _mm256_castps_si256
+	#define FABS fabs_ps
+
+	/*
+	 * Compute the absolute value of a vector by forcing the sign bit to be zero
+	 */
+	inline __m256 fabs_ps(const __m256 x) {
+		static const __m256 sign_mask = CAST_INT_TO_REAL_V(_mm256_set1_epi32(1 << 31));
+		return _mm256_andnot_ps(sign_mask, x);
+	}
+	
+	/*
+	 * Bitwise NOT operation for integers
+	 */
+	inline __m256i not_si256(const __m256i x) {
+		static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
+		return CAST_REAL_TO_INT_V(_mm256_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
+	}
+	
+	/*
+	 * Bitwise NOT operation for reals
+	 */
+	inline __m256 not_ps(const __m256 x) {
+		static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
+		return _mm256_xor_ps(CAST_INT_TO_REAL_V(mask), x);
+	}
+	
+	/*
+	 * Check, whether a real_vector contains infinity or NaN
+	 */ 
+	inline bool checkVector (const __m256 x) {
+		static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
+		return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
+	}
+#elif defined VECTOR_AVX_FLOAT64
+/*
+ * Map double precision AVX intrinsics
+ */ 
+	#error "AVX with double precision not implemented at the moment"
+#else /* no vectorization type defined */
+/*
+ * No vectorization demanded.
+ * Do nothing, but inform the user
+ */ 
+	#pragma message "SIMD-Definitions included, but no Vector-Type defined."
+#endif
+
+#endif /* #ifndef SIMD_DEFINITIONS_H */ 
--- a/src/solver/SIMD_TYPES.hpp
+++ b/src/solver/SIMD_TYPES.hpp
+/***********************************************************************************//**
+ *
+ * \file SIMD_TYPES.hpp
+ *
+ * \brief Defines the length of the vectors and the corresponding functions
+ *
+ * \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
+ *
+ **************************************************************************************/
+
+#pragma once
+
+#ifndef  SIMD_TYPES_H
+#define  SIMD_TYPES_H
+
+/*
+ * Check, whether the function definitions are already included.
+ * If yes, this denotes an error.
+ * 
+ * The SIMD_DEFINITIONS.hpp-file needs the control macros set in this file to work properly.
+ */ 
+#ifdef  SIMD_DEFINITIONS_H
+	#error "SIMD Definitions already included! Never include that file directly! Include it only via including SIMD_TYPES (this file!)"
+#endif /* defined SIMD_DEFINITIONS_H */
+
+/*
+ * Check, whether a solver is chosen, that uses the macros in this file.
+ */ 
+#if not WAVE_PROPAGATION_SOLVER == 5
+	#pragma message "SIMD macros included but non-vectorized solver specified"
+#endif /* not WAVE_PROPAGATION_SOLVER == 5  */
+
+/*
+ * Care about precision.
+ * Use single precision as default.
+ * 
+ * Additionally, declare the macro SHIFT_SIGN_RIGHT to work properly with 32 and 64 bit.
+ * Note the INTENTIONALLY FORGOTTEN semicolon at the end of the SHIFT_SIGN_RIGHT-definitions.
+ * This forces the user to write the semicolon himself!
+ */ 
+#if defined FLOAT64
+	#pragma message "Using double as type for real numbers"
+	typedef double real;
+	#pragma message "Using unsigned long long as type for integer numbers"
+	typedef unsigned long long integer;
+
+	#define SHIFT_SIGN_RIGHT(x) static_cast<integer>(static_cast<integer>(1) << (static_cast<integer>(64) - static_cast<integer>(x)))
+#else /* not defined FLOAT64  */
+	#pragma message "Using float as type for real numbers"
+	typedef float real;
+	#pragma message "Using unsigned int as type for integer numbers"
+	typedef unsigned int integer;
+
+	#define SHIFT_SIGN_RIGHT(x) (1 << (32 - static_cast<integer>(x)))
+#endif /* not defined FLOAT64  */
+
+/*
+ * Set control macros
+ * 
+ * Declare the vector length
+ * Additionally declare a configuration specific macro of the form
+ * 	VECTOR_extension_precision
+ * 
+ * Moreover, declare an integer, representing the number,
+ * which is returned by the instruction MOVEMASK, if the instruction is called on a vector with ALL SIGN BITS set
+ * Use is as
+ * 
+ * 	const real_vector vector = CONDITION(operand_a, operand_b);
+ * 
+ * 	if (MOVEMASK(vector) == VECTOR_FULL_MASK) {
+ *		// all components fullfill the condition
+ *	} else {
+ *		// some components do not fullfill the condition
+ *	} 
+ */ 
+#if  (defined __SSE4_1__ and not defined __AVX__) or (defined __AVX__ and defined AVX128)
+	#pragma message "Using SSE4.1 for vectorization"
+	#include <smmintrin.h>
+
+	typedef __m128i integer_vector;
+	#if  defined FLOAT64
+		#define VECTOR_LENGTH 2
+		#define VECTOR_SSE4_FLOAT64
+		#define VECTOR_FULL_MASK 0x00000003
+
+		typedef __m128d real_vector;
+		#pragma message "Using vectors of 2 doubles"
+	#else /* not defined FLOAT64  */
+		#define VECTOR_LENGTH 4
+		#define VECTOR_SSE4_FLOAT32
+		#define VECTOR_FULL_MASK 0x0000000F
+
+		typedef __m128 real_vector;
+		#pragma message "Using vectors of 4 floats"
+	#endif /* not defined FLOAT64 */
+#elif  defined __AVX__
+	#pragma message "Using AVX for vectorization"
+	#include <immintrin.h>
+	
+	typedef __m256i integer_vector;
+	#if  defined FLOAT64
+		#define VECTOR_LENGTH 4
+		#define VECTOR_AVX_FLOAT64
+		#define VECTOR_FULL_MASK 0x0000000F
+
+		typedef __m256d real_vector;
+		#pragma message "Using vectors of 4 doubles"
+	#else /* not defined FLOAT64  */
+		#define VECTOR_LENGTH 8
+		#define VECTOR_AVX_FLOAT32
+		#define VECTOR_FULL_MASK 0x000000FF
+
+		typedef __m256 real_vector;
+		#pragma message "Using vectors of 8 floats"
+	#endif /* not defined FLOAT64  */
+#else /* not defined __SSE4__ and not defined __AVX__  */
+	#pragma message "Using no vectorization at all"
+	#define VECTOR_LENGTH 1
+	#define VECTOR_NOVEC
+#endif /* not defined __SSE4__ and not defined __AVX__ */
+
+/*
+ * Control macros are set
+ * 
+ * Include the function macros
+ */ 
+#include "SIMD_DEFINITIONS.hpp"
+		
+		
+/*
+ * Include the cost macros if flop counting is demanded
+ */		
+#if defined COUNTFLOPS
+	#include "SIMD_COSTS.hpp"
+#endif /* defined COUNTFLOPS  */
+
+#endif /* #ifndef SIMD_TYPES_H */