Commit 304d4698 authored by hoelzlw's avatar hoelzlw

Added SIMD files

parent a79ebaa7
This diff is collapsed.
/***********************************************************************************//**
*
* \file SIMD_COSTS.hpp
*
* \brief Contains latency costs for the intrinsics used for the vectorization
*
* \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
*
**************************************************************************************/
#pragma once
#ifndef SIMD_COSTS_H
#define SIMD_COSTS_H
#ifdef COUNTFLOPS
#ifndef SIMD_TYPES_H
#error "SIMD_COSTS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
#endif /* defined SIMD_TYPES_H */
#define COSTS_ADDS 1
#define COSTS_SUBS 1
#define COSTS_MULS 1
#define COSTS_DIVS 1
#define COSTS_SQRTS 1
#define COSTS_MAXS 1
#define COSTS_MINS 1
#define COSTS_CMPS 0
#define COSTS_ANDS 0
#define COSTS_ORS 0
#define COSTS_FABSS 1
#if defined VECTOR_SSE4_FLOAT32
#define COSTS_ADDV 4
#define COSTS_SUBV 4
#define COSTS_MULV 4
#define COSTS_DIVV 4
#define COSTS_SQRTV 4
#define COSTS_LOADU 0
#define COSTS_STOREU 0
#define COSTS_SETV_R 0
#define COSTS_SETV_I 0
#define COSTS_ZEROV_R 0
#define COSTS_ZEROV_I 0
#define COSTS_MAXV 4
#define COSTS_MINV 4
#define COSTS_CMP_LT 0
#define COSTS_CMP_LE 0
#define COSTS_CMP_GT 0
#define COSTS_CMP_GE 0
#define COSTS_CMP_EQ_I 0
#define COSTS_ANDV_R 0
#define COSTS_ORV_R 0
#define COSTS_XORV_R 0
#define COSTS_ORV_I 0
#define COSTS_ANDNOTV_R 0
#define COSTS_BLENDV 0
#define COSTS_BLENDV_I 0
#define COSTS_MOVEMASK 0
#define COSTS_SHIFT_LEFT 0
#define COSTS_FABS 4
#define COSTS_NOTV_R 0
#define COSTS_NOTV_I 0
#elif defined VECTOR_SSE4_FLOAT64
#error "SSE4 with double precision not implemented at the moment"
#elif defined VECTOR_AVX_FLOAT32
#define COSTS_ADDV 8
#define COSTS_SUBV 8
#define COSTS_MULV 8
#define COSTS_DIVV 8
#define COSTS_SQRTV 8
#define COSTS_LOADU 0
#define COSTS_STOREU 0
#define COSTS_SETV_R 0
#define COSTS_SETV_I 0
#define COSTS_ZEROV_R 0
#define COSTS_ZEROV_I 0
#define COSTS_MAXV 8
#define COSTS_MINV 8
#define COSTS_CMP_LT 0
#define COSTS_CMP_LE 0
#define COSTS_CMP_GT 0
#define COSTS_CMP_GE 0
#define COSTS_CMP_EQ_I 0
#define COSTS_ANDV_R 0
#define COSTS_ORV_R 0
#define COSTS_XORV_R 0
#define COSTS_ORV_I 0
#define COSTS_ANDNOTV_R 0
#define COSTS_BLENDV 0
#define COSTS_BLENDV_I 0
#define COSTS_MOVEMASK 0
#define COSTS_SHIFT_LEFT 0
#define COSTS_FABS 8
#define COSTS_NOTV_R 0
#define COSTS_NOTV_I 0
#elif defined VECTOR_AVX_FLOAT64
#error "AVX with double precision not implemented at the moment"
#else /* no vectorization type defined */
#pragma message "SIMD-Costs included, but no Vector-Type defined."
#endif
#endif /* not defined COUNTFLOPS */
#endif /* #ifndef SIMD_COSTS_H */
/***********************************************************************************//**
*
* \file SIMD_DEFINITIONS.hpp
*
* \brief Contains macro definitions for the intrinsics used for the vectorization
*
* \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
*
**************************************************************************************/
#pragma once
#ifndef SIMD_DEFINITIONS_H
#define SIMD_DEFINITIONS_H
#include <cmath>
#include <limits>
/*
* Check whether the file SIMD_TYPES.hpp has been included.
* This file (SIMD_DEFINITIONS.hpp) needs some macros to be set properly by that file (SIMD_TYPES.hpp).
*/
#ifndef SIMD_TYPES_H
#error "SIMD_DEFINITIONS included without SIMD_TYPES! Never include this file directly! Include it only via SIMD_TYPES!"
#endif /* defined SIMD_TYPES_H */
#if defined VECTOR_SSE4_FLOAT32
/*
* Map single precision SSE intrinsics
*/
#define ADDV _mm_add_ps
#define SUBV _mm_sub_ps
#define MULV _mm_mul_ps
#define DIVV _mm_div_ps
#define SQRTV _mm_sqrt_ps
#define LOADU _mm_loadu_ps
#define STOREU _mm_storeu_ps
#define SETV_R _mm_set1_ps
#define SETV_I _mm_set1_epi32
#define ZEROV_R _mm_setzero_ps
#define ZEROV_I _mm_setzero_si128
#define MAXV _mm_max_ps
#define MINV _mm_min_ps
#define CMP_LT _mm_cmplt_ps
#define CMP_LE _mm_cmple_ps
#define CMP_GT _mm_cmpgt_ps
#define CMP_GE _mm_cmpge_ps
#define CMP_EQ_I _mm_cmpeq_epi32
#define CMP_EQ_R _mm_cmpeq_ps
#define ANDV_R _mm_and_ps
#define ORV_R _mm_or_ps
#define XORV_R _mm_xor_ps
#define ORV_I _mm_or_si128
#define NOTV_R not_ps
#define NOTV_I not_si128
#define ANDNOTV_R _mm_andnot_ps
#define BLENDV _mm_blendv_ps
#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
#define MOVEMASK _mm_movemask_ps
#define SHIFT_LEFT _mm_slli_epi32
#define CAST_INT_TO_REAL_V _mm_castsi128_ps
#define CAST_REAL_TO_INT_V _mm_castps_si128
#define FABS fabs_ps
/*
* Compute the absolute value of a vector by forcing the sign bit to be zero
*/
inline __m128 fabs_ps(const __m128 x) {
static const __m128 sign_mask = CAST_INT_TO_REAL_V(_mm_set1_epi32(1 << 31));
return _mm_andnot_ps(sign_mask, x);
}
/*
* Bitwise NOT operation for integers
*/
inline __m128i not_si128(const __m128i x) {
static const __m128i mask = _mm_set1_epi32(~0);
return CAST_REAL_TO_INT_V(_mm_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
}
/*
* Bitwise NOT operation for reals
*/
inline __m128 not_ps(const __m128 x) {
static const __m128i mask = _mm_set1_epi32(~0);
return _mm_xor_ps(CAST_INT_TO_REAL_V(mask), x);
}
/*
* Check, whether a real_vector contains infinity or NaN
*/
inline bool checkVector (const __m128 x) {
static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
}
#elif defined VECTOR_SSE4_FLOAT64
/*
* Map double precision SSE intrinsics
*/
#error "SSE4 with double precision not implemented at the moment"
#elif defined VECTOR_AVX_FLOAT32
/*
* Map single precision AVX intrinsics
*/
#define ADDV _mm256_add_ps
#define SUBV _mm256_sub_ps
#define MULV _mm256_mul_ps
#define DIVV _mm256_div_ps
#define SQRTV _mm256_sqrt_ps
#define LOADU _mm256_loadu_ps
#define STOREU _mm256_storeu_ps
#define SETV_R _mm256_set1_ps
#define SETV_I _mm256_set1_epi32
#define ZEROV_R _mm256_setzero_ps
#define ZEROV_I _mm256_setzero_si256
#define MAXV _mm256_max_ps
#define MINV _mm256_min_ps
#define CMP_LT(x, y) _mm256_cmp_ps((x), (y), _CMP_LT_OS)
#define CMP_LE(x, y) _mm256_cmp_ps((x), (y), _CMP_LE_OS)
#define CMP_GT(x, y) _mm256_cmp_ps((x), (y), _CMP_GT_OS)
#define CMP_GE(x, y) _mm256_cmp_ps((x), (y), _CMP_GE_OS)
#define CMP_EQ_R(x, y) _mm256_cmp_ps((x), (y), _CMP_EQ_OS)
/*
* Define test for equality of integers
* Replace with
*
* #define CMP_EQ_I(x, y) _mm256_cmpeq_epi32((x), (y))
*
* when running with AVX2
*/
static inline __m256i CMP_EQ_I(const __m256i a, const __m256i b)
{
__m256i out = ZEROV_I();
const integer* const p = reinterpret_cast<const integer*>(&a);
const integer* const q = reinterpret_cast<const integer*>(&b);
integer* const r = reinterpret_cast<integer*>(&out);
for (int i = 0; i < VECTOR_LENGTH; ++i) {
r[i] = p[i] == q[i] ? 0xFFFFFFFF : 0;
}
return out;
}
#define ANDV_R _mm256_and_ps
#define ORV_R _mm256_or_ps
#define XORV_R _mm256_xor_ps
#define ORV_I(x, y) CAST_REAL_TO_INT_V(_mm256_or_ps(CAST_INT_TO_REAL_V(x), CAST_INT_TO_REAL_V(y)))
#define NOTV_R not_ps
#define NOTV_I not_si256
#define ANDNOTV_R _mm256_andnot_ps
#define BLENDV _mm256_blendv_ps
#define BLENDV_I(else_part, if_part, mask) CAST_REAL_TO_INT_V(_mm256_blendv_ps(CAST_INT_TO_REAL_V(else_part), CAST_INT_TO_REAL_V(if_part), mask))
#define MOVEMASK _mm256_movemask_ps
/*
* Define left shifting for integers
* Replace with
*
* #define SHIFT_LEFT(x, y) _mm256_slli_epi32((x), (y))
*
* when running with AVX2
*/
static inline __m256i SHIFT_LEFT(const __m256i x, const integer y)
{
__m256i out = ZEROV_I();
const integer* const p = reinterpret_cast<const integer*>(&x);
integer* const q = reinterpret_cast<integer*>(&out);
for (int i = 0; i < VECTOR_LENGTH; ++i) {
q[i] = p[i] << y;
}
return out;
}
#define CAST_INT_TO_REAL_V _mm256_castsi256_ps
#define CAST_REAL_TO_INT_V _mm256_castps_si256
#define FABS fabs_ps
/*
* Compute the absolute value of a vector by forcing the sign bit to be zero
*/
inline __m256 fabs_ps(const __m256 x) {
static const __m256 sign_mask = CAST_INT_TO_REAL_V(_mm256_set1_epi32(1 << 31));
return _mm256_andnot_ps(sign_mask, x);
}
/*
* Bitwise NOT operation for integers
*/
inline __m256i not_si256(const __m256i x) {
static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
return CAST_REAL_TO_INT_V(_mm256_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
}
/*
* Bitwise NOT operation for reals
*/
inline __m256 not_ps(const __m256 x) {
static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
return _mm256_xor_ps(CAST_INT_TO_REAL_V(mask), x);
}
/*
* Check, whether a real_vector contains infinity or NaN
*/
inline bool checkVector (const __m256 x) {
static const real_vector infinity = SETV_R(std :: numeric_limits<float> :: infinity());
return MOVEMASK(ANDV_R(CMP_EQ_R(x, x), NOTV_R(CMP_EQ_R(x, infinity)))) == VECTOR_FULL_MASK;
}
#elif defined VECTOR_AVX_FLOAT64
/*
* Map double precision AVX intrinsics
*/
#error "AVX with double precision not implemented at the moment"
#else /* no vectorization type defined */
/*
* No vectorization demanded.
* Do nothing, but inform the user
*/
#pragma message "SIMD-Definitions included, but no Vector-Type defined."
#endif
#endif /* #ifndef SIMD_DEFINITIONS_H */
/***********************************************************************************//**
*
* \file SIMD_TYPES.hpp
*
* \brief Defines the length of the vectors and the corresponding functions
*
* \author Wolfgang Hölzl (hoelzlw), hoelzlw AT in.tum.de
*
**************************************************************************************/
#pragma once
#ifndef SIMD_TYPES_H
#define SIMD_TYPES_H
/*
* Check, whether the function definitions are already included.
* If yes, this denotes an error.
*
* The SIMD_DEFINITIONS.hpp-file needs the control macros set in this file to work properly.
*/
#ifdef SIMD_DEFINITIONS_H
#error "SIMD Definitions already included! Never include that file directly! Include it only via including SIMD_TYPES (this file!)"
#endif /* defined SIMD_DEFINITIONS_H */
/*
* Check, whether a solver is chosen, that uses the macros in this file.
*/
#if not WAVE_PROPAGATION_SOLVER == 5
#pragma message "SIMD macros included but non-vectorized solver specified"
#endif /* not WAVE_PROPAGATION_SOLVER == 5 */
/*
* Care about precision.
* Use single precision as default.
*
* Additionally, declare the macro SHIFT_SIGN_RIGHT to work properly with 32 and 64 bit.
* Note the INTENTIONALLY FORGOTTEN semicolon at the end of the SHIFT_SIGN_RIGHT-definitions.
* This forces the user to write the semicolon himself!
*/
#if defined FLOAT64
#pragma message "Using double as type for real numbers"
typedef double real;
#pragma message "Using unsigned long long as type for integer numbers"
typedef unsigned long long integer;
#define SHIFT_SIGN_RIGHT(x) static_cast<integer>(static_cast<integer>(1) << (static_cast<integer>(64) - static_cast<integer>(x)))
#else /* not defined FLOAT64 */
#pragma message "Using float as type for real numbers"
typedef float real;
#pragma message "Using unsigned int as type for integer numbers"
typedef unsigned int integer;
#define SHIFT_SIGN_RIGHT(x) (1 << (32 - static_cast<integer>(x)))
#endif /* not defined FLOAT64 */
/*
* Set control macros
*
* Declare the vector length
* Additionally declare a configuration specific macro of the form
* VECTOR_extension_precision
*
* Moreover, declare an integer, representing the number,
* which is returned by the instruction MOVEMASK, if the instruction is called on a vector with ALL SIGN BITS set
* Use is as
*
* const real_vector vector = CONDITION(operand_a, operand_b);
*
* if (MOVEMASK(vector) == VECTOR_FULL_MASK) {
* // all components fullfill the condition
* } else {
* // some components do not fullfill the condition
* }
*/
#if (defined __SSE4_1__ and not defined __AVX__) or (defined __AVX__ and defined AVX128)
#pragma message "Using SSE4.1 for vectorization"
#include <smmintrin.h>
typedef __m128i integer_vector;
#if defined FLOAT64
#define VECTOR_LENGTH 2
#define VECTOR_SSE4_FLOAT64
#define VECTOR_FULL_MASK 0x00000003
typedef __m128d real_vector;
#pragma message "Using vectors of 2 doubles"
#else /* not defined FLOAT64 */
#define VECTOR_LENGTH 4
#define VECTOR_SSE4_FLOAT32
#define VECTOR_FULL_MASK 0x0000000F
typedef __m128 real_vector;
#pragma message "Using vectors of 4 floats"
#endif /* not defined FLOAT64 */
#elif defined __AVX__
#pragma message "Using AVX for vectorization"
#include <immintrin.h>
typedef __m256i integer_vector;
#if defined FLOAT64
#define VECTOR_LENGTH 4
#define VECTOR_AVX_FLOAT64
#define VECTOR_FULL_MASK 0x0000000F
typedef __m256d real_vector;
#pragma message "Using vectors of 4 doubles"
#else /* not defined FLOAT64 */
#define VECTOR_LENGTH 8
#define VECTOR_AVX_FLOAT32
#define VECTOR_FULL_MASK 0x000000FF
typedef __m256 real_vector;
#pragma message "Using vectors of 8 floats"
#endif /* not defined FLOAT64 */
#else /* not defined __SSE4__ and not defined __AVX__ */
#pragma message "Using no vectorization at all"
#define VECTOR_LENGTH 1
#define VECTOR_NOVEC
#endif /* not defined __SSE4__ and not defined __AVX__ */
/*
* Control macros are set
*
* Include the function macros
*/
#include "SIMD_DEFINITIONS.hpp"
/*
* Include the cost macros if flop counting is demanded
*/
#if defined COUNTFLOPS
#include "SIMD_COSTS.hpp"
#endif /* defined COUNTFLOPS */
#endif /* #ifndef SIMD_TYPES_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment