28 #ifndef __SIMDHelper_H__
29 #define __SIMDHelper_H__
// Per-compiler selection of the stack-alignment helper macros.
// NOTE(review): only some of the __OGRE_SIMD_ALIGN_STACK definitions and
// branch directives are visible in this excerpt; the others are presumably
// on lines not shown here.
// Intel compiler: the stack helper expands to an _alloca(16) call and the
// function attribute is empty.
46 #if defined(__INTEL_COMPILER)
49 #define __OGRE_SIMD_ALIGN_STACK() _alloca(16)
50 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
// 32-bit x86 with GCC or Clang: the function attribute asks the compiler to
// realign the stack pointer on entry (force_align_arg_pointer).
52 #elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
54 #define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))
// MSVC: the function attribute is empty.
56 #elif defined(_MSC_VER)
58 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
// Remaining case (presumably an #else branch whose directive is not visible
// here): the function attribute is empty.
61 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
71 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
76 #include <xmmintrin.h>
79 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
// Packed reciprocal square root. Two alternative definitions are shown; the
// preprocessor condition choosing between them is not visible in this excerpt.
// Variant 1: the raw _mm_rsqrt_ps intrinsic.
109 #define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x)
// Variant 2: routes through __mm_rsqrt_nr_ps, defined later in this file.
111 #define __MM_RSQRT_PS(x) __mm_rsqrt_nr_ps(x) // Implemented below
/** Transposes, in place, the 4x4 float matrix whose rows are r0..r3.
    The unpack step interleaves row pairs (r0,r1) and (r2,r3); the
    movelh/movehl step then combines the halves so that on exit r0 holds
    element 0 of each input row, r1 element 1, r2 element 2 and r3 element 3.
    All four arguments are read and then assigned, so they must be
    modifiable __m128 lvalues. Declares temporaries named tmp0..tmp3.
*/
122 #define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3) \
124 __m128 tmp3, tmp2, tmp1, tmp0; \
131 tmp0 = _mm_unpacklo_ps(r0, r1); \
132 tmp2 = _mm_unpackhi_ps(r0, r1); \
133 tmp1 = _mm_unpacklo_ps(r2, r3); \
134 tmp3 = _mm_unpackhi_ps(r2, r3); \
136 r0 = _mm_movelh_ps(tmp0, tmp1); \
137 r1 = _mm_movehl_ps(tmp1, tmp0); \
138 r2 = _mm_movelh_ps(tmp2, tmp3); \
139 r3 = _mm_movehl_ps(tmp3, tmp2); \
/** Rearranges, in place, twelve floats held in v0, v1, v2.
    Treating the input as four consecutive 3-component groups
        v0 = a0 b0 c0 a1,  v1 = b1 c1 a2 b2,  v2 = c2 a3 b3 c3
    the shuffles produce
        v0 = a0 a1 a2 a3,  v1 = b0 b1 b2 b3,  v2 = c0 c1 c2 c3.
    All three arguments are read and then assigned, so they must be
    modifiable __m128 lvalues. Declares temporaries named tmp0..tmp2.
*/
150 #define __MM_TRANSPOSE4x3_PS(v0, v1, v2) \
152 __m128 tmp0, tmp1, tmp2; \
158 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0)); \
159 tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1)); \
160 tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2)); \
162 v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); \
163 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); \
164 v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); \
/** Rearranges, in place, twelve floats held in v0, v1, v2.
    Treating the input as three 4-component rows
        v0 = a0 a1 a2 a3,  v1 = b0 b1 b2 b3,  v2 = c0 c1 c2 c3
    the shuffles produce four consecutive 3-component groups
        v0 = a0 b0 c0 a1,  v1 = b1 c1 a2 b2,  v2 = c2 a3 b3 c3.
    All three arguments are read and then assigned, so they must be
    modifiable __m128 lvalues. Declares temporaries named tmp0..tmp2.
*/
174 #define __MM_TRANSPOSE3x4_PS(v0, v1, v2) \
176 __m128 tmp0, tmp1, tmp2; \
182 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1)); \
183 tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1)); \
184 tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0)); \
186 v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); \
187 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); \
188 v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); \
/** Broadcasts one element of v to all four lanes: v is shuffled with itself
    and fp is used for every selector field of _MM_SHUFFLE.
    v is expanded twice, so it should be free of side effects.
*/
194 #define __MM_SELECT(v, fp) \
195 _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))
/** Lane-wise sum of four packed vectors, grouped as (a + b) + (c + d). */
198 #define __MM_ACCUM4_PS(a, b, c, d) \
199 _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))
/** Lane-wise a0*b0 + a1*b1 + a2*b2 + a3*b3; the four products are summed
    with __MM_ACCUM4_PS.
*/
204 #define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \
205 __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))
/** Lane-wise r0*v0 + r1*v1 + r2*v2 + r3; r3 is added as-is, without a
    matching multiplier.
*/
210 #define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \
211 __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)
/** Lane-wise sum of three packed vectors, grouped as (a + b) + c. */
214 #define __MM_ACCUM3_PS(a, b, c) \
215 _mm_add_ps(_mm_add_ps(a, b), c)
/** Lane-wise r0*v0 + r1*v1 + r2*v2; the three products are summed with
    __MM_ACCUM3_PS.
*/
220 #define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \
221 __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))
/** Packed multiply-add, a * b + c, written as a separate _mm_mul_ps followed
    by _mm_add_ps.
*/
224 #define __MM_MADD_PS(a, b, c) \
225 _mm_add_ps(_mm_mul_ps(a, b), c)
/** Packed linear interpolation, computed as (b - a) * t + a.
    a appears twice in the expansion, so it should be free of side effects.
*/
228 #define __MM_LERP_PS(t, a, b) \
229 __MM_MADD_PS(_mm_sub_ps(b, a), t, a)
/** Scalar multiply-add, a * b + c, built from the _ss (single-element)
    forms of multiply and add.
*/
232 #define __MM_MADD_SS(a, b, c) \
233 _mm_add_ss(_mm_mul_ss(a, b), c)
/** Scalar linear interpolation, computed as (b - a) * t + a with the _ss
    (single-element) intrinsics.
    a appears twice in the expansion, so it should be free of side effects.
*/
236 #define __MM_LERP_SS(t, a, b) \
237 __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
/** Reads a __m128 by casting p to const __m128* and dereferencing it; no
    load intrinsic is used. The result is an lvalue referring to the memory
    at p. NOTE(review): p presumably has to be 16-byte aligned for this
    access to be valid; confirm at call sites.
*/
240 #define __MM_LOAD_PS(p) \
241 (*(const __m128*)(p))
/** Writes v to memory by casting p to __m128* and assigning through it; no
    store intrinsic is used. NOTE(review): p presumably has to be 16-byte
    aligned for this access to be valid; confirm at call sites.
*/
244 #define __MM_STORE_PS(p, v) \
245 (*(__m128*)(p) = (v))
/** Load/store helper selected by the boolean template parameter `aligned`
    (default false). This primary template loads with _mm_loadu_ps.
    NOTE(review): the braces, the load() signature and the store() body are
    not visible in this excerpt.
*/
250 template <
bool aligned = false>
251 struct SSEMemoryAccessor
// Load four floats from p using the unaligned-load intrinsic.
255 return _mm_loadu_ps(p);
// Store v to p (body not visible in this excerpt).
257 static FORCEINLINE void store(
float *p,
const __m128& v)
/** Specialisation of SSEMemoryAccessor for aligned == true.
    NOTE(review): the `template <>` header, the braces and the store() body
    are not visible in this excerpt.
*/
264 struct SSEMemoryAccessor<true>
// Returns a reference to the memory at p viewed as __m128 (via __MM_LOAD_PS),
// so no copy is made by the load itself.
266 static FORCEINLINE const __m128& load(
const float *p)
268 return __MM_LOAD_PS(p);
// Store v to p (body not visible in this excerpt).
270 static FORCEINLINE void store(
float *p,
const __m128& v)
/** Checks whether a pointer is aligned to a 16-byte boundary.
    @param p Address to test; it is never dereferenced.
    @return true when the low four bits of the address are zero.
*/
278 static FORCEINLINE bool _isAlignedForSSE(
const void *p)
280 return (((
size_t)p) & 15) == 0;
/** Reciprocal square root with one refinement step applied to the
    _mm_rsqrt_ps estimate.
    With t = _mm_rsqrt_ps(x), returns 0.5 * t * (3 - x * t * t), which is one
    Newton-Raphson iteration for 1/sqrt(x).
    @param x Packed input values.
    @return The refined packed reciprocal square roots.
*/
286 static FORCEINLINE __m128 __mm_rsqrt_nr_ps(
const __m128& x)
// Broadcast constants 0.5 and 3.0 used by the refinement formula.
288 static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
289 static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
// Initial estimate from the hardware approximation.
290 __m128 t = _mm_rsqrt_ps(x);
// (0.5 * t) * (3 - (x * t) * t)
291 return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
292 _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
// Stack-alignment check. The first definition asserts that the address of a
// local named `test` passes _isAlignedForSSE; the second, on the
// !OGRE_DEBUG_MODE branch, expands to nothing.
// NOTE(review): the opening #if and the declaration of `test` are not visible
// in this excerpt.
297 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() \
300 assert(_isAlignedForSSE(&test)); \
303 #else // !OGRE_DEBUG_MODE
304 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()
306 #endif // OGRE_DEBUG_MODE
309 #endif // __OGRE_HAVE_SSE
315 #endif // __SIMDHelper_H__