OGRE  1.9
Object-Oriented Graphics Rendering Engine
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
OgreSIMDHelper.h
Go to the documentation of this file.
1 /*
2 -----------------------------------------------------------------------------
3 This source file is part of OGRE
4  (Object-oriented Graphics Rendering Engine)
5 For the latest info, see http://www.ogre3d.org/
6 
7 Copyright (c) 2000-2014 Torus Knot Software Ltd
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 -----------------------------------------------------------------------------
27 */
28 #ifndef __SIMDHelper_H__
29 #define __SIMDHelper_H__
30 
31 #include "OgrePrerequisites.h"
33 
34 // Stack-alignment hackery.
35 //
36 // If macro __OGRE_SIMD_ALIGN_STACK defined, means there requests
37 // special code to ensure stack align to a 16-bytes boundary.
38 //
39 // Note:
40 // This macro can only guarantee callee stack pointer (esp) align
41 // to a 16-bytes boundary, but not that for frame pointer (ebp).
42 // Because most compiler might use frame pointer to access to stack
43 // variables, so you need to wrap those alignment required functions
44 // with extra function call.
45 //
46 #if defined(__INTEL_COMPILER)
47 // For intel's compiler, simply calling alloca seems to do the right
48 // thing. The size of the allocated block seems to be irrelevant.
49 #define __OGRE_SIMD_ALIGN_STACK() _alloca(16)
50 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
51 
52 #elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
53 // mark functions with GCC attribute to force stack alignment to 16 bytes
54 #define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))
55 
56 #elif defined(_MSC_VER)
57 // Fortunately, MSVC will align the stack automatically
58 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
59 
60 #else
61 #define __OGRE_SIMD_ALIGN_ATTRIBUTE
62 
63 #endif
64 
65 
66 // Additional platform-dependent header files and declares.
67 //
68 // NOTE: Should be sync with __OGRE_HAVE_SSE macro.
69 //
70 
71 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
72 
73 // GCC version 4.0 upwards should be reliable for official SSE now,
74 // so no longer define SSE macros ourselves
75 // We don't support gcc 3.x anymore anyway, although that had SSE it was a bit flaky?
76 #include <xmmintrin.h>
77 
78 
79 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
80 
81 
82 
83 //---------------------------------------------------------------------
84 // SIMD macros and helpers
85 //---------------------------------------------------------------------
86 
87 
88 namespace Ogre {
96 #if __OGRE_HAVE_SSE
97 
/** Approximate reciprocal square root of four packed floats.
    By default this maps straight to the hardware _mm_rsqrt_ps
    approximation (roughly 12 bits of mantissa precision). Flip the
    switch below to route through the Newton-Raphson refined
    __mm_rsqrt_nr_ps (defined later in this file) when more accuracy
    is needed at the cost of a few extra multiplies. */
#if 1
#define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x) __mm_rsqrt_nr_ps(x) // Implemented below
#endif
113 
/** Transpose a 4x4 matrix held in four packed-float SSE registers,
    in place: r0..r3 are both the input rows and the output columns.

    Wrapped in do { } while (0) instead of a bare block so the macro
    expands to a single statement and stays safe inside un-braced
    if/else bodies; call sites keep their trailing semicolon.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                            \
    do {                                                                \
        __m128 tmp3, tmp2, tmp1, tmp0;                                  \
                                                                        \
        /* r00 r01 r02 r03 */                                           \
        /* r10 r11 r12 r13 */                                           \
        /* r20 r21 r22 r23 */                                           \
        /* r30 r31 r32 r33 */                                           \
                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);     /* r00 r10 r01 r11 */       \
        tmp2 = _mm_unpackhi_ps(r0, r1);     /* r02 r12 r03 r13 */       \
        tmp1 = _mm_unpacklo_ps(r2, r3);     /* r20 r30 r21 r31 */       \
        tmp3 = _mm_unpackhi_ps(r2, r3);     /* r22 r32 r23 r33 */       \
                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);     /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(tmp1, tmp0);     /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(tmp2, tmp3);     /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(tmp3, tmp2);     /* r03 r13 r23 r33 */       \
    } while (0)
141 
/** Transpose a 4x3 matrix packed row-major into three SSE registers
    (v0..v2 hold 12 floats: r00 r01 r02 r10 | r11 r12 r20 r21 |
    r22 r30 r31 r32) into column-major form (r00 r10 r20 r30 |
    r01 r11 r21 r31 | r02 r12 r22 r32), in place. Inverse of
    __MM_TRANSPOSE3x4_PS.

    Wrapped in do { } while (0) so the macro behaves as one statement
    in un-braced if/else bodies; call sites keep their semicolon.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                    \
    do {                                                                    \
        __m128 tmp0, tmp1, tmp2;                                            \
                                                                            \
        /* r00 r01 r02 r10 */                                               \
        /* r11 r12 r20 r21 */                                               \
        /* r22 r30 r31 r32 */                                               \
                                                                            \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));   /* r00 r10 r22 r32 */ \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));   /* r01 r02 r11 r12 */ \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));   /* r20 r21 r30 r31 */ \
                                                                            \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); /* r00 r10 r20 r30 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r01 r11 r21 r31 */ \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); /* r02 r12 r22 r32 */ \
    } while (0)
166 
/** Transpose a 4x3 matrix stored column-major in three SSE registers
    (v0..v2 hold r00 r10 r20 r30 | r01 r11 r21 r31 | r02 r12 r22 r32)
    back into packed row-major form (r00 r01 r02 r10 | r11 r12 r20 r21 |
    r22 r30 r31 r32), in place. Inverse of __MM_TRANSPOSE4x3_PS.

    Wrapped in do { } while (0) so the macro behaves as one statement
    in un-braced if/else bodies; call sites keep their semicolon.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                    \
    do {                                                                    \
        __m128 tmp0, tmp1, tmp2;                                            \
                                                                            \
        /* r00 r10 r20 r30 */                                               \
        /* r01 r11 r21 r31 */                                               \
        /* r02 r12 r22 r32 */                                               \
                                                                            \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));   /* r10 r30 r02 r22 */ \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));   /* r11 r31 r12 r32 */ \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));   /* r00 r20 r01 r21 */ \
                                                                            \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); /* r00 r01 r02 r10 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r11 r12 r20 r21 */ \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); /* r22 r30 r31 r32 */ \
    } while (0)
190 
/** Broadcast: fill every lane of the result with lane 'fp' of 'v'.
    'fp' must be a compile-time constant in [0, 3]. */
#define __MM_SELECT(v, fp) \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/** Accumulate four packed-float values: a + b + c + d. */
#define __MM_ACCUM4_PS(a, b, c, d) \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Lane-wise multiply-accumulate of four vector pairs:
    a0*b0 + a1*b1 + a2*b2 + a3*b3. */
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** r0*v0 + r1*v1 + r2*v2 + r3 — e.g. a matrix * vector product where
    the vector's fourth (w) component is an implicit 1. */
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/** Accumulate three packed-float values: a + b + c. */
#define __MM_ACCUM3_PS(a, b, c) \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Lane-wise multiply-accumulate of three vector pairs:
    r0*v0 + r1*v1 + r2*v2. */
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/** Packed multiply-add: a * b + c. */
#define __MM_MADD_PS(a, b, c) \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/** Packed linear interpolation: a + t * (b - a). */
#define __MM_LERP_PS(t, a, b) \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/** Scalar (lowest-lane) multiply-add: a * b + c; upper lanes pass
    through from the first operand per _ss semantics. */
#define __MM_MADD_SS(a, b, c) \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/** Scalar (lowest-lane) linear interpolation: a + t * (b - a). */
#define __MM_LERP_SS(t, a, b) \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

/** Load four packed floats by direct __m128 dereference.
    'p' MUST be 16-byte aligned (see _isAlignedForSSE). */
#define __MM_LOAD_PS(p) \
    (*(const __m128*)(p))

/** Store four packed floats by direct __m128 assignment.
    'p' MUST be 16-byte aligned (see _isAlignedForSSE). */
#define __MM_STORE_PS(p, v) \
    (*(__m128*)(p) = (v))
247 
250  template <bool aligned = false>
251  struct SSEMemoryAccessor
252  {
253  static FORCEINLINE __m128 load(const float *p)
254  {
255  return _mm_loadu_ps(p);
256  }
257  static FORCEINLINE void store(float *p, const __m128& v)
258  {
259  _mm_storeu_ps(p, v);
260  }
261  };
262  // Special aligned accessor
263  template <>
264  struct SSEMemoryAccessor<true>
265  {
266  static FORCEINLINE const __m128& load(const float *p)
267  {
268  return __MM_LOAD_PS(p);
269  }
270  static FORCEINLINE void store(float *p, const __m128& v)
271  {
272  __MM_STORE_PS(p, v);
273  }
274  };
275 
278  static FORCEINLINE bool _isAlignedForSSE(const void *p)
279  {
280  return (((size_t)p) & 15) == 0;
281  }
282 
286  static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
287  {
288  static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
289  static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
290  __m128 t = _mm_rsqrt_ps(x);
291  return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
292  _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
293  }
294 
// Macro to check the stack aligned for SSE
#if OGRE_DEBUG_MODE
/** Debug-only sanity check: asserts that a freshly declared __m128
    stack variable is 16-byte aligned, i.e. the current stack frame is
    safe for SSE spills. Expands to nothing in release builds. See the
    __OGRE_SIMD_ALIGN_STACK / __OGRE_SIMD_ALIGN_ATTRIBUTE notes at the
    top of this file for how alignment is arranged per compiler. */
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() \
    { \
        __m128 test; \
        assert(_isAlignedForSSE(&test)); \
    }

#else // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif // OGRE_DEBUG_MODE
307 
308 
309 #endif // __OGRE_HAVE_SSE
310 
313 }
314 
315 #endif // __SIMDHelper_H__
/* Doxygen cross-reference residue from extraction:
   FORCEINLINE is defined in OgrePlatform.h (line 102). */