OgreSIMDHelper.h
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2014 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"


// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

// GCC 4.0 and later are reliable for the official SSE intrinsics,
// so we no longer define the SSE macros ourselves.
// We don't support gcc 3.x anymore anyway; although it had SSE support,
// it was a bit flaky.
#include <xmmintrin.h>


#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {
#if __OGRE_HAVE_SSE

// __MM_RSQRT_PS: reciprocal square root of four packed floats. By default it
// maps to the raw hardware estimate _mm_rsqrt_ps (about 12 bits of precision);
// switch the #if to 0 to use the Newton-Raphson refined __mm_rsqrt_nr_ps instead.
#if 1
#define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x) __mm_rsqrt_nr_ps(x) // Implemented below
#endif

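// Usage sketch (illustrative only): normalise four 3-component vectors at
// once, where x, y, z are hypothetical __m128 values holding the SoA
// components of the four vectors (__MM_DOT3x3_PS is defined further below):
//
//     __m128 sqlen = __MM_DOT3x3_PS(x, y, z, x, y, z);
//     __m128 inv   = __MM_RSQRT_PS(sqlen);
//     x = _mm_mul_ps(x, inv);
//     y = _mm_mul_ps(y, inv);
//     z = _mm_mul_ps(z, inv);
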
// Transpose a 4x4 matrix of single-precision floats held as four row
// registers r0..r3; on exit each register holds one column of the original.
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                    \
    {                                                           \
        __m128 tmp3, tmp2, tmp1, tmp0;                          \
                                                                \
        /* r00 r01 r02 r03 */                                   \
        /* r10 r11 r12 r13 */                                   \
        /* r20 r21 r22 r23 */                                   \
        /* r30 r31 r32 r33 */                                   \
                                                                \
        tmp0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */   \
        tmp2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */   \
        tmp1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */   \
        tmp3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */   \
                                                                \
        r0 = _mm_movelh_ps(tmp0, tmp1); /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(tmp1, tmp0); /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(tmp2, tmp3); /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(tmp3, tmp2); /* r03 r13 r23 r33 */   \
    }

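// Usage sketch (illustrative only; `m` is a hypothetical 16-byte aligned
// array of 16 floats holding a row-major 4x4 matrix):
//
//     __m128 r0 = _mm_load_ps(m +  0);
//     __m128 r1 = _mm_load_ps(m +  4);
//     __m128 r2 = _mm_load_ps(m +  8);
//     __m128 r3 = _mm_load_ps(m + 12);
//     __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // r0..r3 now hold the columns
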
// Transpose a 4x3 matrix, i.e. four rows of three floats packed contiguously
// into v0..v2; on exit v0..v2 hold the three columns (x, y, z of all four rows).
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                    \
    {                                                                       \
        __m128 tmp0, tmp1, tmp2;                                            \
                                                                            \
        /* r00 r01 r02 r10 */                                               \
        /* r11 r12 r20 r21 */                                               \
        /* r22 r30 r31 r32 */                                               \
                                                                            \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));   /* r00 r10 r22 r32 */ \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));   /* r01 r02 r11 r12 */ \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));   /* r20 r21 r30 r31 */ \
                                                                            \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); /* r00 r10 r20 r30 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r01 r11 r21 r31 */ \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); /* r02 r12 r22 r32 */ \
    }

// Inverse of __MM_TRANSPOSE4x3_PS: v0..v2 hold the three columns of a 4x3
// matrix; on exit the four rows are packed contiguously back into v0..v2.
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                    \
    {                                                                       \
        __m128 tmp0, tmp1, tmp2;                                            \
                                                                            \
        /* r00 r10 r20 r30 */                                               \
        /* r01 r11 r21 r31 */                                               \
        /* r02 r12 r22 r32 */                                               \
                                                                            \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));   /* r10 r30 r02 r22 */ \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));   /* r11 r31 r12 r32 */ \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));   /* r00 r20 r01 r21 */ \
                                                                            \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); /* r00 r01 r02 r10 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r11 r12 r20 r21 */ \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); /* r22 r30 r31 r32 */ \
    }

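// Usage sketch (illustrative only; `xyz` is a hypothetical 16-byte aligned
// array of 12 floats holding four packed x/y/z triples):
//
//     __m128 v0 = _mm_load_ps(xyz + 0);
//     __m128 v1 = _mm_load_ps(xyz + 4);
//     __m128 v2 = _mm_load_ps(xyz + 8);
//     __MM_TRANSPOSE4x3_PS(v0, v1, v2);   // v0 = all x, v1 = all y, v2 = all z
//     /* ... process the four vectors at once in SoA form ... */
//     __MM_TRANSPOSE3x4_PS(v0, v1, v2);   // back to packed x/y/z triples
//     _mm_store_ps(xyz + 0, v0);
//     _mm_store_ps(xyz + 4, v1);
//     _mm_store_ps(xyz + 8, v2);
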
// Fill a vector of single-precision floats with the element of `v` selected by `fp`.
#define __MM_SELECT(v, fp) \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

// Accumulate four vectors of single-precision floats.
#define __MM_ACCUM4_PS(a, b, c, d) \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

// Per-lane dot product of 4-component vectors stored component-wise (SoA):
// lane i = a0[i]*b0[i] + a1[i]*b1[i] + a2[i]*b2[i] + a3[i]*b3[i].
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

// As above, but the second vector has only three components and an implicit w of 1:
// lane i = r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i] + r3[i].
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

// Accumulate three vectors of single-precision floats.
#define __MM_ACCUM3_PS(a, b, c) \
    _mm_add_ps(_mm_add_ps(a, b), c)

// Per-lane dot product of 3-component vectors stored component-wise (SoA):
// lane i = r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i].
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

// Multiply-add: a * b + c.
#define __MM_MADD_PS(a, b, c) \
    _mm_add_ps(_mm_mul_ps(a, b), c)

// Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b) \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

// Scalar (lowest lane) multiply-add: a * b + c.
#define __MM_MADD_SS(a, b, c) \
    _mm_add_ss(_mm_mul_ss(a, b), c)

// Scalar (lowest lane) linear interpolation: a + t * (b - a).
#define __MM_LERP_SS(t, a, b) \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

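// Usage sketch (illustrative only): blend four positions at once, where
// ax/ay/az and bx/by/bz are hypothetical __m128 values holding the SoA
// components of two keyframes and `t` holds four interpolation weights:
//
//     __m128 rx = __MM_LERP_PS(t, ax, bx);
//     __m128 ry = __MM_LERP_PS(t, ay, by);
//     __m128 rz = __MM_LERP_PS(t, az, bz);
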
// Load / store four single-precision floats from / to 16-byte aligned memory.
#define __MM_LOAD_PS(p) \
    (*(const __m128*)(p))

#define __MM_STORE_PS(p, v) \
    (*(__m128*)(p) = (v))


    // Helper to load / store SSE data depending on whether or not the
    // memory is 16-byte aligned.
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };

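    // Usage sketch (illustrative only): a routine templated on the alignment
    // picks the right load/store at compile time; _isAlignedForSSE (below) can
    // select the instantiation at run time. `_scaleFloats` is hypothetical:
    //
    //     template <bool aligned>
    //     void _scaleFloats(float* dst, const float* src, __m128 scale, size_t count4)
    //     {
    //         for (size_t i = 0; i < count4; ++i, src += 4, dst += 4)
    //         {
    //             __m128 v = SSEMemoryAccessor<aligned>::load(src);
    //             SSEMemoryAccessor<aligned>::store(dst, _mm_mul_ps(v, scale));
    //         }
    //     }
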
    // Check whether or not the given pointer is perfectly aligned for SSE.
    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }

    // Compute the approximate reciprocal square root of four packed floats,
    // refined with one Newton-Raphson iteration for near full single precision.
    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }

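    // The refinement above is the standard Newton-Raphson iteration for
    // f(y) = 1/y^2 - x, whose root is y = 1/sqrt(x):
    //
    //     y1 = y0 * (3 - x * y0 * y0) / 2
    //
    // With y0 taken from _mm_rsqrt_ps (roughly 12 bits of accuracy), a single
    // iteration is enough to approach full single precision.
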
// Macro to check that the stack is aligned for SSE.
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() \
    { \
        __m128 test; \
        assert(_isAlignedForSSE(&test)); \
    }

#else // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif // OGRE_DEBUG_MODE

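// Usage sketch (illustrative only): place the check at the top of an
// SSE-optimised function that keeps __m128 locals on the stack, so a
// mis-aligned stack is caught by the assert in debug builds:
//
//     void transformSomething(/* ... */)
//     {
//         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
//         /* ... SSE code using __m128 locals ... */
//     }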

#endif // __OGRE_HAVE_SSE


}

#endif // __SIMDHelper_H__