OgreSIMDHelper.h
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2011 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is required
// to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee's stack pointer (esp) is
// aligned to a 16-byte boundary; it cannot do the same for the frame
// pointer (ebp). Since most compilers use the frame pointer to access stack
// variables, functions that require alignment should be wrapped in an extra
// function call (see the usage sketch after the macro definitions below).
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that
// -mpreferred-stack-boundary works. Otherwise, all bets are off.
// However, -mpreferred-stack-boundary does not create stack alignment;
// it only preserves it. Unfortunately, since Ogre is designed as a
// flexible library, users may compile their applications with the wrong
// stack alignment. Even if the user takes care of stack alignment, many
// versions of libc on Linux call main() with a misaligned initial stack,
// so the code ends up pessimally aligned instead of having a 50% chance
// of being correct.
//
#if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64

#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andl $-16, %esp");                   \
    }

#else // 64
#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andq $-16, %rsp");                   \
    }
#endif //64

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically.

#endif


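// Usage sketch (illustrative only, not part of this header; the function
// names are hypothetical): code whose __m128 locals require 16-byte stack
// alignment can be wrapped in a small outer function that realigns the stack
// before forwarding the call, so the inner function's frame is laid out on an
// aligned stack. The inner function must not be inlined into the wrapper for
// this to help.
//
//     static void transformVectorsImpl(float *dst, const float *src, size_t count)
//     {
//         // SSE code with __m128 stack variables lives here.
//     }
//
//     void transformVectors(float *dst, const float *src, size_t count)
//     {
//     #if defined(__OGRE_SIMD_ALIGN_STACK)
//         __OGRE_SIMD_ALIGN_STACK();
//     #endif
//         transformVectorsImpl(dst, src, count);
//     }
//
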
// Additional platform-dependent header files and declarations.
//
// NOTE: Keep this in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include <xmmintrin.h>

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h" has
// already been included.
//
// Note: gcc on some platforms already includes "xmmintrin.h" for some reason.
// The guard macro _XMMINTRIN_H_INCLUDED checked here is taken from the
// "xmmintrin.h" that ships with cygwin gcc 3.4.4; checking it should avoid
// duplicate-definition problems with gcc on x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate the VC/ICC intrinsics. Only the intrinsics actually used are
// declared here.
#   if OGRE_COMP_VER >= 350
typedef float __m128 __attribute__ ((vector_size (16), aligned(16)));
typedef int __m64 __attribute__ ((vector_size (8)));
#   else
typedef float __m128 __attribute__ ((mode(V4SF),aligned(16)));
typedef int __m64 __attribute__ ((mode(V2SI)));
#   endif

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE    FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle instruction must be declared as a macro

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
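// For example, _MM_SHUFFLE(3,2,1,0) evaluates to (3<<6)|(2<<4)|(1<<2)|0 = 0xE4,
// the identity selector, while _MM_SHUFFLE(0,0,0,0) == 0 broadcasts element 0
// (as _mm_load_ps1 below relies on).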

#define _mm_shuffle_ps(a, b, imm8) __extension__                                        \
    ({                                                                                  \
        __m128 result;                                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8));   \
        result;                                                                         \
    })
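// Selection semantics (matching the real intrinsic): with imm8 built from
// _MM_SHUFFLE(w,z,y,x), the result is { a[x], a[y], b[z], b[w] }; the low two
// lanes come from a, the high two from b.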


// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                               \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)              \
    {                                                                       \
        __m128 result;                                                      \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr));     \
        return result;                                                      \
    }

#define __MM_DECL_LD2(name, instruction, type)                                      \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr)          \
    {                                                                               \
        __m128 result;                                                              \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0"(val), "m" (*addr));   \
        return result;                                                              \
    }

#define __MM_DECL_ST(name, instruction, type)                               \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)          \
    {                                                                       \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));        \
    }

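// For instance, __MM_DECL_LD(loadu_ps, movups, float) below expands to an
// always-inline _mm_loadu_ps(const float *addr) whose body emits a single
// "movups" load into an __m128 register.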
__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)
__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST

// Two-operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)                        \
    {                                                                                   \
        __m128 result;                                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b));    \
        return result;                                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

// Other intrinsics used

    static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
    {
        __m128 tmp = _mm_load_ss(addr);
        return _mm_shuffle_ps(tmp, tmp, 0);
    }

    static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
    {
        __m128 result;
        __asm__("xorps %0, %0" : "=x" (result));
        return result;
    }

    static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
    {
        __m128 result;
        __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
        //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
        return result;
    }

    static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
    {
        int result;
        __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
        return result;
    }

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86


//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {
#if __OGRE_HAVE_SSE

#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif

#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
    {                                                                                   \
        __m128 tmp3, tmp2, tmp1, tmp0;                                                  \
                                                                                        \
                                                            /* r00 r01 r02 r03 */       \
                                                            /* r10 r11 r12 r13 */       \
                                                            /* r20 r21 r22 r23 */       \
                                                            /* r30 r31 r32 r33 */       \
                                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */     \
        tmp2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */     \
        tmp1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */     \
        tmp3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */     \
                                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);                         /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(tmp1, tmp0);                         /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(tmp2, tmp3);                         /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(tmp3, tmp2);                         /* r03 r13 r23 r33 */   \
    }
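
// Illustrative use (a sketch, not part of this header; the variable names are
// hypothetical): transposing a row-major 4x4 float matrix held in four
// registers, in place.
//
//     __m128 r0 = _mm_loadu_ps(m + 0);
//     __m128 r1 = _mm_loadu_ps(m + 4);
//     __m128 r2 = _mm_loadu_ps(m + 8);
//     __m128 r3 = _mm_loadu_ps(m + 12);
//     __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // r0..r3 now hold the columns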

#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
    {                                                                                   \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r01 r02 r10 */       \
                                                            /* r11 r12 r20 r21 */       \
                                                            /* r22 r30 r31 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */     \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */     \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */     \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                                \
    {                                                                                   \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r10 r20 r30 */       \
                                                            /* r01 r11 r21 r31 */       \
                                                            /* r02 r12 r22 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));    /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));    /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));    /* r00 r20 r01 r21 */   \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }

#define __MM_SELECT(v, fp)                                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

#define __MM_ACCUM4_PS(a, b, c, d)                                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))
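// Per lane this evaluates a0*b0 + a1*b1 + a2*b2 + a3*b3, i.e. four independent
// 4-component dot products when the inputs are laid out structure-of-arrays
// style (lane i of a0..a3 holds the four components of the i-th vector).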

#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

#define __MM_ACCUM3_PS(a, b, c)                                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

#define __MM_MADD_PS(a, b, c)                                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

#define __MM_LERP_PS(t, a, b)                                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)
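// Per lane this is a + t*(b - a): it interpolates from a towards b by the
// factor t (t = 0 yields a, t = 1 yields b).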

#define __MM_MADD_SS(a, b, c)                                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

#define __MM_LERP_SS(t, a, b)                                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

#define __MM_LOAD_PS(p)                                                             \
    (*(__m128*)(p))

#define __MM_STORE_PS(p, v)                                                         \
    (*(__m128*)(p) = (v))
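// Note: these aliased loads/stores require p to be 16-byte aligned; use
// _isAlignedForSSE() below (or the unaligned _mm_loadu_ps/_mm_storeu_ps) when
// alignment is not guaranteed.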


    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };
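
    // Illustrative use (a sketch, not part of this header; the function name
    // is hypothetical): a kernel templated on alignment can select the aligned
    // or unaligned accessor at compile time.
    //
    //     template <bool srcAligned>
    //     void scaleFloats(float *dst, const float *src, size_t count, float s)
    //     {
    //         __m128 scale = _mm_load_ps1(&s);
    //         for (size_t i = 0; i + 4 <= count; i += 4)
    //         {
    //             __m128 v = SSEMemoryAccessor<srcAligned>::load(src + i);
    //             _mm_storeu_ps(dst + i, _mm_mul_ps(v, scale));
    //         }
    //     }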

    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }

    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }
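    // The expression above performs one Newton-Raphson refinement step for
    // 1/sqrt(x): given the rsqrtps estimate t, it returns 0.5 * t * (3 - x*t*t),
    // improving the roughly 12-bit hardware estimate to near single-precision
    // accuracy.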

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__
