00001 /* 00002 ----------------------------------------------------------------------------- 00003 This source file is part of OGRE 00004 (Object-oriented Graphics Rendering Engine) 00005 For the latest info, see http://www.ogre3d.org/ 00006 00007 Copyright (c) 2000-2011 Torus Knot Software Ltd 00008 00009 Permission is hereby granted, free of charge, to any person obtaining a copy 00010 of this software and associated documentation files (the "Software"), to deal 00011 in the Software without restriction, including without limitation the rights 00012 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00013 copies of the Software, and to permit persons to whom the Software is 00014 furnished to do so, subject to the following conditions: 00015 00016 The above copyright notice and this permission notice shall be included in 00017 all copies or substantial portions of the Software. 00018 00019 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00020 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00021 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00022 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00023 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00024 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00025 THE SOFTWARE. 00026 ----------------------------------------------------------------------------- 00027 */ 00028 #ifndef __SIMDHelper_H__ 00029 #define __SIMDHelper_H__ 00030 00031 #include "OgrePrerequisites.h" 00032 #include "OgrePlatformInformation.h" 00033 00034 // Stack-alignment hackery. 00035 // 00036 // If macro __OGRE_SIMD_ALIGN_STACK defined, means there requests 00037 // special code to ensure stack align to a 16-bytes boundary. 00038 // 00039 // Note: 00040 // This macro can only guarantee callee stack pointer (esp) align 00041 // to a 16-bytes boundary, but not that for frame pointer (ebp). 00042 // Because most compiler might use frame pointer to access to stack 00043 // variables, so you need to wrap those alignment required functions 00044 // with extra function call. 00045 // 00046 #if defined(__INTEL_COMPILER) 00047 // For intel's compiler, simply calling alloca seems to do the right 00048 // thing. The size of the allocated block seems to be irrelevant. 00049 #define __OGRE_SIMD_ALIGN_STACK() _alloca(16) 00050 00051 #elif OGRE_CPU == OGRE_CPU_X86 && OGRE_COMPILER == OGRE_COMPILER_GNUC 00052 // 00053 // Horrible hack to align the stack to a 16-bytes boundary for gcc. 00054 // 00055 // We assume a gcc version >= 2.95 so that 00056 // -mpreferred-stack-boundary works. Otherwise, all bets are 00057 // off. However, -mpreferred-stack-boundary does not create a 00058 // stack alignment, but it only preserves it. Unfortunately, 00059 // since Ogre are designed as a flexibility library, user might 00060 // compile their application with wrong stack alignment, even 00061 // if user taken care with stack alignment, but many versions 00062 // of libc on linux call main() with the wrong initial stack 00063 // alignment the result that the code is now pessimally aligned 00064 // instead of having a 50% chance of being correct. 00065 // 00066 #if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64 00067 00068 #define __OGRE_SIMD_ALIGN_STACK() \ 00069 { \ 00070 /* Use alloca to allocate some memory on the stack. 
// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

#if OGRE_COMPILER == OGRE_COMPILER_MSVC || defined(__INTEL_COMPILER)
#include <xmmintrin.h>

#elif OGRE_COMPILER == OGRE_COMPILER_GNUC
// Don't define our own version of the SSE intrinsics if "xmmintrin.h"
// has already been included.
//
// Note: gcc on some platforms already includes "xmmintrin.h" for some
// reason. The macro _XMMINTRIN_H_INCLUDED used here is based on the
// "xmmintrin.h" that comes with cygwin gcc 3.4.4; checking it should
// avoid the duplicate-definition problem on gcc for x86.
//
#if !defined(_XMMINTRIN_H_INCLUDED)

// Simulate VC/ICC intrinsics. Only the intrinsics actually used are declared here.
# if OGRE_COMP_VER >= 350
    typedef float __m128 __attribute__ ((vector_size (16), aligned(16)));
    typedef int __m64 __attribute__ ((vector_size (8)));
# else
    typedef float __m128 __attribute__ ((mode(V4SF), aligned(16)));
    typedef int __m64 __attribute__ ((mode(V2SI)));
# endif

// Macro to declare intrinsic routines as always inline, even in debug builds
#define __ALWAYS_INLINE    FORCEINLINE __attribute__ ((__always_inline__))

// The shuffle instruction must be declared as a macro

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)                                    \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

#define _mm_shuffle_ps(a, b, imm8) __extension__                        \
    ({                                                                  \
        __m128 result;                                                  \
        __asm__("shufps %3, %2, %0" : "=x" (result) : "0" (a), "xm" (b), "N" (imm8)); \
        result;                                                         \
    })


// Load/store instructions

#define __MM_DECL_LD(name, instruction, type)                           \
    static __ALWAYS_INLINE __m128 _mm_##name(const type *addr)          \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %1, %0" : "=x" (result) : "m" (*addr)); \
        return result;                                                  \
    }

#define __MM_DECL_LD2(name, instruction, type)                          \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 val, const type *addr) \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (val), "m" (*addr)); \
        return result;                                                  \
    }

#define __MM_DECL_ST(name, instruction, type)                           \
    static __ALWAYS_INLINE void _mm_##name(type *addr, __m128 val)      \
    {                                                                   \
        __asm__( #instruction " %1, %0" : "=m" (*addr) : "x" (val));    \
    }

__MM_DECL_LD(loadu_ps, movups, float)
__MM_DECL_ST(storeu_ps, movups, float)

__MM_DECL_LD(load_ss, movss, float)
__MM_DECL_ST(store_ss, movss, float)

__MM_DECL_ST(storel_pi, movlps, __m64)
__MM_DECL_ST(storeh_pi, movhps, __m64)
__MM_DECL_LD2(loadl_pi, movlps, __m64)
__MM_DECL_LD2(loadh_pi, movhps, __m64)

#undef __MM_DECL_LD
#undef __MM_DECL_LD2
#undef __MM_DECL_ST

// Two-operand instructions

#define __MM_DECL_OP2(name, instruction, constraint)                    \
    static __ALWAYS_INLINE __m128 _mm_##name(__m128 a, __m128 b)        \
    {                                                                   \
        __m128 result;                                                  \
        __asm__( #instruction " %2, %0" : "=x" (result) : "0" (a), #constraint (b)); \
        return result;                                                  \
    }

__MM_DECL_OP2(add_ps, addps, xm)
__MM_DECL_OP2(add_ss, addss, xm)
__MM_DECL_OP2(sub_ps, subps, xm)
__MM_DECL_OP2(sub_ss, subss, xm)
__MM_DECL_OP2(mul_ps, mulps, xm)
__MM_DECL_OP2(mul_ss, mulss, xm)

__MM_DECL_OP2(xor_ps, xorps, xm)

__MM_DECL_OP2(unpacklo_ps, unpcklps, xm)
__MM_DECL_OP2(unpackhi_ps, unpckhps, xm)

__MM_DECL_OP2(movehl_ps, movhlps, x)
__MM_DECL_OP2(movelh_ps, movlhps, x)

__MM_DECL_OP2(cmpnle_ps, cmpnleps, xm)

#undef __MM_DECL_OP2

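// For reference, a sketch of what one instance above expands to:
// __MM_DECL_OP2(add_ps, addps, xm) produces the inline wrapper below. The
// "0" constraint ties input a to the output register, matching the
// two-operand (destructive) form of the SSE instruction, while "xm" lets
// b live in either an XMM register or memory:
//
//   static __ALWAYS_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
//   {
//       __m128 result;
//       __asm__("addps %2, %0" : "=x" (result) : "0" (a), "xm" (b));
//       return result;
//   }
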
// Other used instructions

static __ALWAYS_INLINE __m128 _mm_load_ps1(const float *addr)
{
    __m128 tmp = _mm_load_ss(addr);
    return _mm_shuffle_ps(tmp, tmp, 0);
}

static __ALWAYS_INLINE __m128 _mm_setzero_ps(void)
{
    __m128 result;
    __asm__("xorps %0, %0" : "=x" (result));
    return result;
}

static __ALWAYS_INLINE __m128 _mm_rsqrt_ps(__m128 val)
{
    __m128 result;
    __asm__("rsqrtps %1, %0" : "=x" (result) : "xm" (val));
    //__asm__("rsqrtps %0, %0" : "=x" (result) : "0" (val));
    return result;
}

static __ALWAYS_INLINE int _mm_movemask_ps(__m128 val)
{
    int result;
    __asm__("movmskps %1, %0" : "=r" (result) : "x" (val));
    return result;
}

#endif // !defined(_XMMINTRIN_H_INCLUDED)

#endif // OGRE_COMPILER == OGRE_COMPILER_GNUC

#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

    // Approximate the reciprocal square root of four packed floats. The raw
    // rsqrtps approximation is used by default; switch to the Newton-Raphson
    // refined version when more accuracy is needed.
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
#endif

    // Transpose a 4x4 matrix held as four __m128 rows, in place.
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                            \
    {                                                                   \
        __m128 tmp3, tmp2, tmp1, tmp0;                                  \
                                                                        \
        /* r00 r01 r02 r03 */                                           \
        /* r10 r11 r12 r13 */                                           \
        /* r20 r21 r22 r23 */                                           \
        /* r30 r31 r32 r33 */                                           \
                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);     /* r00 r10 r01 r11 */       \
        tmp2 = _mm_unpackhi_ps(r0, r1);     /* r02 r12 r03 r13 */       \
        tmp1 = _mm_unpacklo_ps(r2, r3);     /* r20 r30 r21 r31 */       \
        tmp3 = _mm_unpackhi_ps(r2, r3);     /* r22 r32 r23 r33 */       \
                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);     /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(tmp1, tmp0);     /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(tmp2, tmp3);     /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(tmp3, tmp2);     /* r03 r13 r23 r33 */       \
    }

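    // Usage sketch for the transpose macro: load four rows of a row-major
    // 4x4 float matrix, transpose them in place, then store them back.
    // The variable names are illustrative only; unaligned loads are used
    // here so 'm' needs no particular alignment:
    //
    //   float m[16];                              // row-major 4x4 matrix
    //   __m128 r0 = _mm_loadu_ps(m +  0);
    //   __m128 r1 = _mm_loadu_ps(m +  4);
    //   __m128 r2 = _mm_loadu_ps(m +  8);
    //   __m128 r3 = _mm_loadu_ps(m + 12);
    //   __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);     // rows become columns
    //   _mm_storeu_ps(m +  0, r0);
    //   _mm_storeu_ps(m +  4, r1);
    //   _mm_storeu_ps(m +  8, r2);
    //   _mm_storeu_ps(m + 12, r3);
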
    // Transpose four contiguously packed 3-component vectors (held in three
    // __m128 values) into three __m128 values that each hold one component
    // of all four vectors.
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                \
    {                                                                   \
        __m128 tmp0, tmp1, tmp2;                                        \
                                                                        \
        /* r00 r01 r02 r10 */                                           \
        /* r11 r12 r20 r21 */                                           \
        /* r22 r30 r31 r32 */                                           \
                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));    /* r00 r10 r22 r32 */ \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));    /* r01 r02 r11 r12 */ \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));    /* r20 r21 r30 r31 */ \
                                                                        \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */ \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */ \
    }

    // The inverse of __MM_TRANSPOSE4x3_PS: repack three component-wise
    // __m128 values into four contiguous 3-component vectors.
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                \
    {                                                                   \
        __m128 tmp0, tmp1, tmp2;                                        \
                                                                        \
        /* r00 r10 r20 r30 */                                           \
        /* r01 r11 r21 r31 */                                           \
        /* r02 r12 r22 r32 */                                           \
                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));    /* r10 r30 r02 r22 */ \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));    /* r11 r31 r12 r32 */ \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));    /* r00 r20 r01 r21 */ \
                                                                        \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */ \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */ \
    }

    // Replicate the fp'th component of v into all four lanes.
#define __MM_SELECT(v, fp)                                              \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

    // Accumulate four vectors of single-precision floating-point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                      \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

    // Compute four dot products of 4-component vectors at once; each input
    // holds one component (x, y, z or w) of four vectors (SoA layout).
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

    // As above, but with an implicit w = 1 on the second operand (r3 is
    // added unscaled), which suits transforming points by a 4x4 matrix.
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                      \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

    // Accumulate three vectors of single-precision floating-point values.
#define __MM_ACCUM3_PS(a, b, c)                                         \
    _mm_add_ps(_mm_add_ps(a, b), c)

    // Compute four dot products of 3-component vectors at once (SoA layout).
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                          \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

    // Multiply-add: a * b + c.
#define __MM_MADD_PS(a, b, c)                                           \
    _mm_add_ps(_mm_mul_ps(a, b), c)

    // Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b)                                           \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

    // Multiply-add on the lowest component only: a * b + c.
#define __MM_MADD_SS(a, b, c)                                           \
    _mm_add_ss(_mm_mul_ss(a, b), c)

    // Linear interpolation on the lowest component only.
#define __MM_LERP_SS(t, a, b)                                           \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

    // Aligned load/store via direct dereference; p must be 16-byte aligned.
#define __MM_LOAD_PS(p)                                                 \
    (*(__m128*)(p))

#define __MM_STORE_PS(p, v)                                             \
    (*(__m128*)(p) = (v))


    /** Helper to load/store SSE data based on whether or not the pointer
        is guaranteed to be 16-byte aligned.
    */
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };

    /** Check whether or not the given pointer is 16-byte aligned for SSE.
    */
    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }

    /** Approximate the reciprocal square root, refined with one
        Newton-Raphson iteration for extra accuracy.
    */
    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }

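    // A note on the refinement above: it is one Newton-Raphson step for
    // f(y) = 1/y^2 - x. Given the raw rsqrtps estimate t, the improved
    // value is
    //
    //     y = 0.5 * t * (3 - x * t * t)
    //
    // which approximately doubles the ~12 bits of precision delivered by
    // the hardware approximation, at the cost of four extra multiplies
    // and one subtract.
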
    // Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()                            \
    {                                                                   \
        __m128 test;                                                    \
        assert(_isAlignedForSSE(&test));                                \
    }

#else // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif // OGRE_DEBUG_MODE


#endif // __OGRE_HAVE_SSE

} // namespace Ogre

#endif // __SIMDHelper_H__
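
// Usage sketch for SSEMemoryAccessor: select the aligned or unaligned
// load/store path once, at compile time, through the template parameter.
// The scale4 helper below is hypothetical, not part of this header:
//
//   template <bool srcAligned, bool dstAligned>
//   static void scale4(float *dst, const float *src, float factor)
//   {
//       __m128 s = _mm_load_ps1(&factor);        // broadcast the scale
//       __m128 v = Ogre::SSEMemoryAccessor<srcAligned>::load(src);
//       Ogre::SSEMemoryAccessor<dstAligned>::store(dst, _mm_mul_ps(v, s));
//   }
//
// A caller would typically dispatch to the right instantiation at runtime
// based on Ogre::_isAlignedForSSE(src) and Ogre::_isAlignedForSSE(dst).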