00001 #ifndef BMSSE4__H__INCLUDED__
00002 #define BMSSE4__H__INCLUDED__
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include<mmintrin.h>
00034 #include<emmintrin.h>
00035 #include<smmintrin.h>
00036
00037 #include "bmdef.h"
00038 #include "bmsse_util.h"
00039
00040 namespace bm
00041 {
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053 inline
00054 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
00055 {
00056 bm::id_t count = 0;
00057 #ifdef BM64_SSE4
00058 const bm::id64_t* b = (bm::id64_t*) block;
00059 const bm::id64_t* b_end = (bm::id64_t*) block_end;
00060 do
00061 {
00062 count += _mm_popcnt_u64(b[0]) +
00063 _mm_popcnt_u64(b[1]);
00064 b += 2;
00065 } while (b < b_end);
00066 #else
00067 do
00068 {
00069 const unsigned* b = (unsigned*) block;
00070 count += _mm_popcnt_u32(b[0]) +
00071 _mm_popcnt_u32(b[1]) +
00072 _mm_popcnt_u32(b[2]) +
00073 _mm_popcnt_u32(b[3]);
00074 } while (++block < block_end);
00075 #endif
00076 return count;
00077 }
00078
00079
00080
00081
00082 BMFORCEINLINE
00083 unsigned op_xor(unsigned a, unsigned b)
00084 {
00085 unsigned ret = (a ^ b);
00086 return ret;
00087 }
00088
00089
00090
00091
00092 BMFORCEINLINE
00093 unsigned op_or(unsigned a, unsigned b)
00094 {
00095 return (a | b);
00096 }
00097
00098
00099
00100
00101 BMFORCEINLINE
00102 unsigned op_and(unsigned a, unsigned b)
00103 {
00104 return (a & b);
00105 }
00106
00107
00108 template<class Func>
00109 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
00110 const __m128i* BMRESTRICT block_end,
00111 const __m128i* BMRESTRICT mask_block,
00112 Func sse2_func)
00113 {
00114 bm::id_t count = 0;
00115 #ifdef BM64_SSE4
00116 do
00117 {
00118 __m128i tmp0 = _mm_load_si128(block);
00119 __m128i tmp1 = _mm_load_si128(mask_block);
00120 __m128i b = sse2_func(tmp0, tmp1);
00121
00122 count += _mm_popcnt_u64(_mm_extract_epi64(b, 0));
00123 count += _mm_popcnt_u64(_mm_extract_epi64(b, 1));
00124
00125 ++block; ++mask_block;
00126 } while (block < block_end);
00127 #else
00128 do
00129 {
00130 __m128i tmp0 = _mm_load_si128(block);
00131 __m128i tmp1 = _mm_load_si128(mask_block);
00132 __m128i b = sse2_func(tmp0, tmp1);
00133
00134 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
00135 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
00136 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
00137 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
00138
00139 ++block; ++mask_block;
00140 } while (block < block_end);
00141 #endif
00142
00143 return count;
00144 }
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
// ----------------------------------------------------------------------
// Vectorized-operation dispatch macros.
//
// These bind the generic VECT_* entry points used by the portable bitset
// kernels to the SSE2/SSE4.2 implementations in this header. All block
// pointers are expected to be 16-byte aligned. Each macro expands to a
// single expression/statement WITHOUT a trailing semicolon — the caller
// supplies it. All macro arguments are parenthesized at expansion for
// macro hygiene.
// ----------------------------------------------------------------------

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse4_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

// Fixed: removed stray trailing ';' from the expansion — it broke
// "if (...) VECT_INVERT_ARR(a, b); else ..." and was inconsistent with
// every other VECT_* macro in this group.
#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr((first), (last))

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*)(dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*)(dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*)(dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*)(dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*)(dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*)(dst), (__m128i*) (dst_end), (value))
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241 inline
00242 bm::id_t sse4_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
00243 const __m128i* BMRESTRICT block_end,
00244 unsigned* BMRESTRICT bit_count)
00245 {
00246
00247 register int count = (block_end - block)*4;
00248
00249 register bm::word_t w0, w_prev;
00250 const int w_shift = sizeof(w0) * 8 - 1;
00251 bool first_word = true;
00252 *bit_count = 0;
00253
00254
00255 {
00256 bm::word_t w;
00257 const bm::word_t* blk = (const bm::word_t*) block;
00258 w = w0 = blk[0];
00259 *bit_count += _mm_popcnt_u32(w);
00260 w ^= (w >> 1);
00261 count += _mm_popcnt_u32(w);
00262 count -= (w_prev = (w0 >> w_shift));
00263 }
00264
00265 do
00266 {
00267 __m128i b = _mm_load_si128(block);
00268 __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1));
00269 __m128i tmp3 = _mm_srli_epi32(b, w_shift);
00270
00271
00272
00273 {
00274 if (first_word)
00275 {
00276 first_word = false;
00277 }
00278 else
00279 {
00280 w0 = _mm_extract_epi32(b, 0);
00281 if (w0)
00282 {
00283 *bit_count += _mm_popcnt_u32(w0);
00284 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
00285 count -= !(w_prev ^ (w0 & 1));
00286 count -= w_prev = _mm_extract_epi32(tmp3, 0);
00287 }
00288 else
00289 {
00290 count -= !w_prev; w_prev ^= w_prev;
00291 }
00292 }
00293 w0 = _mm_extract_epi32(b, 1);
00294 if (w0)
00295 {
00296 *bit_count += _mm_popcnt_u32(w0);
00297 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
00298 count -= !(w_prev ^ (w0 & 1));
00299 count -= w_prev = _mm_extract_epi32(tmp3, 1);
00300 }
00301 else
00302 {
00303 count -= !w_prev; w_prev ^= w_prev;
00304 }
00305 w0 = _mm_extract_epi32(b, 2);
00306 if (w0)
00307 {
00308 *bit_count += _mm_popcnt_u32(w0);
00309 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
00310 count -= !(w_prev ^ (w0 & 1));
00311 count -= w_prev = _mm_extract_epi32(tmp3, 2);
00312 }
00313 else
00314 {
00315 count -= !w_prev; w_prev ^= w_prev;
00316 }
00317 w0 = _mm_extract_epi32(b, 3);
00318 if (w0)
00319 {
00320 *bit_count += _mm_popcnt_u32(w0);
00321 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
00322 count -= !(w_prev ^ (w0 & 1));
00323 count -= w_prev = _mm_extract_epi32(tmp3, 3);
00324 }
00325 else
00326 {
00327 count -= !w_prev; w_prev ^= w_prev;
00328 }
00329 }
00330 } while (++block < block_end);
00331
00332 return count;
00333 }
00334
00335
00336
00337 }
00338
00339
00340
00341
00342 #endif