#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__

/*
    bmsse2.h : SSE2-optimized bit block operations
    (bit counting and combined logical-operation + count routines)
*/

#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"


namespace bm
{

/*!
    @brief SSE2 bit block bitcount.

    Counts 1 bits in the range [block, block_end) using the classic
    parallel (SWAR) bit-counting scheme on 128-bit registers, then
    sums the four per-lane totals.
*/
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR masks replicated across all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // zero the accumulator

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
        tmp1 = _mm_srli_epi32(b, 1);
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);
        b    = _mm_add_epi32(tmp1, tmp2);

        // b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
        tmp1 = _mm_srli_epi32(b, 2);
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b    = _mm_add_epi32(tmp1, tmp2);

        // b = (b + (b >> 4)) & 0x0F0F0F0F
        tmp1 = _mm_srli_epi32(b, 4);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m3);

        // b = b + (b >> 8)
        tmp1 = _mm_srli_epi32 (b, 8);
        b    = _mm_add_epi32(b, tmp1);

        // b = (b + (b >> 16)) & 0x0000003F
        tmp1 = _mm_srli_epi32 (b, 16);
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b);

    } while (block < block_end);

    // horizontal sum of the four per-lane counters
    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
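
// Illustrative usage sketch (not part of the library API). It assumes
// bm::set_block_size is the per-block count of 32-bit words, as defined
// in the core BitMagic headers, and that the block is 16-byte aligned:
//
//     bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR = {0};
//     blk[0] = 0xFFu; // 8 bits set
//     bm::id_t cnt =
//         bm::sse2_bit_count((const __m128i*)blk,
//                            (const __m128i*)(blk + bm::set_block_size));
//     // cnt == 8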

/*!
    @brief Template for bitcount of a logical combination of two blocks.

    Applies the supplied SSE2 binary functor (AND, OR, XOR, SUB) to each
    pair of 128-bit words and counts the 1 bits of the result on the fly,
    without materializing the combined block.
*/
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR masks replicated across all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // zero the accumulator
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        // combine the two blocks, then bit-count the result
        // (same SWAR steps as in sse2_bit_count)
        b = sse2_func(b, tmp1);

        tmp1 = _mm_srli_epi32(b, 1);   // (b >> 1) & 0x55555555
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);   //  b       & 0x55555555
        b    = _mm_add_epi32(tmp1, tmp2);

        tmp1 = _mm_srli_epi32(b, 2);   // 2-bit field sums
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b    = _mm_add_epi32(tmp1, tmp2);

        tmp1 = _mm_srli_epi32(b, 4);   // (b + (b >> 4)) & 0x0F0F0F0F
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m3);

        tmp1 = _mm_srli_epi32 (b, 8);  // b + (b >> 8)
        b    = _mm_add_epi32(b, tmp1);

        tmp1 = _mm_srli_epi32 (b, 16); // (b + (b >> 16)) & 0x3F
        b    = _mm_add_epi32(b, tmp1);
        b    = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b);

    } while (block < block_end);

    // horizontal sum of the four per-lane counters
    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
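
// Illustrative sketch: counting the AND of two blocks without building
// the intermediate result (this is what the VECT_BITCOUNT_AND macro
// below expands to); blk_a / blk_b are assumed 16-byte-aligned blocks:
//
//     bm::id_t cnt =
//         bm::sse2_bit_count_op((const __m128i*)blk_a,
//                               (const __m128i*)(blk_a + bm::set_block_size),
//                               (const __m128i*)blk_b,
//                               bm::sse2_and);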


// Mappings from the portable VECT_* macro interface to the SSE2
// implementations (the underlying functions live in bmsse_util.h).

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr((first), (last))

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) (dst), (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) (dst), (__m128i*) (dst_end), (value))
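
// Hypothetical usage sketch: the core library reaches these routines
// through the VECT_* layer, e.g. to fill and then invert a block:
//
//     VECT_SET_BLOCK(blk, blk + bm::set_block_size, 0xFFFFFFFFu);
//     VECT_INVERT_ARR(blk, blk + bm::set_block_size); // now all zero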


/*!
    @brief Bit block bitcount and count of bit value changes, in one pass.

    Counts the number of 0-1 / 1-0 transitions across the block (used to
    estimate GAP, i.e. run-length, compressibility) and returns the total
    population count through the bit_count out-parameter.
*/
inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR masks replicated across all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // zero the bit-count accumulator

    __m128i tmp1, tmp2;

    // seed the change count with one per 32-bit word (4 words per
    // 128-bit block); the per-word scans below adjust this estimate
    int count = (block_end - block)*4;

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1;
    bool first_word = true;

    // priming scan of the very first word
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);                       // within-word change mask
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // MSB carries to the next word
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // vector part: bit count plus the per-word change masks
        {
            __m128i b = _mm_load_si128(block);

            // change mask for each word: w ^ (w >> 1)
            tmp1 = _mm_srli_epi32(b, 1);
            tmp2 = _mm_xor_si128(b, tmp1);
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // SWAR population count (same steps as sse2_bit_count)
            {
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);
                b    = _mm_add_epi32(tmp1, tmp2);

                tmp1 = _mm_srli_epi32(b, 2);
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);
                b    = _mm_add_epi32(tmp1, tmp2);

                tmp1 = _mm_srli_epi32(b, 4);
                b    = _mm_add_epi32(b, tmp1);
                b    = _mm_and_si128(b, m3);

                tmp1 = _mm_srli_epi32 (b, 8);
                b    = _mm_add_epi32(b, tmp1);

                tmp1 = _mm_srli_epi32 (b, 16);
                b    = _mm_add_epi32(b, tmp1);
                b    = _mm_and_si128(b, m4);

                mcnt = _mm_add_epi32(mcnt, b);
            }
        }

        // scalar part: fold the stored change masks into the running
        // change count, correcting for word-boundary carries
        {
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false; // word 0 was handled by the priming scan
            }
            else
            {
                if ((w0=blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1));      // no change at boundary
                    count -= w_prev = (w0 >> w_shift);  // carry MSB forward
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev; // zero word: reset carry
                }
            }
            if ((w0=blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0=blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0=blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    // horizontal sum -> total population count
    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return count;
}
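
// Illustrative sketch (not library API): estimating GAP compressibility.
// A low change count relative to the block size suggests the block is a
// good candidate for the GAP (run-length) representation:
//
//     unsigned bc;
//     bm::id_t changes =
//         bm::sse2_bit_block_calc_count_change(
//             (const __m128i*)blk,
//             (const __m128i*)(blk + bm::set_block_size),
//             &bc);
//     // bc == total bits set; changes == number of 0-1/1-0 transitions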

} // namespace bm


#endif // BMSSE2__H__INCLUDED__