NCBI C++ ToolKit
bmsse2.h

1 #ifndef BMSSE2__H__INCLUDED__
2 #define BMSSE2__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmsse2.h
22  \brief Compute functions for SSE2 SIMD instruction set (internal)
23 */
24 
25 #if !defined(__arm64__) && !defined(__arm__)
26 #ifndef BMWASMSIMDOPT
27 #include<mmintrin.h>
28 #endif
29 #include<emmintrin.h>
30 #endif
31 
32 #include "bmdef.h"
33 #include "bmutil.h"
34 #include "bmsse_util.h"
35 
36 
37 #ifdef __GNUG__
38 #pragma GCC diagnostic push
39 #pragma GCC diagnostic ignored "-Wconversion"
40 #endif
41 
42 namespace bm
43 {
44 
45 
46 /*!
47  SSE2 optimized bitcounting function implements parallel bitcounting
48  algorithm for SSE2 instruction set.
49 
50 <pre>
51 unsigned CalcBitCount32(unsigned b)
52 {
53  b = (b & 0x55555555) + (b >> 1 & 0x55555555);
54  b = (b & 0x33333333) + (b >> 2 & 0x33333333);
55  b = (b + (b >> 4)) & 0x0F0F0F0F;
56  b = b + (b >> 8);
57  b = (b + (b >> 16)) & 0x0000003F;
58  return b;
59 }
60 </pre>
61 
62  @ingroup SSE2
63 
64 */
65 inline
66 bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
67 {
68  const unsigned mu1 = 0x55555555;
69  const unsigned mu2 = 0x33333333;
70  const unsigned mu3 = 0x0F0F0F0F;
71  const unsigned mu4 = 0x0000003F;
72 
73  // Loading masks
74  __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
75  __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
76  __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
77  __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
78  __m128i mcnt;
79  mcnt = _mm_xor_si128(m1, m1); // cnt = 0
80 
81  __m128i tmp1, tmp2;
82  do
83  {
84  __m128i b = _mm_load_si128(block);
85  ++block;
86 
87  // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
88  tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
89  tmp1 = _mm_and_si128(tmp1, m1);
90  tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
91  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
92 
93  // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
94  tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
95  tmp1 = _mm_and_si128(tmp1, m2);
96  tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
97  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
98 
99  // b = (b + (b >> 4)) & 0x0F0F0F0F;
100  tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
101  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
102  b = _mm_and_si128(b, m3); // & 0x0F0F0F0F
103 
104  // b = b + (b >> 8);
105  tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
106  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
107 
108  // b = (b + (b >> 16)) & 0x0000003F;
109  tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
110  b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
111  b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
112 
113  mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
114 
115  } while (block < block_end);
116 
117 
118  bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
119  _mm_store_si128((__m128i*)tcnt, mcnt);
120 
121  return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
122 }
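
/*
    Usage sketch (editorial note, not part of the original header): counting
    the population of one 16-byte aligned bit-block of bm::set_block_size
    words. BM_ALIGN16 / BM_ALIGN16ATTR come from bmdef.h.

        static bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR; // zero-initialized
        blk[0] = 0xFFu; blk[1] = 1u;
        bm::id_t cnt = bm::sse2_bit_count((const __m128i*)blk,
                                          (const __m128i*)(blk + bm::set_block_size));
        // cnt == 9
*/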
123 
124 
125 
126 template<class Func>
127 bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
128  const __m128i* BMRESTRICT block_end,
129  const __m128i* BMRESTRICT mask_block,
130  Func sse2_func)
131 {
132  const unsigned mu1 = 0x55555555;
133  const unsigned mu2 = 0x33333333;
134  const unsigned mu3 = 0x0F0F0F0F;
135  const unsigned mu4 = 0x0000003F;
136 
137  // Loading masks
138  __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
139  __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
140  __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
141  __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
142  __m128i mcnt;
143  mcnt = _mm_xor_si128(m1, m1); // cnt = 0
144  do
145  {
146  __m128i tmp1, tmp2;
147  __m128i b = _mm_load_si128(block++);
148 
149  tmp1 = _mm_load_si128(mask_block++);
150 
151  b = sse2_func(b, tmp1);
152 
153  // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
154  tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
155  tmp1 = _mm_and_si128(tmp1, m1);
156  tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
157  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
158 
159  // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
160  tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
161  tmp1 = _mm_and_si128(tmp1, m2);
162  tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
163  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
164 
165  // b = (b + (b >> 4)) & 0x0F0F0F0F;
166  tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
167  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
168  b = _mm_and_si128(b, m3); // & 0x0F0F0F0F
169 
170  // b = b + (b >> 8);
171  tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
172  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
173 
174  // b = (b + (b >> 16)) & 0x0000003F;
175  tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
176  b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
177  b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
178 
179  mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
180 
181  } while (block < block_end);
182 
183  bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
184  _mm_store_si128((__m128i*)tcnt, mcnt);
185 
186  return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
187 }
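
/*
    Usage sketch (editorial note, not part of the original header):
    sse2_bit_count_op() counts bits in the result of a logical operation
    applied wave-by-wave, without materializing the result block. The functor
    is one of the sse2_* binary ops from bmsse_util.h, exactly as in the
    VECT_BITCOUNT_AND / _OR / _XOR / _SUB macros below. 'a_blk' and 'b_blk'
    stand for two aligned blocks of bm::set_block_size words.

        bm::id_t and_cnt = bm::sse2_bit_count_op(
                                (const __m128i*)a_blk,
                                (const __m128i*)(a_blk + bm::set_block_size),
                                (const __m128i*)b_blk,
                                bm::sse2_and);   // popcount of (A AND B)
*/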
188 
189 /*!
190  @brief check if block is all zero bits
191  @ingroup SSE2
192 */
193 inline
194 bool sse2_is_all_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
195 {
196  __m128i w;
197  const __m128i maskz = _mm_setzero_si128();
198  const __m128i* BMRESTRICT block_end =
199  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
200 
201  do
202  {
203  w = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
204  auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
205  w = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
206  auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, maskz));
207  if (m1 != 0xFFFF || m2 != 0xFFFF)
208  return false;
209  block += 4;
210  } while (block < block_end);
211  return true;
212 }
213 
214 /*!
215  @brief check if block is all ONE bits
216  @ingroup SSE2
217 */
218 inline
219 bool sse2_is_all_one(const __m128i* BMRESTRICT block) BMNOEXCEPT
220 {
221  __m128i w;
222  const __m128i mask1 = _mm_set_epi32 (~0u, ~0u, ~0u, ~0u);
223  const __m128i* BMRESTRICT block_end =
224  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
225 
226  do
227  {
228  w = _mm_and_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
229  auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
230  w = _mm_and_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
231  auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(w, mask1));
232  if (m1 != 0xFFFF || m2 != 0xFFFF)
233  return false;
234  block+=4;
235  } while (block < block_end);
236  return true;
237 }
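
/*
    Usage sketch (editorial note, not part of the original header): both
    predicates expect a 16-byte aligned block of bm::set_block_size words
    (a full BitMagic bit-block).

        static bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR; // zero-initialized
        bool z = bm::sse2_is_all_zero((const __m128i*)blk);   // true
        ::memset(blk, 0xFF, sizeof(blk));                     // <cstring>
        bool o = bm::sse2_is_all_one((const __m128i*)blk);    // true
*/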
238 
239 /*!
240  @brief check if digest stride is all zero bits
241  @ingroup SSE2
242 */
243 BMFORCEINLINE
244 bool sse2_is_digest_zero(const __m128i* BMRESTRICT block) BMNOEXCEPT
245 {
246  const __m128i maskz = _mm_setzero_si128();
247 
248  __m128i wA = _mm_or_si128(_mm_load_si128(block+0), _mm_load_si128(block+1));
249  __m128i wB = _mm_or_si128(_mm_load_si128(block+2), _mm_load_si128(block+3));
250  wA = _mm_or_si128(wA, wB);
251  auto m1 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
252 
253  wA = _mm_or_si128(_mm_load_si128(block+4), _mm_load_si128(block+5));
254  wB = _mm_or_si128(_mm_load_si128(block+6), _mm_load_si128(block+7));
255  wA = _mm_or_si128(wA, wB);
256  auto m2 = _mm_movemask_epi8(_mm_cmpeq_epi8(wA, maskz));
257 
258  if (m1 != 0xFFFF || m2 != 0xFFFF)
259  return false;
260  return true;
261 }
262 
263 /*!
264  @brief set digest stride to 0xFF.. or 0x0 value
265  @ingroup SSE2
266 */
267 BMFORCEINLINE
268 void sse2_block_set_digest(__m128i* dst, unsigned value) BMNOEXCEPT
269 {
270  __m128i mV = _mm_set1_epi32(int(value));
271  _mm_store_si128(dst, mV); _mm_store_si128(dst + 1, mV);
272  _mm_store_si128(dst + 2, mV); _mm_store_si128(dst + 3, mV);
273  _mm_store_si128(dst + 4, mV); _mm_store_si128(dst + 5, mV);
274  _mm_store_si128(dst + 6, mV); _mm_store_si128(dst + 7, mV);
275 }
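
/*
    Usage sketch (editorial note, not part of the original header): a "digest
    stride" is one wave of bm::set_block_digest_wave_size words (8 x __m128i),
    addressed here by wave index inside a full aligned bit-block 'blk'.

        unsigned wave = 3;
        __m128i* stride = (__m128i*)(blk + wave * bm::set_block_digest_wave_size);
        bm::sse2_block_set_digest(stride, 0u);                    // stride := 0x00..
        bool z = bm::sse2_is_digest_zero((const __m128i*)stride); // true
        bm::sse2_block_set_digest(stride, ~0u);                   // stride := 0xFF..
*/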
276 
277 
278 /**
279  Build partial XOR product of 2 bit-blocks using digest mask
280 
281  @param target_block - target := block ^ xor_block
282  @param block - arg1
283  @param xor_block - arg2
284  @param digest - mask for each block wave to XOR (1) or just copy (0)
285 
286  @ingroup SSE2
287 */
288 inline
289 void sse2_bit_block_xor(bm::word_t* target_block,
290  const bm::word_t* block,
291  const bm::word_t* xor_block,
292  bm::id64_t digest) BMNOEXCEPT
293 {
294  for (unsigned i = 0; i < bm::block_waves; ++i)
295  {
296  const bm::id64_t mask = (1ull << i);
297  unsigned off = (i * bm::set_block_digest_wave_size);
298  const __m128i* sub_block = (__m128i*) (block + off);
299  __m128i* t_sub_block = (__m128i*)(target_block + off);
300 
301  if (digest & mask) // XOR filtered sub-block
302  {
303  const __m128i* xor_sub_block = (__m128i*) (xor_block + off);
304  __m128i mA, mB, mC, mD;
305  mA = _mm_xor_si128(_mm_load_si128(sub_block),
306  _mm_load_si128(xor_sub_block));
307  mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
308  _mm_load_si128(xor_sub_block+1));
309  mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
310  _mm_load_si128(xor_sub_block+2));
311  mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
312  _mm_load_si128(xor_sub_block+3));
313 
314  _mm_store_si128(t_sub_block, mA);
315  _mm_store_si128(t_sub_block+1, mB);
316  _mm_store_si128(t_sub_block+2, mC);
317  _mm_store_si128(t_sub_block+3, mD);
318 
319  mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
320  _mm_load_si128(xor_sub_block+4));
321  mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
322  _mm_load_si128(xor_sub_block+5));
323  mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
324  _mm_load_si128(xor_sub_block+6));
325  mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
326  _mm_load_si128(xor_sub_block+7));
327 
328  _mm_store_si128(t_sub_block+4, mA);
329  _mm_store_si128(t_sub_block+5, mB);
330  _mm_store_si128(t_sub_block+6, mC);
331  _mm_store_si128(t_sub_block+7, mD);
332 
333  }
334  else // just copy source
335  {
336  _mm_store_si128(t_sub_block , _mm_load_si128(sub_block));
337  _mm_store_si128(t_sub_block+1, _mm_load_si128(sub_block+1));
338  _mm_store_si128(t_sub_block+2, _mm_load_si128(sub_block+2));
339  _mm_store_si128(t_sub_block+3, _mm_load_si128(sub_block+3));
340 
341  _mm_store_si128(t_sub_block+4, _mm_load_si128(sub_block+4));
342  _mm_store_si128(t_sub_block+5, _mm_load_si128(sub_block+5));
343  _mm_store_si128(t_sub_block+6, _mm_load_si128(sub_block+6));
344  _mm_store_si128(t_sub_block+7, _mm_load_si128(sub_block+7));
345  }
346  } // for i
347 }
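
/*
    Usage sketch (editorial note, not part of the original header): the digest
    selects which of the 64 waves get XOR-ed; the remaining waves are copied
    from 'src_blk' verbatim. 'target_blk', 'src_blk' and 'xor_blk' stand for
    aligned arrays of bm::set_block_size words.

        bm::id64_t digest = (1ull << 0) | (1ull << 7);   // XOR only waves 0 and 7
        bm::sse2_bit_block_xor(target_blk, src_blk, xor_blk, digest);
*/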
348 
349 /**
350  Build partial XOR product of 2 bit-blocks using digest mask
351 
352  @param target_block - target ^= xor_block
353  @param xor_block - arg1
354  @param digest - mask for each block wave to XOR (if 1)
355 
356  @ingroup SSE2
357  @internal
358 */
359 inline
360 void sse2_bit_block_xor_2way(bm::word_t* target_block,
361  const bm::word_t* xor_block,
362  bm::id64_t digest) BMNOEXCEPT
363 {
364  while (digest)
365  {
366  bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
367  unsigned wave = bm::word_bitcount64(t - 1);
368  unsigned off = wave * bm::set_block_digest_wave_size;
369 
370  const __m128i* sub_block = (const __m128i*) (xor_block + off);
371  __m128i* t_sub_block = (__m128i*)(target_block + off);
372 
373  __m128i mA, mB, mC, mD;
374  mA = _mm_xor_si128(_mm_load_si128(sub_block),
375  _mm_load_si128(t_sub_block));
376  mB = _mm_xor_si128(_mm_load_si128(sub_block+1),
377  _mm_load_si128(t_sub_block+1));
378  mC = _mm_xor_si128(_mm_load_si128(sub_block+2),
379  _mm_load_si128(t_sub_block+2));
380  mD = _mm_xor_si128(_mm_load_si128(sub_block+3),
381  _mm_load_si128(t_sub_block+3));
382 
383  _mm_store_si128(t_sub_block, mA);
384  _mm_store_si128(t_sub_block+1, mB);
385  _mm_store_si128(t_sub_block+2, mC);
386  _mm_store_si128(t_sub_block+3, mD);
387 
388  mA = _mm_xor_si128(_mm_load_si128(sub_block+4),
389  _mm_load_si128(t_sub_block+4));
390  mB = _mm_xor_si128(_mm_load_si128(sub_block+5),
391  _mm_load_si128(t_sub_block+5));
392  mC = _mm_xor_si128(_mm_load_si128(sub_block+6),
393  _mm_load_si128(t_sub_block+6));
394  mD = _mm_xor_si128(_mm_load_si128(sub_block+7),
395  _mm_load_si128(t_sub_block+7));
396 
397  _mm_store_si128(t_sub_block+4, mA);
398  _mm_store_si128(t_sub_block+5, mB);
399  _mm_store_si128(t_sub_block+6, mC);
400  _mm_store_si128(t_sub_block+7, mD);
401 
402  digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
403  } // while
404 }
405 
406 
407 
408 /*!
409  @brief AND block digest stride
410  *dst &= *src
411  @return true if stride is all zero
412  @ingroup SSE2
413 */
414 BMFORCEINLINE
415 bool sse2_and_digest(__m128i* BMRESTRICT dst,
416  const __m128i* BMRESTRICT src) BMNOEXCEPT
417 {
418  __m128i m1A, m1B, m1C, m1D;
419  const __m128i maskz = _mm_setzero_si128();
420 
421  m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
422  m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
423  m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
424  m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
425 
426  _mm_store_si128(dst+0, m1A);
427  _mm_store_si128(dst+1, m1B);
428  _mm_store_si128(dst+2, m1C);
429  _mm_store_si128(dst+3, m1D);
430 
431  m1A = _mm_or_si128(m1A, m1B);
432  m1C = _mm_or_si128(m1C, m1D);
433  m1A = _mm_or_si128(m1A, m1C);
434 
435  bool z1 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
436 
437  m1A = _mm_and_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
438  m1B = _mm_and_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
439  m1C = _mm_and_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
440  m1D = _mm_and_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
441 
442  _mm_store_si128(dst+4, m1A);
443  _mm_store_si128(dst+5, m1B);
444  _mm_store_si128(dst+6, m1C);
445  _mm_store_si128(dst+7, m1D);
446 
447  m1A = _mm_or_si128(m1A, m1B);
448  m1C = _mm_or_si128(m1C, m1D);
449  m1A = _mm_or_si128(m1A, m1C);
450 
451  bool z2 = _mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF;
452 
453  return z1 & z2;
454 }
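
/*
    Usage sketch (editorial note, not part of the original header):
    digest-guided AND of a single stride, with the caller clearing the
    corresponding digest bit when the stride becomes empty (the usual digest
    maintenance pattern). 'dst_blk' / 'src_blk' stand for aligned bit-blocks.

        bm::id64_t digest = ...;            // current digest of dst_blk
        unsigned   wave = 5;
        unsigned   off  = wave * bm::set_block_digest_wave_size;
        bool all_zero = bm::sse2_and_digest((__m128i*)(dst_blk + off),
                                            (const __m128i*)(src_blk + off));
        if (all_zero)
            digest &= ~(1ull << wave);
*/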
455 
456 /*!
457  @brief AND-OR block digest stride
458  *dst |= *src1 & *src2
459 
460  @return true if stride is all zero
461  @ingroup SSE2
462 */
463 BMFORCEINLINE
464 bool sse2_and_or_digest_2way(__m128i* BMRESTRICT dst,
465  const __m128i* BMRESTRICT src1,
466  const __m128i* BMRESTRICT src2) BMNOEXCEPT
467 {
468  __m128i m1A, m1B, m1C, m1D;
469  __m128i mACC1;
470  const __m128i maskz = _mm_setzero_si128();
471 
472  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
473  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
474  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
475  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
476 
477  mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
478  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
479 
480  m1A = _mm_or_si128(_mm_load_si128(dst+0), m1A);
481  m1B = _mm_or_si128(_mm_load_si128(dst+1), m1B);
482  m1C = _mm_or_si128(_mm_load_si128(dst+2), m1C);
483  m1D = _mm_or_si128(_mm_load_si128(dst+3), m1D);
484 
485  _mm_store_si128(dst+0, m1A);
486  _mm_store_si128(dst+1, m1B);
487  _mm_store_si128(dst+2, m1C);
488  _mm_store_si128(dst+3, m1D);
489 
490 
491  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
492  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
493  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
494  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
495 
496  mACC1 = _mm_or_si128(_mm_or_si128(m1A, m1B), _mm_or_si128(m1C, m1D));
497  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mACC1, maskz)) == 0xFFFF);
498 
499  m1A = _mm_or_si128(_mm_load_si128(dst+4), m1A);
500  m1B = _mm_or_si128(_mm_load_si128(dst+5), m1B);
501  m1C = _mm_or_si128(_mm_load_si128(dst+6), m1C);
502  m1D = _mm_or_si128(_mm_load_si128(dst+7), m1D);
503 
504  _mm_store_si128(dst+4, m1A);
505  _mm_store_si128(dst+5, m1B);
506  _mm_store_si128(dst+6, m1C);
507  _mm_store_si128(dst+7, m1D);
508 
509  return z1 & z2;
510 }
511 
512 
513 /*!
514  @brief AND block digest stride
515  @return true if stride is all zero
516  @ingroup SSE2
517 */
518 inline
519 bool sse2_and_digest_5way(__m128i* BMRESTRICT dst,
520  const __m128i* BMRESTRICT src1,
521  const __m128i* BMRESTRICT src2,
522  const __m128i* BMRESTRICT src3,
523  const __m128i* BMRESTRICT src4) BMNOEXCEPT
524 {
525  __m128i m1A, m1B, m1C, m1D;
526  __m128i m1E, m1F, m1G, m1H;
527 
528  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
529  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
530  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
531  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
532 
533  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
534  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
535  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
536  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
537 
538  m1A = _mm_and_si128(m1A, m1E);
539  m1B = _mm_and_si128(m1B, m1F);
540  m1C = _mm_and_si128(m1C, m1G);
541  m1D = _mm_and_si128(m1D, m1H);
542 
543  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
544  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
545  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
546  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
547 
548  _mm_store_si128(dst+0, m1A);
549  _mm_store_si128(dst+1, m1B);
550  _mm_store_si128(dst+2, m1C);
551  _mm_store_si128(dst+3, m1D);
552 
553  m1A = _mm_or_si128(m1A, m1B);
554  m1C = _mm_or_si128(m1C, m1D);
555  m1A = _mm_or_si128(m1A, m1C);
556 
557  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
558 
559  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
560  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
561  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
562  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
563 
564  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
565  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
566  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
567  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
568 
569  m1A = _mm_and_si128(m1A, m1E);
570  m1B = _mm_and_si128(m1B, m1F);
571  m1C = _mm_and_si128(m1C, m1G);
572  m1D = _mm_and_si128(m1D, m1H);
573 
574  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
575  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
576  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
577  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
578 
579  _mm_store_si128(dst+4, m1A);
580  _mm_store_si128(dst+5, m1B);
581  _mm_store_si128(dst+6, m1C);
582  _mm_store_si128(dst+7, m1D);
583 
584  m1A = _mm_or_si128(m1A, m1B);
585  m1C = _mm_or_si128(m1C, m1D);
586  m1A = _mm_or_si128(m1A, m1C);
587 
588  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
589 
590  return z1 & z2;
591 }
592 
593 /*!
594  @brief AND block digest stride
595  @return true if stride is all zero
596  @ingroup SSE2
597 */
598 inline
599 bool sse2_and_digest_3way(__m128i* BMRESTRICT dst,
600  const __m128i* BMRESTRICT src1,
601  const __m128i* BMRESTRICT src2) BMNOEXCEPT
602 {
603  __m128i m1A, m1B, m1C, m1D;
604 // __m128i m1E, m1F, m1G, m1H;
605 
606  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
607  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
608  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
609  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
610 /*
611  m1E = _mm_and_si128(_mm_load_si128(src3+0), _mm_load_si128(src4+0));
612  m1F = _mm_and_si128(_mm_load_si128(src3+1), _mm_load_si128(src4+1));
613  m1G = _mm_and_si128(_mm_load_si128(src3+2), _mm_load_si128(src4+2));
614  m1H = _mm_and_si128(_mm_load_si128(src3+3), _mm_load_si128(src4+3));
615 
616  m1A = _mm_and_si128(m1A, m1E);
617  m1B = _mm_and_si128(m1B, m1F);
618  m1C = _mm_and_si128(m1C, m1G);
619  m1D = _mm_and_si128(m1D, m1H);
620 */
621  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
622  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
623  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
624  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
625 
626  _mm_store_si128(dst+0, m1A);
627  _mm_store_si128(dst+1, m1B);
628  _mm_store_si128(dst+2, m1C);
629  _mm_store_si128(dst+3, m1D);
630 
631  m1A = _mm_or_si128(m1A, m1B);
632  m1C = _mm_or_si128(m1C, m1D);
633  m1A = _mm_or_si128(m1A, m1C);
634 
635  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
636 
637  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
638  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
639  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
640  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
641 /*
642  m1E = _mm_and_si128(_mm_load_si128(src3+4), _mm_load_si128(src4+4));
643  m1F = _mm_and_si128(_mm_load_si128(src3+5), _mm_load_si128(src4+5));
644  m1G = _mm_and_si128(_mm_load_si128(src3+6), _mm_load_si128(src4+6));
645  m1H = _mm_and_si128(_mm_load_si128(src3+7), _mm_load_si128(src4+7));
646 
647  m1A = _mm_and_si128(m1A, m1E);
648  m1B = _mm_and_si128(m1B, m1F);
649  m1C = _mm_and_si128(m1C, m1G);
650  m1D = _mm_and_si128(m1D, m1H);
651 */
652  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
653  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
654  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
655  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
656 
657  _mm_store_si128(dst+4, m1A);
658  _mm_store_si128(dst+5, m1B);
659  _mm_store_si128(dst+6, m1C);
660  _mm_store_si128(dst+7, m1D);
661 
662  m1A = _mm_or_si128(m1A, m1B);
663  m1C = _mm_or_si128(m1C, m1D);
664  m1A = _mm_or_si128(m1A, m1C);
665 
666  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, _mm_setzero_si128())) == 0xFFFF);
667 
668  return z1 & z2;
669 }
670 
671 
672 
673 /*!
674  @brief AND block digest stride
675  *dst = *src1 & *src2
676 
677  @return true if stride is all zero
678  @ingroup SSE2
679 */
680 BMFORCEINLINE
681 bool sse2_and_digest_2way(__m128i* BMRESTRICT dst,
682  const __m128i* BMRESTRICT src1,
683  const __m128i* BMRESTRICT src2) BMNOEXCEPT
684 {
685  __m128i m1A, m1B, m1C, m1D;
686 
687  m1A = _mm_and_si128(_mm_load_si128(src1+0), _mm_load_si128(src2+0));
688  m1B = _mm_and_si128(_mm_load_si128(src1+1), _mm_load_si128(src2+1));
689  m1C = _mm_and_si128(_mm_load_si128(src1+2), _mm_load_si128(src2+2));
690  m1D = _mm_and_si128(_mm_load_si128(src1+3), _mm_load_si128(src2+3));
691 
692  _mm_store_si128(dst+0, m1A);
693  _mm_store_si128(dst+1, m1B);
694  _mm_store_si128(dst+2, m1C);
695  _mm_store_si128(dst+3, m1D);
696 
697  m1A = _mm_or_si128(m1A, m1B);
698  m1C = _mm_or_si128(m1C, m1D);
699  m1A = _mm_or_si128(m1A, m1C);
700 
701  const __m128i maskz = _mm_setzero_si128();
702  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
703 
704  m1A = _mm_and_si128(_mm_load_si128(src1+4), _mm_load_si128(src2+4));
705  m1B = _mm_and_si128(_mm_load_si128(src1+5), _mm_load_si128(src2+5));
706  m1C = _mm_and_si128(_mm_load_si128(src1+6), _mm_load_si128(src2+6));
707  m1D = _mm_and_si128(_mm_load_si128(src1+7), _mm_load_si128(src2+7));
708 
709  _mm_store_si128(dst+4, m1A);
710  _mm_store_si128(dst+5, m1B);
711  _mm_store_si128(dst+6, m1C);
712  _mm_store_si128(dst+7, m1D);
713 
714  m1A = _mm_or_si128(m1A, m1B);
715  m1C = _mm_or_si128(m1C, m1D);
716  m1A = _mm_or_si128(m1A, m1C);
717 
718  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
719 
720  return z1 & z2;
721 }
722 
723 /*!
724  @brief SUB (AND NOT) block digest stride
725  *dst &= ~*src
726 
727  @return true if stride is all zero
728  @ingroup SSE2
729 */
730 BMFORCEINLINE
731 bool sse2_sub_digest(__m128i* BMRESTRICT dst,
732  const __m128i* BMRESTRICT src) BMNOEXCEPT
733 {
734  __m128i m1A, m1B, m1C, m1D;
735  const __m128i maskz = _mm_setzero_si128();
736 
737  m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
738  m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
739  m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
740  m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
741 
742  _mm_store_si128(dst+0, m1A);
743  _mm_store_si128(dst+1, m1B);
744  _mm_store_si128(dst+2, m1C);
745  _mm_store_si128(dst+3, m1D);
746 
747  m1A = _mm_or_si128(m1A, m1B);
748  m1C = _mm_or_si128(m1C, m1D);
749  m1A = _mm_or_si128(m1A, m1C);
750 
751  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
752 
753  m1A = _mm_andnot_si128(_mm_load_si128(src+4), _mm_load_si128(dst+4));
754  m1B = _mm_andnot_si128(_mm_load_si128(src+5), _mm_load_si128(dst+5));
755  m1C = _mm_andnot_si128(_mm_load_si128(src+6), _mm_load_si128(dst+6));
756  m1D = _mm_andnot_si128(_mm_load_si128(src+7), _mm_load_si128(dst+7));
757 
758  _mm_store_si128(dst+4, m1A);
759  _mm_store_si128(dst+5, m1B);
760  _mm_store_si128(dst+6, m1C);
761  _mm_store_si128(dst+7, m1D);
762 
763  m1A = _mm_or_si128(m1A, m1B);
764  m1C = _mm_or_si128(m1C, m1D);
765  m1A = _mm_or_si128(m1A, m1C);
766 
767  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
768 
769  return z1 & z2;
770 }
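
/*
    Usage sketch (editorial note, not part of the original header): same
    calling pattern as sse2_and_digest() above, computing dst &= ~src for one
    stride ('dst_blk', 'src_blk' and 'off' as in the earlier sketch).

        bool all_zero = bm::sse2_sub_digest((__m128i*)(dst_blk + off),
                                            (const __m128i*)(src_blk + off));
*/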
771 
772 /*!
773  @brief 2-operand SUB (AND NOT) block digest stride
774  *dst = *src1 & ~*src2
775 
776  @return true if stride is all zero
777  @ingroup SSE2
778 */
779 BMFORCEINLINE
780 bool sse2_sub_digest_2way(__m128i* BMRESTRICT dst,
781  const __m128i* BMRESTRICT src1,
782  const __m128i* BMRESTRICT src2) BMNOEXCEPT
783 {
784  __m128i m1A, m1B, m1C, m1D;
785  const __m128i maskz = _mm_setzero_si128();
786 
787  m1A = _mm_andnot_si128(_mm_load_si128(src2+0), _mm_load_si128(src1+0));
788  m1B = _mm_andnot_si128(_mm_load_si128(src2+1), _mm_load_si128(src1+1));
789  m1C = _mm_andnot_si128(_mm_load_si128(src2+2), _mm_load_si128(src1+2));
790  m1D = _mm_andnot_si128(_mm_load_si128(src2+3), _mm_load_si128(src1+3));
791 
792  _mm_store_si128(dst+0, m1A);
793  _mm_store_si128(dst+1, m1B);
794  _mm_store_si128(dst+2, m1C);
795  _mm_store_si128(dst+3, m1D);
796 
797  m1A = _mm_or_si128(m1A, m1B);
798  m1C = _mm_or_si128(m1C, m1D);
799  m1A = _mm_or_si128(m1A, m1C);
800 
801  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
802 
803  m1A = _mm_andnot_si128(_mm_load_si128(src2+4), _mm_load_si128(src1+4));
804  m1B = _mm_andnot_si128(_mm_load_si128(src2+5), _mm_load_si128(src1+5));
805  m1C = _mm_andnot_si128(_mm_load_si128(src2+6), _mm_load_si128(src1+6));
806  m1D = _mm_andnot_si128(_mm_load_si128(src2+7), _mm_load_si128(src1+7));
807 
808  _mm_store_si128(dst+4, m1A);
809  _mm_store_si128(dst+5, m1B);
810  _mm_store_si128(dst+6, m1C);
811  _mm_store_si128(dst+7, m1D);
812 
813  m1A = _mm_or_si128(m1A, m1B);
814  m1C = _mm_or_si128(m1C, m1D);
815  m1A = _mm_or_si128(m1A, m1C);
816 
817  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
818 
819  return z1 & z2;
820 }
821 
822 /*!
823  @brief SUB block digest stride
824  @return true if stride is all zero
825  @ingroup SSE2
826 */
827 inline
828 bool sse2_sub_digest_5way(__m128i* BMRESTRICT dst,
829  const __m128i* BMRESTRICT src1,
830  const __m128i* BMRESTRICT src2,
831  const __m128i* BMRESTRICT src3,
832  const __m128i* BMRESTRICT src4) BMNOEXCEPT
833 {
834  __m128i m1A, m1B, m1C, m1D;
835  __m128i m1E, m1F, m1G, m1H;
836  __m128i maskFF = _mm_set1_epi32(~0u);
837 
838  m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
839  m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
840  m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
841  m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
842 
843  m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
844  m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
845  m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
846  m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
847 
848  m1A = _mm_and_si128(m1A, m1E);
849  m1B = _mm_and_si128(m1B, m1F);
850  m1C = _mm_and_si128(m1C, m1G);
851  m1D = _mm_and_si128(m1D, m1H);
852 
853  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
854  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
855  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
856  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
857 
858  _mm_store_si128(dst+0, m1A);
859  _mm_store_si128(dst+1, m1B);
860  _mm_store_si128(dst+2, m1C);
861  _mm_store_si128(dst+3, m1D);
862 
863  m1A = _mm_or_si128(m1A, m1B);
864  m1C = _mm_or_si128(m1C, m1D);
865  m1A = _mm_or_si128(m1A, m1C);
866 
867  const __m128i maskz = _mm_setzero_si128();
868  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
869 
870  m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
871  m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
872  m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
873  m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
874 
875  m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
876  m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
877  m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
878  m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
879 
880  m1A = _mm_and_si128(m1A, m1E);
881  m1B = _mm_and_si128(m1B, m1F);
882  m1C = _mm_and_si128(m1C, m1G);
883  m1D = _mm_and_si128(m1D, m1H);
884 
885  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
886  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
887  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
888  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
889 
890  _mm_store_si128(dst+4, m1A);
891  _mm_store_si128(dst+5, m1B);
892  _mm_store_si128(dst+6, m1C);
893  _mm_store_si128(dst+7, m1D);
894 
895  m1A = _mm_or_si128(m1A, m1B);
896  m1C = _mm_or_si128(m1C, m1D);
897  m1A = _mm_or_si128(m1A, m1C);
898 
899  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
900 
901  return z1 & z2;
902 }
903 
904 
905 /*!
906  @brief SUB block digest stride
907  @return true if stride is all zero
908  @ingroup SSE2
909 */
910 inline
911 bool sse2_sub_digest_3way(__m128i* BMRESTRICT dst,
912  const __m128i* BMRESTRICT src1,
913  const __m128i* BMRESTRICT src2) BMNOEXCEPT
914 {
915  __m128i m1A, m1B, m1C, m1D;
916 // __m128i m1E, m1F, m1G, m1H;
917  __m128i maskFF = _mm_set1_epi32(~0u);
918 
919  m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+0)), _mm_xor_si128(maskFF,_mm_load_si128(src2+0)));
920  m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+1)), _mm_xor_si128(maskFF,_mm_load_si128(src2+1)));
921  m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+2)), _mm_xor_si128(maskFF,_mm_load_si128(src2+2)));
922  m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+3)), _mm_xor_si128(maskFF,_mm_load_si128(src2+3)));
923 /*
924  m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+0)), _mm_xor_si128(maskFF,_mm_load_si128(src4+0)));
925  m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+1)), _mm_xor_si128(maskFF,_mm_load_si128(src4+1)));
926  m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+2)), _mm_xor_si128(maskFF,_mm_load_si128(src4+2)));
927  m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+3)), _mm_xor_si128(maskFF,_mm_load_si128(src4+3)));
928 
929  m1A = _mm_and_si128(m1A, m1E);
930  m1B = _mm_and_si128(m1B, m1F);
931  m1C = _mm_and_si128(m1C, m1G);
932  m1D = _mm_and_si128(m1D, m1H);
933 */
934  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+0));
935  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+1));
936  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+2));
937  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+3));
938 
939  _mm_store_si128(dst+0, m1A);
940  _mm_store_si128(dst+1, m1B);
941  _mm_store_si128(dst+2, m1C);
942  _mm_store_si128(dst+3, m1D);
943 
944  m1A = _mm_or_si128(m1A, m1B);
945  m1C = _mm_or_si128(m1C, m1D);
946  m1A = _mm_or_si128(m1A, m1C);
947 
948  const __m128i maskz = _mm_setzero_si128();
949  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
950 
951  m1A = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+4)), _mm_xor_si128(maskFF,_mm_load_si128(src2+4)));
952  m1B = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+5)), _mm_xor_si128(maskFF,_mm_load_si128(src2+5)));
953  m1C = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+6)), _mm_xor_si128(maskFF,_mm_load_si128(src2+6)));
954  m1D = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src1+7)), _mm_xor_si128(maskFF,_mm_load_si128(src2+7)));
955 /*
956  m1E = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+4)), _mm_xor_si128(maskFF,_mm_load_si128(src4+4)));
957  m1F = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+5)), _mm_xor_si128(maskFF,_mm_load_si128(src4+5)));
958  m1G = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+6)), _mm_xor_si128(maskFF,_mm_load_si128(src4+6)));
959  m1H = _mm_and_si128(_mm_xor_si128(maskFF,_mm_load_si128(src3+7)), _mm_xor_si128(maskFF,_mm_load_si128(src4+7)));
960 
961  m1A = _mm_and_si128(m1A, m1E);
962  m1B = _mm_and_si128(m1B, m1F);
963  m1C = _mm_and_si128(m1C, m1G);
964  m1D = _mm_and_si128(m1D, m1H);
965 */
966  m1A = _mm_and_si128(m1A, _mm_load_si128(dst+4));
967  m1B = _mm_and_si128(m1B, _mm_load_si128(dst+5));
968  m1C = _mm_and_si128(m1C, _mm_load_si128(dst+6));
969  m1D = _mm_and_si128(m1D, _mm_load_si128(dst+7));
970 
971  _mm_store_si128(dst+4, m1A);
972  _mm_store_si128(dst+5, m1B);
973  _mm_store_si128(dst+6, m1C);
974  _mm_store_si128(dst+7, m1D);
975 
976  m1A = _mm_or_si128(m1A, m1B);
977  m1C = _mm_or_si128(m1C, m1D);
978  m1A = _mm_or_si128(m1A, m1C);
979 
980  bool z2 = (_mm_movemask_epi8(_mm_cmpeq_epi8(m1A, maskz)) == 0xFFFF);
981  return z1 & z2;
982 }
983 
984 
985 
986 
987 /*!
988  \brief Find first non-zero bit
989  @ingroup SSE2
990 */
991 inline
992 bool sse2_bit_find_first(const __m128i* BMRESTRICT block, unsigned off,
993  unsigned* pos) BMNOEXCEPT
994 {
995  unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
996 
997  block = (const __m128i*)((bm::word_t*)(block) + off);
998  const __m128i* block_end =
999  (const __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1000  const __m128i maskZ = _mm_setzero_si128();
1001  __m128i mA, mB;
1002  unsigned simd_lane = 0;
1003  int bsf;
1004  do
1005  {
1006  mA = _mm_load_si128(block); mB = _mm_load_si128(block+1);
1007  __m128i mOR = _mm_or_si128(mA, mB);
1008  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
1009  if (!z1) // test 2x128 lanes
1010  {
1011  z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
1012  if (!z1)
1013  {
1014  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1015  mask = ~mask; // invert to find (w != 0)
1016  BM_ASSERT(mask);
1017  bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
1018  _mm_store_si128 ((__m128i*)simd_buf, mA);
1019  unsigned widx = bsf >> 2; // (bsf / 4);
1020  unsigned w = simd_buf[widx];
1021  bsf = bm::bit_scan_forward32(w); // find first bit != 0
1022  *pos = (off * 32) +(simd_lane * 128) + (widx * 32) + bsf;
1023  return true;
1024  }
1025  unsigned mask = (_mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ)));
1026  mask = ~mask; // invert to find (w != 0)
1027  BM_ASSERT(mask);
1028  bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
1029  _mm_store_si128 ((__m128i*)simd_buf, mB);
1030  unsigned widx = bsf >> 2; // (bsf / 4);
1031  unsigned w = simd_buf[widx];
1032  bsf = bm::bit_scan_forward32(w); // find first bit != 0
1033  *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
1034  return true;
1035  }
1036  simd_lane+=2;
1037  block+=2;
1038  } while (block < block_end);
1039 
1040  return false;
1041 }
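
/*
    Usage sketch (editorial note, not part of the original header): 'off' is a
    word offset into the block (0 to scan from the beginning); on success *pos
    receives the zero-based index of the first set bit.

        static bm::word_t BM_ALIGN16 blk[bm::set_block_size] BM_ALIGN16ATTR; // zero-initialized
        blk[10] = 0x4u;                                               // bit 322
        unsigned pos;
        bool found = bm::sse2_bit_find_first((const __m128i*)blk, 0, &pos);
        // found == true, pos == 10*32 + 2 == 322
*/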
1042 
1043 /*!
1044  \brief Find first bit which is different between two bit-blocks
1045  @ingroup SSE2
1046 */
1047 inline
1048 bool sse2_bit_find_first_diff(const __m128i* BMRESTRICT block1,
1049  const __m128i* BMRESTRICT block2,
1050  unsigned* pos) BMNOEXCEPT
1051 {
1052  unsigned BM_ALIGN32 simd_buf[4] BM_ALIGN32ATTR;
1053 
1054  const __m128i* block1_end =
1055  (const __m128i*)((bm::word_t*)(block1) + bm::set_block_size);
1056  const __m128i maskZ = _mm_setzero_si128();
1057  __m128i mA, mB;
1058  unsigned simd_lane = 0;
1059  do
1060  {
1061  mA = _mm_xor_si128(_mm_load_si128(block1), _mm_load_si128(block2));
1062  mB = _mm_xor_si128(_mm_load_si128(block1+1), _mm_load_si128(block2+1));
1063  __m128i mOR = _mm_or_si128(mA, mB);
1064  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mOR, maskZ)) == 0xFFFF);
1065  if (!z1) // test 2x128 lanes
1066  {
1067  z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mA, maskZ)) == 0xFFFF);
1068  if (!z1) // test 2x128 lanes
1069  {
1070  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mA, maskZ));
1071  mask = ~mask; // invert to find (w != 0)
1072  BM_ASSERT(mask);
1073  int bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
1074  _mm_store_si128 ((__m128i*)simd_buf, mA);
1075  unsigned widx = bsf >> 2; // (bsf / 4);
1076  unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mA, widx);
1077  bsf = bm::bit_scan_forward32(w); // find first bit != 0
1078  *pos = (simd_lane * 128) + (widx * 32) + bsf;
1079  return true;
1080  }
1081  unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi32(mB, maskZ));
1082  mask = ~mask; // invert to find (w != 0)
1083  BM_ASSERT(mask);
1084  int bsf = bm::bit_scan_forward32(mask); // find first !=0 (could use lzcnt())
1085  _mm_store_si128 ((__m128i*)simd_buf, mB);
1086  unsigned widx = bsf >> 2; // (bsf / 4);
1087  unsigned w = simd_buf[widx]; // _mm_extract_epi32 (mB, widx);
1088  bsf = bm::bit_scan_forward32(w); // find first bit != 0
1089  *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
1090  return true;
1091  }
1092  simd_lane+=2;
1093  block1+=2; block2+=2;
1094  } while (block1 < block1_end);
1095  return false;
1096 }
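
/*
    Usage sketch (editorial note, not part of the original header): finds the
    first bit position at which two aligned bit-blocks differ. 'a_blk' and
    'b_blk' stand for aligned arrays of bm::set_block_size words.

        unsigned diff_pos;
        bool diff = bm::sse2_bit_find_first_diff((const __m128i*)a_blk,
                                                 (const __m128i*)b_blk,
                                                 &diff_pos);
        // diff == false when the blocks are identical
*/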
1097 
1098 /*
1099 Snippets to extract32 in SSE2:
1100 
1101 inline int get_x(const __m128i& vec){return _mm_cvtsi128_si32 (vec);}
1102 inline int get_y(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0x55));}
1103 inline int get_z(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xAA));}
1104 inline int get_w(const __m128i& vec){return _mm_cvtsi128_si32 (_mm_shuffle_epi32(vec,0xFF));}
1105 */
1106 
1107 /*!
1108  @brief block shift right by 1
1109  @ingroup SSE2
1110 */
1111 inline
1112 bool sse2_shift_r1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1113 {
1114  __m128i* block_end =
1115  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1116  __m128i m1COshft, m2COshft;
1117  __m128i mAcc = _mm_set1_epi32(0);
1118 
1119  __m128i mMask0 = _mm_set_epi32(-1,-1,-1, 0);
1120 
1121  unsigned co2;
1122  for (;block < block_end; block += 2)
1123  {
1124  __m128i m1A = _mm_load_si128(block);
1125  __m128i m2A = _mm_load_si128(block+1);
1126 
1127  __m128i m1CO = _mm_srli_epi32(m1A, 31);
1128  __m128i m2CO = _mm_srli_epi32(m2A, 31);
1129 
1130  co2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m1CO, 0xFF));
1131 
1132  m1A = _mm_slli_epi32(m1A, 1); // (block[i] << 1u)
1133  m2A = _mm_slli_epi32(m2A, 1);
1134 
1135  m1COshft = _mm_slli_si128 (m1CO, 4); // byte shift-l by 1 int32
1136  m2COshft = _mm_slli_si128 (m2CO, 4);
1137 
1138  m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[0]
1139  m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(0, 0, 0, co1)); // vec[0] = co1
1140 
1141  m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[0]
1142  m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(0, 0, 0, co2)); // vec[0] = co2
1143 
1144  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1145  m2A = _mm_or_si128(m2A, m2COshft);
1146 
1147  co1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(m2CO, 0xFF));
1148 
1149  _mm_store_si128(block, m1A);
1150  _mm_store_si128(block+1, m2A);
1151 
1152  mAcc = _mm_or_si128(mAcc, m1A);
1153  mAcc = _mm_or_si128(mAcc, m2A);
1154  }
1155  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
1156  *empty_acc = !z1;
1157  return co1;
1158 }
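
/*
    Usage sketch (editorial note, not part of the original header): shifts the
    whole 64K-bit block towards higher bit indexes by one position, with
    carry-in/carry-out so the shift can be chained across blocks. 'blk' is an
    aligned array of bm::set_block_size words.

        unsigned acc;                 // set != 0 if the shifted block is not all-zero
        unsigned carry_in = 0;
        bool carry_out = bm::sse2_shift_r1((__m128i*)blk, &acc, carry_in);
*/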
1159 
1160 /*!
1161  @brief block shift left by 1
1162  @ingroup SSE2
1163 */
1164 inline
1165 bool sse2_shift_l1(__m128i* block, unsigned* empty_acc, unsigned co1) BMNOEXCEPT
1166 {
1167  __m128i* block_end =
1168  ( __m128i*)((bm::word_t*)(block) + bm::set_block_size);
1169  __m128i mAcc = _mm_set1_epi32(0);
1170  __m128i mMask1 = _mm_set1_epi32(1);
1171  __m128i mMask0 = _mm_set_epi32(0, -1, -1, -1);
1172 
1173  unsigned co2;
1174  for (--block_end; block_end >= block; block_end -= 2)
1175  {
1176  __m128i m1A = _mm_load_si128(block_end);
1177  __m128i m2A = _mm_load_si128(block_end-1);
1178 
1179  __m128i m1CO = _mm_and_si128(m1A, mMask1);
1180  __m128i m2CO = _mm_and_si128(m2A, mMask1);
1181 
1182  co2 = _mm_cvtsi128_si32 (m1CO); // get vec[0]
1183 
1184  m1A = _mm_srli_epi32(m1A, 1); // (block[i] >> 1u)
1185  m2A = _mm_srli_epi32(m2A, 1);
1186 
1187  __m128i m1COshft = _mm_srli_si128 (m1CO, 4); // byte shift-r by 1 int32
1188  __m128i m2COshft = _mm_srli_si128 (m2CO, 4);
1189 
1190  // m1COshft = _mm_insert_epi32 (m1COshft, co1, 3);
1191  // m2COshft = _mm_insert_epi32 (m2COshft, co2, 3);
1192  m1COshft = _mm_and_si128(m1COshft, mMask0); // clear the vec[0]
1193  m1COshft = _mm_or_si128(m1COshft, _mm_set_epi32(co1, 0, 0, 0)); // vec[3] = co1
1194  m2COshft = _mm_and_si128(m2COshft, mMask0); // clear the vec[0]
1195  m2COshft = _mm_or_si128(m2COshft, _mm_set_epi32(co2, 0, 0, 0)); // vec[3] = co2
1196 
1197 
1198  m1COshft = _mm_slli_epi32(m1COshft, 31);
1199  m2COshft = _mm_slli_epi32(m2COshft, 31);
1200 
1201  m1A = _mm_or_si128(m1A, m1COshft); // block[i] |= co_flag
1202  m2A = _mm_or_si128(m2A, m2COshft);
1203 
1204  co1 = _mm_cvtsi128_si32 (m2CO); // get vec[0]
1205 
1206  _mm_store_si128(block_end, m1A);
1207  _mm_store_si128(block_end-1, m2A);
1208 
1209  mAcc = _mm_or_si128(mAcc, m1A);
1210  mAcc = _mm_or_si128(mAcc, m2A);
1211  } // for
1212 
1213  bool z1 = (_mm_movemask_epi8(_mm_cmpeq_epi8(mAcc, _mm_set1_epi32(0))) == 0xFFFF);
1214  *empty_acc = !z1; // !_mm_testz_si128(mAcc, mAcc);
1215  return co1;
1216 }
1217 
1218 
1219 
1220 inline
1221 bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
1222  const __m128i* BMRESTRICT block_end,
1223  unsigned* BMRESTRICT bit_count)
1224 {
1225  const unsigned mu1 = 0x55555555;
1226  const unsigned mu2 = 0x33333333;
1227  const unsigned mu3 = 0x0F0F0F0F;
1228  const unsigned mu4 = 0x0000003F;
1229 
1230  // Loading masks
1231  __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
1232  __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
1233  __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
1234  __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
1235  __m128i mcnt;//, ccnt;
1236  mcnt = _mm_xor_si128(m1, m1); // bit_cnt = 0
1237  //ccnt = _mm_xor_si128(m1, m1); // change_cnt = 0
1238 
1239  __m128i tmp1, tmp2;
1240 
1241  int count = (int)(block_end - block)*4; //0;//1;
1242 
1243  bm::word_t w, w0, w_prev;//, w_l;
1244  const int w_shift = sizeof(w) * 8 - 1;
1245  bool first_word = true;
1246 
1247  // first word
1248  {
1249  const bm::word_t* blk = (const bm::word_t*) block;
1250  w = w0 = blk[0];
1251  w ^= (w >> 1);
1252  count += bm::word_bitcount(w);
1253  count -= (w_prev = (w0 >> w_shift)); // negative value correction
1254  }
1255 
1256  bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
1257 
1258  do
1259  {
1260  // compute bit-count
1261  // ---------------------------------------------------------------------
1262  {
1263  __m128i b = _mm_load_si128(block);
1264 
1265  // w ^(w >> 1)
1266  tmp1 = _mm_srli_epi32(b, 1); // tmp1 = b >> 1
1267  tmp2 = _mm_xor_si128(b, tmp1); // tmp2 = tmp1 ^ b;
1268  _mm_store_si128((__m128i*)tcnt, tmp2);
1269 
1270 
1271  // compare with zero
1272  // SSE4: _mm_test_all_zero()
1273  {
1274  // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
1275  //tmp1 = _mm_srli_epi32(b, 1); // tmp1 = (b >> 1 & 0x55555555)
1276  tmp1 = _mm_and_si128(tmp1, m1);
1277  tmp2 = _mm_and_si128(b, m1); // tmp2 = (b & 0x55555555)
1278  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
1279 
1280  // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
1281  tmp1 = _mm_srli_epi32(b, 2); // (b >> 2 & 0x33333333)
1282  tmp1 = _mm_and_si128(tmp1, m2);
1283  tmp2 = _mm_and_si128(b, m2); // (b & 0x33333333)
1284  b = _mm_add_epi32(tmp1, tmp2); // b = tmp1 + tmp2
1285 
1286  // b = (b + (b >> 4)) & 0x0F0F0F0F;
1287  tmp1 = _mm_srli_epi32(b, 4); // tmp1 = b >> 4
1288  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 4)
1289  b = _mm_and_si128(b, m3); //& 0x0F0F0F0F
1290 
1291  // b = b + (b >> 8);
1292  tmp1 = _mm_srli_epi32 (b, 8); // tmp1 = b >> 8
1293  b = _mm_add_epi32(b, tmp1); // b = b + (b >> 8)
1294 
1295  // b = (b + (b >> 16)) & 0x0000003F;
1296  tmp1 = _mm_srli_epi32 (b, 16); // b >> 16
1297  b = _mm_add_epi32(b, tmp1); // b + (b >> 16)
1298  b = _mm_and_si128(b, m4); // (b >> 16) & 0x0000003F;
1299 
1300  mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
1301  }
1302 
1303  }
1304  // ---------------------------------------------------------------------
1305  {
1306  //__m128i b = _mm_load_si128(block);
1307  // TODO: SSE4...
1308  //w = _mm_extract_epi32(b, i);
1309 
1310  const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;
1311 
1312  if (first_word)
1313  {
1314  first_word = false;
1315  }
1316  else
1317  {
1318  if (0!=(w0=blk[0]))
1319  {
1320  count += bm::word_bitcount(tcnt[0]);
1321  count -= !(w_prev ^ (w0 & 1));
1322  count -= w_prev = (w0 >> w_shift);
1323  }
1324  else
1325  {
1326  count -= !w_prev; w_prev ^= w_prev;
1327  }
1328  }
1329  if (0!=(w0=blk[1]))
1330  {
1331  count += bm::word_bitcount(tcnt[1]);
1332  count -= !(w_prev ^ (w0 & 1));
1333  count -= w_prev = (w0 >> w_shift);
1334  }
1335  else
1336  {
1337  count -= !w_prev; w_prev ^= w_prev;
1338  }
1339  if (0!=(w0=blk[2]))
1340  {
1341  count += bm::word_bitcount(tcnt[2]);
1342  count -= !(w_prev ^ (w0 & 1));
1343  count -= w_prev = (w0 >> w_shift);
1344  }
1345  else
1346  {
1347  count -= !w_prev; w_prev ^= w_prev;
1348  }
1349  if (0!=(w0=blk[3]))
1350  {
1351  count += bm::word_bitcount(tcnt[3]);
1352  count -= !(w_prev ^ (w0 & 1));
1353  count -= w_prev = (w0 >> w_shift);
1354  }
1355  else
1356  {
1357  count -= !w_prev; w_prev ^= w_prev;
1358  }
1359  }
1360  } while (++block < block_end);
1361 
1362  _mm_store_si128((__m128i*)tcnt, mcnt);
1363  *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
1364 
1365  return unsigned(count);
1366 }
1367 
1368 #ifdef __GNUG__
1369 // necessary measure to silence a false warning from GCC about negative pointer arithmetic
1370 #pragma GCC diagnostic push
1371 #pragma GCC diagnostic ignored "-Warray-bounds"
1372 #endif
1373 
1374 /*!
1375 SSE2 check for one to two (variable length) 128-bit SSE lanes for gap search results (8 elements)
1376 \internal
1377 */
1378 inline
1379 unsigned sse2_gap_find(const bm::gap_word_t* BMRESTRICT pbuf,
1380  const bm::gap_word_t pos, unsigned size)
1381 {
1382  BM_ASSERT(size <= 16);
1383  BM_ASSERT(size);
1384 
1385  const unsigned unroll_factor = 8;
1386  if (size < 4) // for very short vector use conventional scan
1387  {
1388  if (pbuf[0] >= pos) { size = 0; }
1389  else if (pbuf[1] >= pos) { size = 1; }
1390  else { size = 2; BM_ASSERT(pbuf[2] >= pos); }
1391  return size;
1392  }
1393 
1394  __m128i m1, mz, maskF, maskFL;
1395 
1396  mz = _mm_setzero_si128();
1397  m1 = _mm_loadu_si128((__m128i*)(pbuf)); // load first 8 elements
1398 
1399  maskF = _mm_cmpeq_epi32(mz, mz); // set all FF
1400  maskFL = _mm_slli_si128(maskF, 4 * 2); // byte shift to make [0000 FFFF]
1401  int shiftL = (64 - (unroll_factor - size) * 16);
1402  maskFL = _mm_slli_epi64(maskFL, shiftL); // additional bit shift to [0000 00FF]
1403 
1404  m1 = _mm_andnot_si128(maskFL, m1); // m1 = (~mask) & m1
1405  m1 = _mm_or_si128(m1, maskFL);
1406 
1407  __m128i mp = _mm_set1_epi16(pos); // broadcast pos into all elements of a SIMD vector
1408  __m128i mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // unsigned m1 >= mp
1409  int mi = _mm_movemask_epi8(mge_mask); // collect flag bits
1410  if (mi)
1411  {
1412  int bsr_i= bm::bit_scan_fwd(mi) >> 1;
1413  return bsr_i; // address of first one element (target)
1414  }
1415  if (size == 8)
1416  return size;
1417 
1418  // inspect the next lane with a possible step back (to avoid over-reading the block boundary)
1419  // GCC gives a false warning for "- unroll_factor" here
1420  const bm::gap_word_t* BMRESTRICT pbuf2 = pbuf + size - unroll_factor;
1421  BM_ASSERT(pbuf2 > pbuf); // assert in place to make sure GCC warning is indeed false
1422 
1423  m1 = _mm_loadu_si128((__m128i*)(pbuf2)); // load next elements (with possible overlap)
1424  mge_mask = _mm_cmpeq_epi16(_mm_subs_epu16(mp, m1), mz); // m1 >= mp
1425  mi = _mm_movemask_epi8(mge_mask);
1426  if (mi)
1427  {
1428  int bsr_i = bm::bit_scan_fwd(mi) >> 1;
1429  return size - (unroll_factor - bsr_i);
1430  }
1431  return size;
1432 }
1433 
1434 /**
1435  Hybrid binary search, starts as binary, then switches to linear scan
1436 
1437  \param buf - GAP buffer pointer.
1438  \param pos - index of the element.
1439  \param is_set - output. GAP value (0 or 1).
1440  \return GAP index.
1441 
1442  @ingroup SSE2
1443 */
1444 inline
1445 unsigned sse2_gap_bfind(const unsigned short* BMRESTRICT buf,
1446  unsigned pos, unsigned* BMRESTRICT is_set)
1447 {
1448  unsigned start = 1;
1449  unsigned end = 1 + ((*buf) >> 3);
1450 
1451  const unsigned arr_end = end;
1452  BM_ASSERT(start != end);
1453  unsigned size = end - start;
1454 
1455  for (; size >= 64; size = end - start)
1456  {
1457  unsigned mid = (start + end) >> 1;
1458  if (buf[mid] < pos)
1459  start = mid+1;
1460  else
1461  end = mid;
1462  if (buf[mid = (start + end) >> 1] < pos)
1463  start = mid+1;
1464  else
1465  end = mid;
1466  if (buf[mid = (start + end) >> 1] < pos)
1467  start = mid+1;
1468  else
1469  end = mid;
1470  if (buf[mid = (start + end) >> 1] < pos)
1471  start = mid+1;
1472  else
1473  end = mid;
1474  } // for
1475 
1476  for (; size >= 16; size = end - start)
1477  {
1478  if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
1479  start = mid + 1;
1480  else
1481  end = mid;
1482  if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
1483  start = mid + 1;
1484  else
1485  end = mid;
1486  } // for
1487 
1488  size += (end != arr_end);
1489  start += bm::sse2_gap_find(buf + start, (bm::gap_word_t)pos, size);
1490  BM_ASSERT(buf[start] >= pos);
1491  BM_ASSERT(buf[start - 1] < pos || (start == 1));
1492 
1493  *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1494  return start;
1495 }
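
/*
    Usage sketch (editorial note, not part of the original header): a minimal
    hand-built GAP block. Per the code above, the header word stores the GAP
    length in its upper bits (read back as (*buf) >> 3) and the value of the
    first run in bit 0; it is followed by the ascending run-end positions.

        bm::gap_word_t gap_buf[] = { 3 << 3, 9, 20, 65535 };      // bits [10..20] set
        unsigned is_set;
        unsigned idx = bm::sse2_gap_bfind(gap_buf, 15, &is_set);  // is_set == 1, idx == 2
        unsigned on  = bm::sse2_gap_test(gap_buf, 5);             // on == 0 (bit 5 not set)
*/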
1496 
1497 /**
1498  Hybrid binary search, starts as binary, then switches to scan
1499  @ingroup SSE2
1500 */
1501 inline
1502 unsigned sse2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
1503 {
1504  unsigned is_set;
1505  bm::sse2_gap_bfind(buf, pos, &is_set);
1506  return is_set;
1507 }
1508 
1509 
1510 
1511 
1512 #ifdef __GNUG__
1513 #pragma GCC diagnostic pop
1514 #endif
1515 
1516 
1517 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
1518  sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1519 
1520 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
1521  sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)
1522 
1523 #define VECT_BITCOUNT(first, last) \
1524  sse2_bit_count((__m128i*) (first), (__m128i*) (last))
1525 
1526 #define VECT_BITCOUNT_AND(first, last, mask) \
1527  sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)
1528 
1529 #define VECT_BITCOUNT_OR(first, last, mask) \
1530  sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)
1531 
1532 #define VECT_BITCOUNT_XOR(first, last, mask) \
1533  sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)
1534 
1535 #define VECT_BITCOUNT_SUB(first, last, mask) \
1536  sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
1537 
1538 #define VECT_INVERT_BLOCK(first) \
1539  sse2_invert_block((__m128i*)first);
1540 
1541 #define VECT_AND_BLOCK(dst, src) \
1542  sse2_and_block((__m128i*) dst, (__m128i*) (src))
1543 
1544 #define VECT_AND_DIGEST(dst, src) \
1545  sse2_and_digest((__m128i*) dst, (const __m128i*) (src))
1546 
1547 #define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
1548  sse2_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1549 
1550 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1551  sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1552 
1553 #define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
1554  sse2_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1555 
1556 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
1557  sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1558 
1559 #define VECT_OR_BLOCK(dst, src) \
1560  sse2_or_block((__m128i*) dst, (__m128i*) (src))
1561 
1562 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
1563  sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1564 
1565 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
1566  sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))
1567 
1568 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
1569  sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))
1570 
1571 #define VECT_SUB_BLOCK(dst, src) \
1572  sse2_sub_block((__m128i*) dst, (__m128i*) (src))
1573 
1574 #define VECT_SUB_DIGEST(dst, src) \
1575  sse2_sub_digest((__m128i*) dst, (const __m128i*) (src))
1576 
1577 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
1578  sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1579 
1580 #define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
1581  sse2_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))
1582 
1583 #define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
1584  sse2_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))
1585 
1586 #define VECT_XOR_BLOCK(dst, src) \
1587  sse2_xor_block((__m128i*) dst, (__m128i*) (src))
1588 
1589 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
1590  sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))
1591 
1592 #define VECT_COPY_BLOCK(dst, src) \
1593  sse2_copy_block((__m128i*) dst, (__m128i*) (src))
1594 
1595 #define VECT_COPY_BLOCK_UNALIGN(dst, src) \
1596  sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))
1597 
1598 #define VECT_STREAM_BLOCK(dst, src) \
1599  sse2_stream_block((__m128i*) dst, (__m128i*) (src))
1600 
1601 #define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
1602  sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))
1603 
1604 #define VECT_SET_BLOCK(dst, value) \
1605  sse2_set_block((__m128i*) dst, value)
1606 
1607 #define VECT_IS_ZERO_BLOCK(dst) \
1608  sse2_is_all_zero((__m128i*) dst)
1609 
1610 #define VECT_IS_ONE_BLOCK(dst) \
1611  sse2_is_all_one((__m128i*) dst)
1612 
1613 #define VECT_IS_DIGEST_ZERO(start) \
1614  sse2_is_digest_zero((__m128i*)start)
1615 
1616 #define VECT_BLOCK_SET_DIGEST(dst, val) \
1617  sse2_block_set_digest((__m128i*)dst, val)
1618 
1619 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
1620  sse2_lower_bound_scan_u32(arr, target, from, to)
1621 
1622 #define VECT_SHIFT_R1(b, acc, co) \
1623  sse2_shift_r1((__m128i*)b, acc, co)
1624 
1625 
1626 #define VECT_BIT_FIND_FIRST(src, off, pos) \
1627  sse2_bit_find_first((__m128i*) src, off, pos)
1628 
1629 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
1630  sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)
1631 
1632 #define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
1633  sse2_bit_block_xor(t, src, src_xor, d)
1634 
1635 #define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
1636  sse2_bit_block_xor_2way(t, src_xor, d)
1637 
1638 #define VECT_GAP_BFIND(buf, pos, is_set) \
1639  sse2_gap_bfind(buf, pos, is_set)
1640 
1641 #define VECT_GAP_TEST(buf, pos) \
1642  sse2_gap_test(buf, pos)
1643 
1644 } // namespace
1645 
1646 
1647 #ifdef __GNUG__
1648 #pragma GCC diagnostic pop
1649 #endif
1650 
1651 
1652 #endif
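The VECT_* macros above are the SSE2 binding of BitMagic's SIMD dispatch layer: when this header is active, the generic block algorithms call the sse2_* kernels through these names and otherwise fall back to portable scalar loops. The sketch below illustrates that pattern for a single digest stride. It is a minimal, illustrative example and not code from the library; the 32-word stride size, the helper name and_digest_wave, and the "return true when the stride became all zero" convention are assumptions made for the sketch.

<pre>
// Illustrative only: route one digest-stride AND through VECT_AND_DIGEST
// when the SIMD layer defines it, otherwise use a scalar loop.

typedef unsigned int word_t;

const unsigned wave_size = 32;  // words per digest stride (assumed value)

inline bool and_digest_wave(word_t* dst, const word_t* src)
{
#if defined(VECT_AND_DIGEST)
    // SIMD path: the macro expands to sse2_and_digest() with __m128i casts
    return VECT_AND_DIGEST(dst, src);
#else
    // Portable fallback: dst &= *src over the stride,
    // report whether the stride is now empty
    word_t acc = 0;
    for (unsigned i = 0; i < wave_size; ++i)
    {
        dst[i] &= src[i];
        acc |= dst[i];
    }
    return (acc == 0);
#endif
}
</pre>

In the library itself this selection happens inside the generic block-operation code, which tests for the presence of each VECT_* macro at compile time, so, broadly speaking, a new SIMD target supplies its kernels plus a macro table like the one above.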