NCBI C++ ToolKit
bmavx2.h

1 #ifndef BMAVX2__H__INCLUDED__
2 #define BMAVX2__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2022 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 // some of the algorithms here are based on a modified libpopcnt library by Kim Walisch
22 // https://github.com/kimwalisch/libpopcnt/
23 //
24 /*
25  * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
26  * population count) in an array as quickly as possible using
27  * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
28  *
29  * Copyright (c) 2016 - 2017, Kim Walisch
30  * Copyright (c) 2016 - 2017, Wojciech Muła
31  *
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions are met:
36  *
37  * 1. Redistributions of source code must retain the above copyright notice, this
38  * list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright notice,
40  * this list of conditions and the following disclaimer in the documentation
41  * and/or other materials provided with the distribution.
42  *
43  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
44  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
45  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
46  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
47  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
48  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
49  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
50  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
52  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53  */
54 
55 
56 /** @defgroup AVX2 AVX2 functions
57  Processor specific optimizations for AVX2 instructions (internals)
58  @ingroup bvector
59  @internal
60  */
61 
62 
63 // Header implements processor specific intrinsics declarations for AVX2
64 // instruction set
65 //
66 #include<emmintrin.h>
67 #include<immintrin.h>
68 
69 #include "bmdef.h"
70 #include "bmbmi2.h"
71 #include "bmutil.h"
72 
73 namespace bm
74 {
75 
76 // debugging utils
77 #if 0
78 inline
79 void avx2_print256_u32(const char* prefix, const __m256i & value)
80 {
81  const size_t n = sizeof(__m256i) / sizeof(unsigned);
82  unsigned buffer[n];
83  _mm256_storeu_si256((__m256i*)buffer, value);
84  std::cout << prefix << " [ ";
85  for (int i = n-1; 1; --i)
86  {
87  std::cout << std::hex << buffer[i] << " ";
88  if (i == 0)
89  break;
90  }
91  std::cout << "]" << std::endl;
92 }
93 
94 inline
95 void avx2_print256_u16(const char* prefix, const __m256i & value)
96 {
97  const size_t n = sizeof(__m256i) / sizeof(unsigned short);
98  unsigned short buffer[n];
99  _mm256_storeu_si256((__m256i*)buffer, value);
100  std::cout << prefix << " [ ";
101  for (int i = n-1; 1; --i)
102  {
103  std::cout << buffer[i] << " ";
104  if (i == 0)
105  break;
106  }
107  std::cout << "]" << std::endl;
108 }
109 #endif
110 
111 #ifdef __GNUG__
112 #pragma GCC diagnostic push
113 #pragma GCC diagnostic ignored "-Wconversion"
114 #endif
115 
116 
117 #define BM_CSA256(h, l, a, b, c) \
118 { \
119  __m256i u = _mm256_xor_si256(a, b); \
120  h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
121  l = _mm256_xor_si256(u, c); \
122 }
123 
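BM_CSA256 is a carry-save adder over three 256-bit inputs: l receives the per-bit sum (XOR of the three inputs) and h the per-bit carry (set where at least two inputs are set). A minimal scalar sketch of the same step, shown here only for illustration and not part of this header:

    // illustrative scalar carry-save adder over three 64-bit words
    inline void csa64(bm::id64_t& h, bm::id64_t& l,
                      bm::id64_t a, bm::id64_t b, bm::id64_t c)
    {
        bm::id64_t u = a ^ b;
        h = (a & b) | (u & c); // carry: at least two of {a, b, c} are set
        l = u ^ c;             // sum: odd parity of {a, b, c}
    }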
124 #define BM_AVX2_BIT_COUNT(ret, v) \
125 { \
126  __m256i lo = _mm256_and_si256(v, low_mask); \
127  __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
128  __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
129  __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
130  ret = _mm256_sad_epu8(cnt1, cnt2); \
131 }
132 
133 #define BM_AVX2_DECL_LOOKUP1 \
134  __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
135  4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
136 #define BM_AVX2_DECL_LOOKUP2 \
137 __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
138  4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
139 
140 #define BM_AVX2_POPCNT_PROLOG \
141  BM_AVX2_DECL_LOOKUP1 \
142  BM_AVX2_DECL_LOOKUP2 \
143  __m256i low_mask = _mm256_set1_epi8(0x0f); \
144  __m256i bc;
145 
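BM_AVX2_BIT_COUNT uses the PSHUFB nibble-lookup trick: lookup1[x] = popcount(x) + 4 for the low nibble and lookup2[x] = 4 - popcount(x) for the high nibble, so _mm256_sad_epu8 sums the absolute byte differences, which equal the byte popcounts, into one counter per 64-bit lane. A small scalar check of that identity (illustrative only; the helper name is an assumption):

    inline void check_nibble_lookup_identity()
    {
        for (unsigned b = 0; b < 256; ++b)
        {
            unsigned cnt1 = (unsigned)_mm_popcnt_u32(b & 0x0Fu) + 4; // lookup1[low nibble]
            unsigned cnt2 = 4 - (unsigned)_mm_popcnt_u32(b >> 4);    // lookup2[high nibble]
            // cnt1 >= cnt2 and their difference is popcount(b),
            // which _mm256_sad_epu8 then accumulates per 8-byte group
            BM_ASSERT(cnt1 - cnt2 == (unsigned)_mm_popcnt_u32(b));
        }
    }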
146 /*!
147  @brief AVX2 Harley-Seal popcount
148  The algorithm is based on the paper "Faster Population Counts
149  using AVX2 Instructions" by Daniel Lemire, Nathan Kurz and
150  Wojciech Mula (23 Nov 2016).
151  @see https://arxiv.org/abs/1611.07612
152 
153  @ingroup AVX2
154 */
155 inline
156 bm::id_t avx2_bit_count(const __m256i* BMRESTRICT block,
157  const __m256i* BMRESTRICT block_end)
158 {
159  __m256i cnt = _mm256_setzero_si256();
160  __m256i ones = _mm256_setzero_si256();
161  __m256i twos = _mm256_setzero_si256();
162  __m256i fours = _mm256_setzero_si256();
163  __m256i eights = _mm256_setzero_si256();
164  __m256i sixteens = _mm256_setzero_si256();
165  __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
166  __m256i b, c;
167 
168  BM_AVX2_POPCNT_PROLOG
169  bm::id64_t* cnt64;
170 
171  do
172  {
173  b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
174  BM_CSA256(twosA, ones, ones, b, c);
175 
176  b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
177  BM_CSA256(twosB, ones, ones, b, c);
178  BM_CSA256(foursA, twos, twos, twosA, twosB);
179 
180  b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
181  BM_CSA256(twosA, ones, ones, b, c);
182 
183  b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
184  BM_CSA256(twosB, ones, ones, b, c);
185  BM_CSA256(foursB, twos, twos, twosA, twosB);
186  BM_CSA256(eightsA, fours, fours, foursA, foursB);
187 
188  b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
189  BM_CSA256(twosA, ones, ones, b, c);
190 
191  b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
192  BM_CSA256(twosB, ones, ones, b, c);
193  BM_CSA256(foursA, twos, twos, twosA, twosB);
194 
195  b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
196  BM_CSA256(twosA, ones, ones, b, c);
197 
198  b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
199  BM_CSA256(twosB, ones, ones, b, c);
200  BM_CSA256(foursB, twos, twos, twosA, twosB);
201  BM_CSA256(eightsB, fours, fours, foursA, foursB);
202  BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
203 
204  BM_AVX2_BIT_COUNT(bc, sixteens);
205  cnt = _mm256_add_epi64(cnt, bc);
206 
207  block += 16;
208  } while (block < block_end);
209 
210  cnt = _mm256_slli_epi64(cnt, 4);
211  BM_AVX2_BIT_COUNT(bc, eights)
212  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
213  BM_AVX2_BIT_COUNT(bc, fours);
214  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
215  BM_AVX2_BIT_COUNT(bc, twos);
216  cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
217  BM_AVX2_BIT_COUNT(bc, ones);
218  cnt = _mm256_add_epi64(cnt, bc);
219 
220  cnt64 = (bm::id64_t*) &cnt;
221 
222  return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
223 }
224 
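The loop above consumes sixteen 256-bit vectors (512 bytes) per iteration, so block_end - block must be a multiple of 16; for a standard BitMagic bit-block this always holds. A hedged usage sketch (the wrapper name and buffer are illustrative, not from this header):

    inline bm::id_t count_one_block_demo()
    {
        // one aligned bit-block of bm::set_block_size 32-bit words, all zero
        alignas(32) static bm::word_t blk[bm::set_block_size] = {0};
        const __m256i* first = (const __m256i*)blk;
        const __m256i* last  = (const __m256i*)(blk + bm::set_block_size);
        return bm::avx2_bit_count(first, last); // 0 for an all-zero block
    }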
225 /*!
226  @brief Calculate population count based on digest
227 
228  @return popcnt
229  @ingroup AVX2
230 */
231 inline
232 bm::id_t avx2_bit_block_count(const bm::word_t* const block,
233  bm::id64_t digest)
234 {
235  bm::id_t count = 0;
236  bm::id64_t* cnt64;
237  BM_AVX2_POPCNT_PROLOG
238  __m256i cnt = _mm256_setzero_si256();
239  while (digest)
240  {
241  bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
242 
243  unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
244  unsigned off = wave * bm::set_block_digest_wave_size;
245 
246  const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];
247 
248  __m256i m1A, m1B, m1C, m1D;
249  m1A = _mm256_load_si256(wave_src);
250  m1B = _mm256_load_si256(wave_src+1);
251  if (!_mm256_testz_si256(m1A, m1A))
252  {
253  BM_AVX2_BIT_COUNT(bc, m1A)
254  cnt = _mm256_add_epi64(cnt, bc);
255  }
256  if (!_mm256_testz_si256(m1B, m1B))
257  {
258  BM_AVX2_BIT_COUNT(bc, m1B)
259  cnt = _mm256_add_epi64(cnt, bc);
260  }
261 
262  m1C = _mm256_load_si256(wave_src+2);
263  m1D = _mm256_load_si256(wave_src+3);
264  if (!_mm256_testz_si256(m1C, m1C))
265  {
266  BM_AVX2_BIT_COUNT(bc, m1C)
267  cnt = _mm256_add_epi64(cnt, bc);
268  }
269  if (!_mm256_testz_si256(m1D, m1D))
270  {
271  BM_AVX2_BIT_COUNT(bc, m1D)
272  cnt = _mm256_add_epi64(cnt, bc);
273  }
274 
275  digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
276  } // while
277  cnt64 = (bm::id64_t*)&cnt;
278  count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
279  return count;
280 
281 }
282 
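Each set bit of the digest selects one non-empty wave of bm::set_block_digest_wave_size words: BLSI (d & -d) isolates the lowest set bit, POPCNT of t - 1 converts it to a wave index, and BLSR (d &= d - 1) clears it. The same iteration pattern in plain scalar form (illustrative sketch only; the function name is an assumption):

    // visit every wave index flagged in a 64-bit digest
    inline void for_each_digest_wave(bm::id64_t d)
    {
        while (d)
        {
            bm::id64_t t = d & (0ull - d);                          // lowest set bit (BLSI)
            unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);        // its index
            unsigned off  = wave * bm::set_block_digest_wave_size;  // word offset of the wave
            (void)off;                                              // ... process the wave ...
            d &= (d - 1);                                           // clear lowest set bit (BLSR)
        }
    }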
283 
284 
285 /*!
286  @brief AND bit count for two aligned bit-blocks
287  @ingroup AVX2
288 */
289 inline
290 bm::id_t avx2_bit_count_and(const __m256i* BMRESTRICT block,
291  const __m256i* BMRESTRICT block_end,
292  const __m256i* BMRESTRICT mask_block)
293 {
294  bm::id64_t* cnt64;
295  BM_AVX2_POPCNT_PROLOG
296  __m256i cnt = _mm256_setzero_si256();
297  __m256i ymm0, ymm1;
298 
299 
300  do
301  {
302  ymm0 = _mm256_load_si256(block);
303  ymm1 = _mm256_load_si256(mask_block);
304  ymm0 = _mm256_and_si256(ymm0, ymm1);
305  ++block; ++mask_block;
306  BM_AVX2_BIT_COUNT(bc, ymm0)
307  cnt = _mm256_add_epi64(cnt, bc);
308 
309  ymm0 = _mm256_load_si256(block);
310  ymm1 = _mm256_load_si256(mask_block);
311  ymm0 = _mm256_and_si256(ymm0, ymm1);
312  ++block; ++mask_block;
313  BM_AVX2_BIT_COUNT(bc, ymm0)
314  cnt = _mm256_add_epi64(cnt, bc);
315 
316  ymm0 = _mm256_load_si256(block);
317  ymm1 = _mm256_load_si256(mask_block);
318  ymm0 = _mm256_and_si256(ymm0, ymm1);
319  ++block; ++mask_block;
320  BM_AVX2_BIT_COUNT(bc, ymm0)
321  cnt = _mm256_add_epi64(cnt, bc);
322 
323  ymm0 = _mm256_load_si256(block);
324  ymm1 = _mm256_load_si256(mask_block);
325  ymm0 = _mm256_and_si256(ymm0, ymm1);
326  ++block; ++mask_block;
327  BM_AVX2_BIT_COUNT(bc, ymm0)
328  cnt = _mm256_add_epi64(cnt, bc);
329 
330  } while (block < block_end);
331 
332  cnt64 = (bm::id64_t*)&cnt;
333  return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
334 }
335 
336 inline
337 bm::id_t avx2_bit_count_or(const __m256i* BMRESTRICT block,
338  const __m256i* BMRESTRICT block_end,
339  const __m256i* BMRESTRICT mask_block)
340 {
341  bm::id64_t* cnt64;
342  BM_AVX2_POPCNT_PROLOG
343  __m256i cnt = _mm256_setzero_si256();
344  do
345  {
346  __m256i tmp0 = _mm256_load_si256(block);
347  __m256i tmp1 = _mm256_load_si256(mask_block);
348 
349  tmp0 = _mm256_or_si256(tmp0, tmp1);
350 
351  BM_AVX2_BIT_COUNT(bc, tmp0)
352  cnt = _mm256_add_epi64(cnt, bc);
353 
354  ++block; ++mask_block;
355 
356  } while (block < block_end);
357 
358  cnt64 = (bm::id64_t*)&cnt;
359  return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
360 }
361 
362 
363 /*!
364  @brief XOR bit count for two aligned bit-blocks
365  @ingroup AVX2
366 */
367 inline
368 bm::id_t avx2_bit_count_xor(const __m256i* BMRESTRICT block,
369  const __m256i* BMRESTRICT block_end,
370  const __m256i* BMRESTRICT mask_block)
371 {
372  bm::id64_t* cnt64;
373  BM_AVX2_POPCNT_PROLOG
374  __m256i cnt = _mm256_setzero_si256();
375  __m256i mA, mB, mC, mD;
376  do
377  {
378  mA = _mm256_xor_si256(_mm256_load_si256(block+0),
379  _mm256_load_si256(mask_block+0));
380  BM_AVX2_BIT_COUNT(bc, mA)
381  cnt = _mm256_add_epi64(cnt, bc);
382 
383  mB = _mm256_xor_si256(_mm256_load_si256(block+1),
384  _mm256_load_si256(mask_block+1));
385  BM_AVX2_BIT_COUNT(bc, mB);
386  cnt = _mm256_add_epi64(cnt, bc);
387 
388  mC = _mm256_xor_si256(_mm256_load_si256(block+2),
389  _mm256_load_si256(mask_block+2));
390  BM_AVX2_BIT_COUNT(bc, mC);
391  cnt = _mm256_add_epi64(cnt, bc);
392 
393  mD = _mm256_xor_si256(_mm256_load_si256(block+3),
394  _mm256_load_si256(mask_block+3));
395  BM_AVX2_BIT_COUNT(bc, mD);
396  cnt = _mm256_add_epi64(cnt, bc);
397 
398  block += 4; mask_block += 4;
399 
400  } while (block < block_end);
401 
402  cnt64 = (bm::id64_t*)&cnt;
403  return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
404 }
405 
406 
407 
408 /*!
409  @brief AND NOT bit count for two aligned bit-blocks
410  @ingroup AVX2
411 */
412 inline
413 bm::id_t avx2_bit_count_sub(const __m256i* BMRESTRICT block,
414  const __m256i* BMRESTRICT block_end,
415  const __m256i* BMRESTRICT mask_block)
416 {
417  bm::id64_t* cnt64;
418  BM_AVX2_POPCNT_PROLOG
419  __m256i cnt = _mm256_setzero_si256();
420  do
421  {
422  __m256i tmp0 = _mm256_load_si256(block);
423  __m256i tmp1 = _mm256_load_si256(mask_block);
424 
425  tmp0 = _mm256_andnot_si256(tmp1, tmp0);
426 
427  BM_AVX2_BIT_COUNT(bc, tmp0)
428  cnt = _mm256_add_epi64(cnt, bc);
429 
430  ++block; ++mask_block;
431 
432  } while (block < block_end);
433 
434  cnt64 = (bm::id64_t*)&cnt;
435  return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
436 }
437 
438 
439 
440 /*!
441  @brief XOR array elements to specified mask
442  *dst = *src ^ mask
443 
444  @ingroup AVX2
445 */
446 inline
447 void avx2_xor_arr_2_mask(__m256i* BMRESTRICT dst,
448  const __m256i* BMRESTRICT src,
449  const __m256i* BMRESTRICT src_end,
450  bm::word_t mask)
451 {
452  __m256i yM = _mm256_set1_epi32(int(mask));
453  do
454  {
455  _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM)); // ymm1 = ymm1 ^ mask
456  _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
457  _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
458  _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
459 
460  dst += 4; src += 4;
461  } while (src < src_end);
462 }
463 
464 
465 /*!
466  @brief Invert array elements and AND them with the specified mask
467  *dst = ~*src & mask
468 
469  @ingroup AVX2
470 */
471 inline
472 void avx2_andnot_arr_2_mask(__m256i* BMRESTRICT dst,
473  const __m256i* BMRESTRICT src,
474  const __m256i* BMRESTRICT src_end,
475  bm::word_t mask)
476 {
477  __m256i yM = _mm256_set1_epi32(int(mask));
478  do
479  {
480  _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM)); // ymm1 = (~ymm1) & ymm2
481  _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
482  _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
483  _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
484 
485  dst += 4; src += 4;
486  } while (src < src_end);
487 }
488 
489 /*!
490  @brief AND array elements against another array
491  *dst &= *src
492  @return 0 if destination does not have any bits
493  @ingroup AVX2
494 */
495 inline
496 unsigned avx2_and_block(__m256i* BMRESTRICT dst,
497  const __m256i* BMRESTRICT src)
498 {
499  __m256i m1A, m1B, m1C, m1D;
500  __m256i accA, accB, accC, accD;
501 
502  const __m256i* BMRESTRICT src_end =
503  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
504 
505  accA = accB = accC = accD = _mm256_setzero_si256();
506 
507  do
508  {
509  m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
510  m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
511  m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
512  m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
513 
514  _mm256_store_si256(dst+0, m1A);
515  _mm256_store_si256(dst+1, m1B);
516  _mm256_store_si256(dst+2, m1C);
517  _mm256_store_si256(dst+3, m1D);
518 
519  accA = _mm256_or_si256(accA, m1A);
520  accB = _mm256_or_si256(accB, m1B);
521  accC = _mm256_or_si256(accC, m1C);
522  accD = _mm256_or_si256(accD, m1D);
523 
524  src += 4; dst += 4;
525 
526  } while (src < src_end);
527 
528  accA = _mm256_or_si256(accA, accB); // A = A | B
529  accC = _mm256_or_si256(accC, accD); // C = C | D
530  accA = _mm256_or_si256(accA, accC); // A = A | C
531 
532  return !_mm256_testz_si256(accA, accA);
533 }
534 
535 /*!
536  @brief AND block digest stride
537  *dst &= *src
538 
539  @return true if stride is all zero
540  @ingroup AVX2
541 */
542 inline
543 bool avx2_and_digest(__m256i* BMRESTRICT dst,
544  const __m256i* BMRESTRICT src)
545 {
546  __m256i m1A, m1B, m1C, m1D;
547 
548  m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
549  m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
550  m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
551  m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
552 
553  _mm256_store_si256(dst+0, m1A);
554  _mm256_store_si256(dst+1, m1B);
555  _mm256_store_si256(dst+2, m1C);
556  _mm256_store_si256(dst+3, m1D);
557 
558  m1A = _mm256_or_si256(m1A, m1B);
559  m1C = _mm256_or_si256(m1C, m1D);
560  m1A = _mm256_or_si256(m1A, m1C);
561 
562  return _mm256_testz_si256(m1A, m1A);
563 }
564 
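A digest stride is exactly four 256-bit vectors (128 bytes, i.e. one bm::set_block_digest_wave_size span of words), which is why every *_digest kernel touches dst+0 .. dst+3. A hypothetical caller-side sketch (the names and_one_stride, dst_blk, src_blk, digest, wave are assumptions, not from this header):

    // AND one 128-byte stride in place and drop its digest bit when it empties
    inline bm::id64_t and_one_stride(bm::word_t* dst_blk, const bm::word_t* src_blk,
                                     bm::id64_t digest, unsigned wave)
    {
        unsigned off = wave * bm::set_block_digest_wave_size; // word offset of the stride
        bool all_zero = bm::avx2_and_digest((__m256i*)(dst_blk + off),
                                            (const __m256i*)(src_blk + off));
        if (all_zero)
            digest &= ~(1ull << wave);  // stride emptied: clear its digest bit
        return digest;
    }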
565 /*!
566  @brief AND block digest stride 2 way
567  *dst = *src1 & *src2
568 
569  @return true if stride is all zero
570  @ingroup AVX2
571 */
572 inline
573 bool avx2_and_digest_2way(__m256i* BMRESTRICT dst,
574  const __m256i* BMRESTRICT src1,
575  const __m256i* BMRESTRICT src2)
576 {
577  __m256i m1A, m1B, m1C, m1D;
578 
579  m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
580  m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
581  m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
582  m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
583 
584  _mm256_store_si256(dst+0, m1A);
585  _mm256_store_si256(dst+1, m1B);
586  _mm256_store_si256(dst+2, m1C);
587  _mm256_store_si256(dst+3, m1D);
588 
589  m1A = _mm256_or_si256(m1A, m1B);
590  m1C = _mm256_or_si256(m1C, m1D);
591  m1A = _mm256_or_si256(m1A, m1C);
592 
593  return _mm256_testz_si256(m1A, m1A);
594 }
595 
596 /*!
597  @brief AND-OR block digest stride 2 way
598  *dst |= *src1 & *src2
599 
600  @return true if stride is all zero
601  @ingroup AVX2
602 */
603 inline
604 bool avx2_and_or_digest_2way(__m256i* BMRESTRICT dst,
605  const __m256i* BMRESTRICT src1,
606  const __m256i* BMRESTRICT src2)
607 {
608  const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
609 
610  __m256i m1A, m1B, m1C, m1D;
611  __m256i mACC1;
612  __m256i mSA, mSB, mSC, mSD;
613 
614 
615  mSA = _mm256_load_si256(dst+0);
616  mSB = _mm256_load_si256(dst+1);
617  mACC1 = _mm256_and_si256(mSA, mSB);
618 
619  mSC = _mm256_load_si256(dst+2);
620  mSD = _mm256_load_si256(dst+3);
621 
622  mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
623 
624  mACC1 = _mm256_xor_si256(mACC1, maskF);
625  if (_mm256_testz_si256(mACC1, mACC1)) // whole wave is saturated 1111s already
626  return false;
627 
628 
629  m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
630  m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
631  m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
632  m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
633 
634  mACC1 =
635  _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
636  bool all_z = _mm256_testz_si256(mACC1, mACC1);
637  if (all_z)
638  return all_z;
639 
640  m1A = _mm256_or_si256(mSA, m1A);
641  m1B = _mm256_or_si256(mSB, m1B);
642  m1C = _mm256_or_si256(mSC, m1C);
643  m1D = _mm256_or_si256(mSD, m1D);
644 
645  _mm256_store_si256(dst+0, m1A);
646  _mm256_store_si256(dst+1, m1B);
647  _mm256_store_si256(dst+2, m1C);
648  _mm256_store_si256(dst+3, m1D);
649 
650  return all_z;
651 }
652 
653 
654 /*!
655  @brief AND block digest stride (5-way)
656  @ingroup AVX2
657 */
658 inline
659 bool avx2_and_digest_5way(__m256i* BMRESTRICT dst,
660  const __m256i* BMRESTRICT src1,
661  const __m256i* BMRESTRICT src2,
662  const __m256i* BMRESTRICT src3,
663  const __m256i* BMRESTRICT src4)
664 {
665  __m256i m1A, m1B, m1C, m1D;
666  __m256i m1E, m1F, m1G, m1H;
667 
668  {
669  __m256i s1_0, s2_0, s1_1, s2_1;
670 
671  s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
672  s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
673  m1A = _mm256_and_si256(s1_0, s2_0);
674  m1B = _mm256_and_si256(s1_1, s2_1);
675  s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
676  s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
677  m1C = _mm256_and_si256(s1_0, s2_0);
678  m1D = _mm256_and_si256(s1_1, s2_1);
679  }
680  {
681  __m256i s3_0, s4_0, s3_1, s4_1;
682 
683  s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
684  s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
685  m1E = _mm256_and_si256(s3_0, s4_0);
686  m1F = _mm256_and_si256(s3_1, s4_1);
687 
688  m1A = _mm256_and_si256(m1A, m1E);
689  m1B = _mm256_and_si256(m1B, m1F);
690 
691  s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
692  s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
693  m1G = _mm256_and_si256(s3_0, s4_0);
694  m1H = _mm256_and_si256(s3_1, s4_1);
695  }
696  {
697  __m256i dst0, dst1;
698  dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
699 
700  m1C = _mm256_and_si256(m1C, m1G);
701  m1D = _mm256_and_si256(m1D, m1H);
702  m1A = _mm256_and_si256(m1A, dst0);
703  m1B = _mm256_and_si256(m1B, dst1);
704 
705  dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
706 
707  m1C = _mm256_and_si256(m1C, dst0);
708  m1D = _mm256_and_si256(m1D, dst1);
709  }
710  _mm256_store_si256(dst + 0, m1A);
711  _mm256_store_si256(dst + 1, m1B);
712  _mm256_store_si256(dst + 2, m1C);
713  _mm256_store_si256(dst + 3, m1D);
714 
715  m1A = _mm256_or_si256(m1A, m1B);
716  m1C = _mm256_or_si256(m1C, m1D);
717  m1A = _mm256_or_si256(m1A, m1C);
718 
719  return _mm256_testz_si256(m1A, m1A);
720 }
721 
722 /*!
723  @brief AND block digest stride (3-way)
724  @ingroup AVX2
725 */
726 inline
727 bool avx2_and_digest_3way(__m256i* BMRESTRICT dst,
728  const __m256i* BMRESTRICT src1,
729  const __m256i* BMRESTRICT src2)
730 {
731  __m256i m1A, m1B, m1C, m1D;
732 
733  {
734  __m256i s1_0, s2_0, s1_1, s2_1;
735 
736  s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
737  s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
738  m1A = _mm256_and_si256(s1_0, s2_0);
739  m1B = _mm256_and_si256(s1_1, s2_1);
740  s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
741  s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
742  m1C = _mm256_and_si256(s1_0, s2_0);
743  m1D = _mm256_and_si256(s1_1, s2_1);
744  }
745  {
746  __m256i dst0, dst1;
747  dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
748 
749  m1A = _mm256_and_si256(m1A, dst0);
750  m1B = _mm256_and_si256(m1B, dst1);
751 
752  dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
753 
754  m1C = _mm256_and_si256(m1C, dst0);
755  m1D = _mm256_and_si256(m1D, dst1);
756  }
757  _mm256_store_si256(dst + 0, m1A);
758  _mm256_store_si256(dst + 1, m1B);
759  _mm256_store_si256(dst + 2, m1C);
760  _mm256_store_si256(dst + 3, m1D);
761 
762  m1A = _mm256_or_si256(m1A, m1B);
763  m1C = _mm256_or_si256(m1C, m1D);
764  m1A = _mm256_or_si256(m1A, m1C);
765 
766  return _mm256_testz_si256(m1A, m1A);
767 }
768 
769 
770 /*!
771  @brief AND array elements against another array (unaligned)
772  *dst &= *src
773  @return 0 if destination does not have any bits
774  @ingroup AVX2
775 */
776 inline
777 unsigned avx2_and_arr_unal(__m256i* BMRESTRICT dst,
778  const __m256i* BMRESTRICT src,
779  const __m256i* BMRESTRICT src_end)
780 {
781  __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
782  __m256i accA, accB, accC, accD;
783 
784  accA = _mm256_setzero_si256();
785  accB = _mm256_setzero_si256();
786  accC = _mm256_setzero_si256();
787  accD = _mm256_setzero_si256();
788 
789  do
790  {
791  m1A = _mm256_loadu_si256(src+0);
792  m2A = _mm256_load_si256(dst+0);
793  m1A = _mm256_and_si256(m1A, m2A);
794  _mm256_store_si256(dst+0, m1A);
795  accA = _mm256_or_si256(accA, m1A);
796 
797  m1B = _mm256_loadu_si256(src+1);
798  m2B = _mm256_load_si256(dst+1);
799  m1B = _mm256_and_si256(m1B, m2B);
800  _mm256_store_si256(dst+1, m1B);
801  accB = _mm256_or_si256(accB, m1B);
802 
803  m1C = _mm256_loadu_si256(src+2);
804  m2C = _mm256_load_si256(dst+2);
805  m1C = _mm256_and_si256(m1C, m2C);
806  _mm256_store_si256(dst+2, m1C);
807  accC = _mm256_or_si256(accC, m1C);
808 
809  m1D = _mm256_loadu_si256(src+3);
810  m2D = _mm256_load_si256(dst+3);
811  m1D = _mm256_and_si256(m1D, m2D);
812  _mm256_store_si256(dst+3, m1D);
813  accD = _mm256_or_si256(accD, m1D);
814 
815  src += 4; dst += 4;
816 
817  } while (src < src_end);
818 
819  accA = _mm256_or_si256(accA, accB); // A = A | B
820  accC = _mm256_or_si256(accC, accD); // C = C | D
821  accA = _mm256_or_si256(accA, accC); // A = A | C
822 
823  return !_mm256_testz_si256(accA, accA);
824 }
825 
826 
827 /*!
828  @brief OR array elements against another array
829  *dst |= *src
830  @return true if all bits are 1
831 
832  @ingroup AVX2
833 */
834 inline
835 bool avx2_or_block(__m256i* BMRESTRICT dst,
836  const __m256i* BMRESTRICT src)
837 {
838  __m256i m1A, m1B, m1C, m1D;
839 
840  __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
841  __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
842 
843  __m256i* BMRESTRICT dst2 =
844  (__m256i*)((bm::word_t*)(dst) + bm::set_block_size/2);
845  const __m256i* BMRESTRICT src2 =
846  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size/2);
847  const __m256i* BMRESTRICT src_end =
848  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
849  do
850  {
851  m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
852  m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
853  mAccF0 = _mm256_and_si256(mAccF0, m1A);
854  mAccF0 = _mm256_and_si256(mAccF0, m1B);
855 
856  _mm256_stream_si256(dst, m1A);
857  _mm256_stream_si256(dst+1, m1B);
858 
859  src += 2; dst += 2;
860 
861  m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
862  m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
863  mAccF1 = _mm256_and_si256(mAccF1, m1C);
864  mAccF1 = _mm256_and_si256(mAccF1, m1D);
865 
866  _mm256_stream_si256(dst2, m1C);
867  _mm256_stream_si256(dst2+1, m1D);
868 
869  src2 += 2; dst2 += 2;
870  } while (src2 < src_end);
871 
872  __m256i maskF = _mm256_set1_epi32(~0u);
873  mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
874  __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
875  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
876  return (maskA == ~0u);
877 }
878 
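The OR kernels detect an all-ones result by AND-accumulating every stored vector and then comparing it byte-wise against 0xFF; _mm256_movemask_epi8 turns the comparison into a 32-bit mask that must equal ~0u. The same check on a single vector, as an isolated illustrative helper (the name is an assumption):

    inline bool all_bits_set_256(__m256i v)
    {
        const __m256i maskF = _mm256_set1_epi32(~0u);      // all ones
        __m256i eq = _mm256_cmpeq_epi8(v, maskF);          // 0xFF where bytes match
        return unsigned(_mm256_movemask_epi8(eq)) == ~0u;  // all 32 bytes matched
    }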
879 
880 /*!
881  @brief OR array elements against another unaligned array
882  *dst |= *src
883  @return true if all bits are 1
884 
885  @ingroup AVX2
886 */
887 inline
888 bool avx2_or_arr_unal(__m256i* BMRESTRICT dst,
889  const __m256i* BMRESTRICT src,
890  const __m256i* BMRESTRICT src_end)
891 {
892  __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
893  __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
894  __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
895  do
896  {
897  m1A = _mm256_loadu_si256(src+0);
898  m2A = _mm256_load_si256(dst+0);
899  m1A = _mm256_or_si256(m1A, m2A);
900  _mm256_store_si256(dst+0, m1A);
901 
902  m1B = _mm256_loadu_si256(src+1);
903  m2B = _mm256_load_si256(dst+1);
904  m1B = _mm256_or_si256(m1B, m2B);
905  _mm256_store_si256(dst+1, m1B);
906 
907  m1C = _mm256_loadu_si256(src+2);
908  m2C = _mm256_load_si256(dst+2);
909  m1C = _mm256_or_si256(m1C, m2C);
910  _mm256_store_si256(dst+2, m1C);
911 
912  m1D = _mm256_loadu_si256(src+3);
913  m2D = _mm256_load_si256(dst+3);
914  m1D = _mm256_or_si256(m1D, m2D);
915  _mm256_store_si256(dst+3, m1D);
916 
917  mAccF1 = _mm256_and_si256(mAccF1, m1C);
918  mAccF1 = _mm256_and_si256(mAccF1, m1D);
919  mAccF0 = _mm256_and_si256(mAccF0, m1A);
920  mAccF0 = _mm256_and_si256(mAccF0, m1B);
921 
922  src += 4; dst += 4;
923 
924  } while (src < src_end);
925 
926  __m256i maskF = _mm256_set1_epi32(~0u);
927  mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
928  __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
929  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
930  return (maskA == ~0u);
931 }
932 
933 /*!
934  @brief OR 2 arrays and copy to the destination
935  *dst = *src1 | *src2
936  @return true if all bits are 1
937 
938  @ingroup AVX2
939 */
940 inline
941 bool avx2_or_block_2way(__m256i* BMRESTRICT dst,
942  const __m256i* BMRESTRICT src1,
943  const __m256i* BMRESTRICT src2)
944 {
945  __m256i m1A, m1B, m1C, m1D;
946  __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
947  __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
948  const __m256i* BMRESTRICT src_end1 =
949  (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
950 
951  do
952  {
953  m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
954  m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
955  m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
956  m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
957 
958  _mm256_store_si256(dst+0, m1A);
959  _mm256_store_si256(dst+1, m1B);
960  _mm256_store_si256(dst+2, m1C);
961  _mm256_store_si256(dst+3, m1D);
962 
963  mAccF1 = _mm256_and_si256(mAccF1, m1C);
964  mAccF1 = _mm256_and_si256(mAccF1, m1D);
965  mAccF0 = _mm256_and_si256(mAccF0, m1A);
966  mAccF0 = _mm256_and_si256(mAccF0, m1B);
967 
968  src1 += 4; src2 += 4; dst += 4;
969 
970  } while (src1 < src_end1);
971 
972  __m256i maskF = _mm256_set1_epi32(~0u);
973  mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
974  __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
975  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
976  return (maskA == ~0u);
977 }
978 
979 /*!
980  @brief OR array elements against another 2 arrays
981  *dst |= *src1 | *src2
982  @return true if all bits are 1
983 
984  @ingroup AVX2
985 */
986 inline
987 bool avx2_or_block_3way(__m256i* BMRESTRICT dst,
988  const __m256i* BMRESTRICT src1,
989  const __m256i* BMRESTRICT src2)
990 {
991  __m256i m1A, m1B, m1C, m1D;
992  __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
993  __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
994  const __m256i* BMRESTRICT src_end1 =
995  (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
996 
997  do
998  {
999  m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1000  m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1001  m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1002  m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1003 
1004  m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1005  m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1006  m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1007  m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1008 
1009  _mm256_store_si256(dst+0, m1A);
1010  _mm256_store_si256(dst+1, m1B);
1011  _mm256_store_si256(dst+2, m1C);
1012  _mm256_store_si256(dst+3, m1D);
1013 
1014  mAccF1 = _mm256_and_si256(mAccF1, m1C);
1015  mAccF1 = _mm256_and_si256(mAccF1, m1D);
1016  mAccF0 = _mm256_and_si256(mAccF0, m1A);
1017  mAccF0 = _mm256_and_si256(mAccF0, m1B);
1018 
1019  src1 += 4; src2 += 4; dst += 4;
1020 
1021  } while (src1 < src_end1);
1022 
1023  __m256i maskF = _mm256_set1_epi32(~0u);
1024  mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1025  __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1026  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1027  return (maskA == ~0u);
1028 }
1029 
1030 
1031 /*!
1032  @brief OR array elements against another 4 arrays
1033  *dst |= *src1 | *src2 | *src3 | *src4
1034  @return true if all bits are 1
1035 
1036  @ingroup AVX2
1037 */
1038 inline
1039 bool avx2_or_block_5way(__m256i* BMRESTRICT dst,
1040  const __m256i* BMRESTRICT src1,
1041  const __m256i* BMRESTRICT src2,
1042  const __m256i* BMRESTRICT src3,
1043  const __m256i* BMRESTRICT src4)
1044 {
1045  __m256i m1A, m1B, m1C, m1D;
1046  __m256i mAccF0 = _mm256_set1_epi32(~0u); // broadcast 0xFF
1047  __m256i mAccF1 = _mm256_set1_epi32(~0u); // broadcast 0xFF
1048 
1049  const __m256i* BMRESTRICT src_end1 =
1050  (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1051 
1052  do
1053  {
1054  m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1055  m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1056  m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1057  m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1058 
1059  m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1060  m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1061  m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1062  m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1063 
1064  m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
1065  m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
1066  m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
1067  m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));
1068 
1069  m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
1070  m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
1071  m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
1072  m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));
1073 
1074  _mm256_stream_si256(dst+0, m1A);
1075  _mm256_stream_si256(dst+1, m1B);
1076  _mm256_stream_si256(dst+2, m1C);
1077  _mm256_stream_si256(dst+3, m1D);
1078 
1079  mAccF1 = _mm256_and_si256(mAccF1, m1C);
1080  mAccF1 = _mm256_and_si256(mAccF1, m1D);
1081  mAccF0 = _mm256_and_si256(mAccF0, m1A);
1082  mAccF0 = _mm256_and_si256(mAccF0, m1B);
1083 
1084  src1 += 4; src2 += 4;
1085  src3 += 4; src4 += 4;
1086  _mm_prefetch ((const char*)src3, _MM_HINT_T0);
1087  _mm_prefetch ((const char*)src4, _MM_HINT_T0);
1088 
1089  dst += 4;
1090 
1091  } while (src1 < src_end1);
1092 
1093  __m256i maskF = _mm256_set1_epi32(~0u);
1094  mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1095  __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1096  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1097  return (maskA == ~0u);
1098 }
1099 
1100 
1101 /*!
1102  @brief XOR block against another
1103  *dst ^= *src
1104  @return 0 if destination does not have any bits
1105  @ingroup AVX2
1106 */
1107 inline
1108 unsigned avx2_xor_block(__m256i* BMRESTRICT dst,
1109  const __m256i* BMRESTRICT src)
1110 {
1111  __m256i m1A, m1B, m1C, m1D;
1112  __m256i accA, accB, accC, accD;
1113 
1114  const __m256i* BMRESTRICT src_end =
1115  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1116 
1117  accA = accB = accC = accD = _mm256_setzero_si256();
1118 
1119  do
1120  {
1121  m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1122  m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1123  m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1124  m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1125 
1126  _mm256_store_si256(dst+0, m1A);
1127  _mm256_store_si256(dst+1, m1B);
1128  _mm256_store_si256(dst+2, m1C);
1129  _mm256_store_si256(dst+3, m1D);
1130 
1131  accA = _mm256_or_si256(accA, m1A);
1132  accB = _mm256_or_si256(accB, m1B);
1133  accC = _mm256_or_si256(accC, m1C);
1134  accD = _mm256_or_si256(accD, m1D);
1135 
1136  src += 4; dst += 4;
1137 
1138  } while (src < src_end);
1139 
1140  accA = _mm256_or_si256(accA, accB); // A = A | B
1141  accC = _mm256_or_si256(accC, accD); // C = C | D
1142  accA = _mm256_or_si256(accA, accC); // A = A | C
1143 
1144  return !_mm256_testz_si256(accA, accA);
1145 }
1146 
1147 /*!
1148  @brief 3 operand XOR
1149  *dst = *src1 ^ *src2
1150  @return 0 if destination does not have any bits
1151  @ingroup AVX2
1152 */
1153 inline
1154 unsigned avx2_xor_block_2way(__m256i* BMRESTRICT dst,
1155  const __m256i* BMRESTRICT src1,
1156  const __m256i* BMRESTRICT src2)
1157 {
1158  __m256i m1A, m1B, m1C, m1D;
1159  __m256i accA, accB, accC, accD;
1160 
1161  const __m256i* BMRESTRICT src1_end =
1162  (const __m256i*)((bm::word_t*)(src1) + bm::set_block_size);
1163 
1164  accA = accB = accC = accD = _mm256_setzero_si256();
1165 
1166  do
1167  {
1168  m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
1169  m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
1170  m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
1171  m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));
1172 
1173  _mm256_store_si256(dst + 0, m1A);
1174  _mm256_store_si256(dst + 1, m1B);
1175  _mm256_store_si256(dst + 2, m1C);
1176  _mm256_store_si256(dst + 3, m1D);
1177 
1178  accA = _mm256_or_si256(accA, m1A);
1179  accB = _mm256_or_si256(accB, m1B);
1180  accC = _mm256_or_si256(accC, m1C);
1181  accD = _mm256_or_si256(accD, m1D);
1182 
1183  src1 += 4; src2 += 4; dst += 4;
1184 
1185  } while (src1 < src1_end);
1186 
1187  accA = _mm256_or_si256(accA, accB); // A = A | B
1188  accC = _mm256_or_si256(accC, accD); // C = C | D
1189  accA = _mm256_or_si256(accA, accC); // A = A | C
1190 
1191  return !_mm256_testz_si256(accA, accA);
1192 }
1193 
1194 
1195 /*!
1196  @brief AND-NOT (SUB) array elements against another array
1197  *dst &= ~*src
1198 
1199  @return 0 if destination does not have any bits
1200 
1201  @ingroup AVX2
1202 */
1203 inline
1204 unsigned avx2_sub_block(__m256i* BMRESTRICT dst,
1205  const __m256i* BMRESTRICT src)
1206 {
1207  __m256i m1A, m1B, m1C, m1D;
1208  __m256i accA, accB, accC, accD;
1209 
1210  accA = accB = accC = accD = _mm256_setzero_si256();
1211 
1212  const __m256i* BMRESTRICT src_end =
1213  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1214 
1215  do
1216  {
1217  m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
1218  m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1219  m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1220  m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1221 
1222  _mm256_store_si256(dst+2, m1C);
1223  _mm256_store_si256(dst+3, m1D);
1224  _mm256_store_si256(dst+0, m1A);
1225  _mm256_store_si256(dst+1, m1B);
1226 
1227  accA = _mm256_or_si256(accA, m1A);
1228  accB = _mm256_or_si256(accB, m1B);
1229  accC = _mm256_or_si256(accC, m1C);
1230  accD = _mm256_or_si256(accD, m1D);
1231 
1232  src += 4; dst += 4;
1233  } while (src < src_end);
1234 
1235  accA = _mm256_or_si256(accA, accB); // A = A | B
1236  accC = _mm256_or_si256(accC, accD); // C = C | D
1237  accA = _mm256_or_si256(accA, accC); // A = A | C
1238 
1239  return !_mm256_testz_si256(accA, accA);
1240 }
1241 
1242 /*!
1243  @brief SUB (AND NOT) block digest stride
1244  *dst &= ~*src
1245 
1246  @return true if stride is all zero
1247  @ingroup AVX2
1248 */
1249 inline
1250 bool avx2_sub_digest(__m256i* BMRESTRICT dst,
1251  const __m256i* BMRESTRICT src)
1252 {
1253  __m256i m1A, m1B, m1C, m1D;
1254 
1255  m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1256  m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1257  m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1258  m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1259 
1260  _mm256_store_si256(dst+0, m1A);
1261  _mm256_store_si256(dst+1, m1B);
1262  _mm256_store_si256(dst+2, m1C);
1263  _mm256_store_si256(dst+3, m1D);
1264 
1265  m1A = _mm256_or_si256(m1A, m1B);
1266  m1C = _mm256_or_si256(m1C, m1D);
1267  m1A = _mm256_or_si256(m1A, m1C);
1268 
1269  return _mm256_testz_si256(m1A, m1A);
1270 }
1271 
1272 /*!
1273  @brief 2-operand SUB (AND NOT) block digest stride
1274  *dst = *src1 & ~*src2
1275 
1276  @return true if stride is all zero
1277  @ingroup AVX2
1278 */
1279 inline
1280 bool avx2_sub_digest_2way(__m256i* BMRESTRICT dst,
1281  const __m256i* BMRESTRICT src1,
1282  const __m256i* BMRESTRICT src2)
1283 {
1284  __m256i m1A, m1B, m1C, m1D;
1285 
1286  m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
1287  m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
1288  m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
1289  m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));
1290 
1291  _mm256_store_si256(dst+0, m1A);
1292  _mm256_store_si256(dst+1, m1B);
1293  _mm256_store_si256(dst+2, m1C);
1294  _mm256_store_si256(dst+3, m1D);
1295 
1296  m1A = _mm256_or_si256(m1A, m1B);
1297  m1C = _mm256_or_si256(m1C, m1D);
1298  m1A = _mm256_or_si256(m1A, m1C);
1299 
1300  return _mm256_testz_si256(m1A, m1A);
1301 }
1302 
1303 
1304 
1305 /*!
1306  @brief SUB block digest stride (5-way)
1307  @ingroup AVX2
1308 */
1309 inline
1310 bool avx2_sub_digest_5way(__m256i* BMRESTRICT dst,
1311  const __m256i* BMRESTRICT src1,
1312  const __m256i* BMRESTRICT src2,
1313  const __m256i* BMRESTRICT src3,
1314  const __m256i* BMRESTRICT src4)
1315 {
1316  __m256i m1A, m1B, m1C, m1D;
1317  __m256i m1E, m1F, m1G, m1H;
1318  const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1319 
1320  {
1321  __m256i s1_0, s2_0, s1_1, s2_1;
1322 
1323  s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1324  s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1325  s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1326  s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1327 
1328  m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1329 
1330  s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1331  s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1332  s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1333  s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1334 
1335  m1C = _mm256_and_si256(s1_0, s2_0);
1336  m1D = _mm256_and_si256(s1_1, s2_1);
1337  }
1338  {
1339  __m256i s3_0, s4_0, s3_1, s4_1;
1340 
1341  s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1342  s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1343  s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1344  s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1345 
1346  m1E = _mm256_and_si256(s3_0, s4_0);
1347  m1F = _mm256_and_si256(s3_1, s4_1);
1348 
1349  m1A = _mm256_and_si256(m1A, m1E);
1350  m1B = _mm256_and_si256(m1B, m1F);
1351 
1352  s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1353  s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1354  s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1355  s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1356 
1357  m1G = _mm256_and_si256(s3_0, s4_0);
1358  m1H = _mm256_and_si256(s3_1, s4_1);
1359  }
1360  {
1361  __m256i dst0, dst1;
1362  dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1363 
1364  m1C = _mm256_and_si256(m1C, m1G);
1365  m1D = _mm256_and_si256(m1D, m1H);
1366  m1A = _mm256_and_si256(m1A, dst0);
1367  m1B = _mm256_and_si256(m1B, dst1);
1368 
1369  dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1370 
1371  m1C = _mm256_and_si256(m1C, dst0);
1372  m1D = _mm256_and_si256(m1D, dst1);
1373  }
1374  _mm256_store_si256(dst + 0, m1A);
1375  _mm256_store_si256(dst + 1, m1B);
1376  _mm256_store_si256(dst + 2, m1C);
1377  _mm256_store_si256(dst + 3, m1D);
1378 
1379  m1A = _mm256_or_si256(m1A, m1B);
1380  m1C = _mm256_or_si256(m1C, m1D);
1381  m1A = _mm256_or_si256(m1A, m1C);
1382 
1383  return _mm256_testz_si256(m1A, m1A);
1384 }
1385 
1386 
1387 /*!
1388  @brief SUB block digest stride (3-way)
1389  @ingroup AVX2
1390 */
1391 inline
1392 bool avx2_sub_digest_3way(__m256i* BMRESTRICT dst,
1393  const __m256i* BMRESTRICT src1,
1394  const __m256i* BMRESTRICT src2)
1395 {
1396  __m256i m1A, m1B, m1C, m1D;
1397 // __m256i m1E, m1F, m1G, m1H;
1398  const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1399 
1400  {
1401  __m256i s1_0, s2_0, s1_1, s2_1;
1402 
1403  s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1404  s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1405  s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1406  s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1407 
1408  m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1409 
1410  s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1411  s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1412  s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1413  s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1414 
1415  m1C = _mm256_and_si256(s1_0, s2_0);
1416  m1D = _mm256_and_si256(s1_1, s2_1);
1417  }
1418  /*
1419  {
1420  __m256i s3_0, s4_0, s3_1, s4_1;
1421 
1422  s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1423  s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1424  s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1425  s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1426 
1427  m1E = _mm256_and_si256(s3_0, s4_0);
1428  m1F = _mm256_and_si256(s3_1, s4_1);
1429 
1430  m1A = _mm256_and_si256(m1A, m1E);
1431  m1B = _mm256_and_si256(m1B, m1F);
1432 
1433  s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1434  s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1435  s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1436  s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1437 
1438  m1G = _mm256_and_si256(s3_0, s4_0);
1439  m1H = _mm256_and_si256(s3_1, s4_1);
1440  }
1441  */
1442  {
1443  __m256i dst0, dst1;
1444  dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1445 
1446 // m1C = _mm256_and_si256(m1C, m1G);
1447 // m1D = _mm256_and_si256(m1D, m1H);
1448  m1A = _mm256_and_si256(m1A, dst0);
1449  m1B = _mm256_and_si256(m1B, dst1);
1450 
1451  dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1452 
1453  m1C = _mm256_and_si256(m1C, dst0);
1454  m1D = _mm256_and_si256(m1D, dst1);
1455  }
1456  _mm256_store_si256(dst + 0, m1A);
1457  _mm256_store_si256(dst + 1, m1B);
1458  _mm256_store_si256(dst + 2, m1C);
1459  _mm256_store_si256(dst + 3, m1D);
1460 
1461  m1A = _mm256_or_si256(m1A, m1B);
1462  m1C = _mm256_or_si256(m1C, m1D);
1463  m1A = _mm256_or_si256(m1A, m1C);
1464 
1465  return _mm256_testz_si256(m1A, m1A);
1466 }
1467 
1468 
1469 
1470 /*!
1471  @brief AVX2 block memset
1472  *dst = value
1473 
1474  @ingroup AVX2
1475 */
1476 inline
1477 void avx2_set_block(__m256i* BMRESTRICT dst, bm::word_t value)
1478 {
1479  __m256i* BMRESTRICT dst_end =
1480  (__m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1481 
1482  __m256i ymm0 = _mm256_set1_epi32(int(value));
1483  do
1484  {
1485  _mm256_store_si256(dst, ymm0);
1486  _mm256_store_si256(dst+1, ymm0);
1487  _mm256_store_si256(dst+2, ymm0);
1488  _mm256_store_si256(dst+3, ymm0);
1489 
1490  dst += 4;
1491  } while (dst < dst_end);
1492 }
1493 
1494 
1495 
1496 /*!
1497  @brief AVX2 block copy
1498  *dst = *src
1499 
1500  @ingroup AVX2
1501 */
1502 inline
1503 void avx2_copy_block(__m256i* BMRESTRICT dst,
1504  const __m256i* BMRESTRICT src)
1505 {
1506  __m256i ymm0, ymm1, ymm2, ymm3;
1507 
1508  const __m256i* BMRESTRICT src_end =
1509  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1510 
1511  do
1512  {
1513  ymm0 = _mm256_load_si256(src+0);
1514  ymm1 = _mm256_load_si256(src+1);
1515  ymm2 = _mm256_load_si256(src+2);
1516  ymm3 = _mm256_load_si256(src+3);
1517 
1518  _mm256_store_si256(dst+0, ymm0);
1519  _mm256_store_si256(dst+1, ymm1);
1520  _mm256_store_si256(dst+2, ymm2);
1521  _mm256_store_si256(dst+3, ymm3);
1522 
1523  ymm0 = _mm256_load_si256(src+4);
1524  ymm1 = _mm256_load_si256(src+5);
1525  ymm2 = _mm256_load_si256(src+6);
1526  ymm3 = _mm256_load_si256(src+7);
1527 
1528  _mm256_store_si256(dst+4, ymm0);
1529  _mm256_store_si256(dst+5, ymm1);
1530  _mm256_store_si256(dst+6, ymm2);
1531  _mm256_store_si256(dst+7, ymm3);
1532 
1533  src += 8; dst += 8;
1534 
1535  } while (src < src_end);
1536 }
1537 
1538 /*!
1539  @brief AVX2 block copy (unaligned SRC)
1540  *dst = *src
1541 
1542  @ingroup AVX2
1543 */
1544 inline
1545 void avx2_copy_block_unalign(__m256i* BMRESTRICT dst,
1546  const __m256i* BMRESTRICT src)
1547 {
1548  __m256i ymm0, ymm1, ymm2, ymm3;
1549 
1550  const __m256i* BMRESTRICT src_end =
1551  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1552 
1553  do
1554  {
1555  ymm0 = _mm256_loadu_si256(src+0);
1556  ymm1 = _mm256_loadu_si256(src+1);
1557  ymm2 = _mm256_loadu_si256(src+2);
1558  ymm3 = _mm256_loadu_si256(src+3);
1559 
1560  _mm256_store_si256(dst+0, ymm0);
1561  _mm256_store_si256(dst+1, ymm1);
1562  _mm256_store_si256(dst+2, ymm2);
1563  _mm256_store_si256(dst+3, ymm3);
1564 
1565  ymm0 = _mm256_loadu_si256(src+4);
1566  ymm1 = _mm256_loadu_si256(src+5);
1567  ymm2 = _mm256_loadu_si256(src+6);
1568  ymm3 = _mm256_loadu_si256(src+7);
1569 
1570  _mm256_store_si256(dst+4, ymm0);
1571  _mm256_store_si256(dst+5, ymm1);
1572  _mm256_store_si256(dst+6, ymm2);
1573  _mm256_store_si256(dst+7, ymm3);
1574 
1575  src += 8; dst += 8;
1576 
1577  } while (src < src_end);
1578 }
1579 
1580 
1581 
1582 /*!
1583  @brief AVX2 block copy
1584  *dst = *src
1585 
1586  @ingroup AVX2
1587 */
1588 inline
1589 void avx2_stream_block(__m256i* BMRESTRICT dst,
1590  const __m256i* BMRESTRICT src)
1591 {
1592  __m256i ymm0, ymm1, ymm2, ymm3;
1593 
1594  const __m256i* BMRESTRICT src_end =
1595  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1596 
1597  do
1598  {
1599  ymm0 = _mm256_load_si256(src+0);
1600  ymm1 = _mm256_load_si256(src+1);
1601  ymm2 = _mm256_load_si256(src+2);
1602  ymm3 = _mm256_load_si256(src+3);
1603 
1604  _mm256_stream_si256(dst+0, ymm0);
1605  _mm256_stream_si256(dst+1, ymm1);
1606  _mm256_stream_si256(dst+2, ymm2);
1607  _mm256_stream_si256(dst+3, ymm3);
1608 
1609  ymm0 = _mm256_load_si256(src+4);
1610  ymm1 = _mm256_load_si256(src+5);
1611  ymm2 = _mm256_load_si256(src+6);
1612  ymm3 = _mm256_load_si256(src+7);
1613 
1614  _mm256_stream_si256(dst+4, ymm0);
1615  _mm256_stream_si256(dst+5, ymm1);
1616  _mm256_stream_si256(dst+6, ymm2);
1617  _mm256_stream_si256(dst+7, ymm3);
1618 
1619  src += 8; dst += 8;
1620 
1621  } while (src < src_end);
1622 }
1623 
1624 /*!
1625  @brief AVX2 block copy (unaligned SRC)
1626  *dst = *src
1627 
1628  @ingroup AVX2
1629 */
1630 inline
1631 void avx2_stream_block_unalign(__m256i* BMRESTRICT dst,
1632  const __m256i* BMRESTRICT src)
1633 {
1634  __m256i ymm0, ymm1, ymm2, ymm3;
1635 
1636  const __m256i* BMRESTRICT src_end =
1637  (const __m256i*)((bm::word_t*)(src) + bm::set_block_size);
1638 
1639  do
1640  {
1641  ymm0 = _mm256_loadu_si256(src+0);
1642  ymm1 = _mm256_loadu_si256(src+1);
1643  ymm2 = _mm256_loadu_si256(src+2);
1644  ymm3 = _mm256_loadu_si256(src+3);
1645 
1646  _mm256_stream_si256(dst+0, ymm0);
1647  _mm256_stream_si256(dst+1, ymm1);
1648  _mm256_stream_si256(dst+2, ymm2);
1649  _mm256_stream_si256(dst+3, ymm3);
1650 
1651  ymm0 = _mm256_loadu_si256(src+4);
1652  ymm1 = _mm256_loadu_si256(src+5);
1653  ymm2 = _mm256_loadu_si256(src+6);
1654  ymm3 = _mm256_loadu_si256(src+7);
1655 
1656  _mm256_stream_si256(dst+4, ymm0);
1657  _mm256_stream_si256(dst+5, ymm1);
1658  _mm256_stream_si256(dst+6, ymm2);
1659  _mm256_stream_si256(dst+7, ymm3);
1660 
1661  src += 8; dst += 8;
1662 
1663  } while (src < src_end);
1664 }
1665 
1666 
1667 
1668 /*!
1669  @brief Invert bit-block
1670  *dst = ~*dst
1671  or
1672  *dst ^= 0xFFFFFFFF
1673 
1674  @ingroup AVX2
1675 */
1676 inline
1677 void avx2_invert_block(__m256i* BMRESTRICT dst)
1678 {
1679  __m256i maskFF = _mm256_set1_epi32(-1); // broadcast 0xFF
1680  const __m256i* BMRESTRICT dst_end =
1681  (const __m256i*)((bm::word_t*)(dst) + bm::set_block_size);
1682 
1683  __m256i ymm0, ymm1;
1684  do
1685  {
1686  ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
1687  ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
1688 
1689  _mm256_store_si256(dst+0, ymm0);
1690  _mm256_store_si256(dst+1, ymm1);
1691 
1692  ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
1693  ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
1694 
1695  _mm256_store_si256(dst+2, ymm0);
1696  _mm256_store_si256(dst+3, ymm1);
1697 
1698  dst += 4;
1699 
1700  } while (dst < dst_end);
1701 }
1702 
1703 /*!
1704  @brief check if block is all zero bits
1705  @ingroup AVX2
1706 */
1707 inline
1708 bool avx2_is_all_zero(const __m256i* BMRESTRICT block)
1709 {
1710  const __m256i* BMRESTRICT block_end =
1711  (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1712 
1713  do
1714  {
1715  __m256i w0 = _mm256_load_si256(block+0);
1716  __m256i w1 = _mm256_load_si256(block+1);
1717 
1718  __m256i wA = _mm256_or_si256(w0, w1);
1719 
1720  __m256i w2 = _mm256_load_si256(block+2);
1721  __m256i w3 = _mm256_load_si256(block+3);
1722 
1723  __m256i wB = _mm256_or_si256(w2, w3);
1724  wA = _mm256_or_si256(wA, wB);
1725 
1726  if (!_mm256_testz_si256(wA, wA))
1727  return false;
1728  block += 4;
1729  } while (block < block_end);
1730  return true;
1731 }
1732 
1733 /*!
1734  @brief check if digest stride is all zero bits
1735  @ingroup AVX2
1736 */
1737 inline
1738 bool avx2_is_digest_zero(const __m256i* BMRESTRICT block)
1739 {
1740  __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
1741  __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
1742  wA = _mm256_or_si256(wA, wB);
1743 
1744  return _mm256_testz_si256(wA, wA);
1745 }
1746 
1747 /*!
1748  @brief set digest stride to 0xFF.. or 0x0 value
1749  @ingroup AVX2
1750 */
1751 inline
1752 void avx2_block_set_digest(__m256i* dst, unsigned value)
1753 {
1754  __m256i mV = _mm256_set1_epi32(int(value));
1755  _mm256_store_si256(dst, mV);
1756  _mm256_store_si256(dst + 1, mV);
1757  _mm256_store_si256(dst + 2, mV);
1758  _mm256_store_si256(dst + 3, mV);
1759 }
1760 
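A block digest is a single 64-bit word with one bit per wave of bm::set_block_digest_wave_size words, and avx2_block_set_digest above fills one such 128-byte stride (four 256-bit vectors) with a broadcast value. An illustrative helper for mapping a bit index within a block to the digest bit that covers it (a sketch under those assumptions, not part of this header):

    // which digest bit (wave) covers bit index 'idx' inside a bit-block
    inline unsigned digest_bit_of(unsigned idx)
    {
        unsigned word = idx >> 5;                      // 32 bits per bm::word_t
        return word / bm::set_block_digest_wave_size;  // wave index == digest bit
    }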
1761 /*!
1762  @brief check if block is all one bits
1763  @return true if all bits are 1
1764  @ingroup AVX2
1765 */
1766 inline
1767 bool avx2_is_all_one(const __m256i* BMRESTRICT block)
1768 {
1769  const __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1770  const __m256i* BMRESTRICT block_end =
1771  (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1772  do
1773  {
1774  __m256i m1A = _mm256_load_si256(block+0);
1775  __m256i m1B = _mm256_load_si256(block+1);
1776  m1A = _mm256_xor_si256(m1A, maskF);
1777  m1B = _mm256_xor_si256(m1B, maskF);
1778  m1A = _mm256_or_si256(m1A, m1B);
1779  if (!_mm256_testz_si256(m1A, m1A))
1780  return false;
1781  block += 2;
1782  } while (block < block_end);
1783  return true;
1784 }
1785 
1786 /*!
1787  @brief check if wave of pointers is all 0xFFF..F
1788  @ingroup AVX2
1789 */
1790 BMFORCEINLINE
1791 bool avx2_test_all_one_wave(const void* ptr)
1792 {
1793  __m256i maskF = _mm256_set1_epi32(~0u); // broadcast 0xFF
1794  __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF); // (w0 == maskF)
1795  unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1796  return (maskA == ~0u);
1797 }
1798 
1799 
1800 /*!
1801  @brief check if wave of pointers is all NULL
1802  @ingroup AVX2
1803 */
1804 BMFORCEINLINE
1805 bool avx2_test_all_zero_wave(const void* ptr)
1806 {
1807  __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
1808  return _mm256_testz_si256(w0, w0);
1809 }
1810 
1811 /*!
1812  @brief check if 2 wave of pointers are all NULL
1813  @ingroup AVX2
1814 */
1815 BMFORCEINLINE
1816 bool avx2_test_all_zero_wave2(const void* ptr0, const void* ptr1)
1817 {
1818  __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1819  __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1820  w0 = _mm256_or_si256(w0, w1);
1821  return _mm256_testz_si256(w0, w0);
1822 }
1823 
1824 /*!
1825  @brief check if 2 wave of pointers are all the same (NULL or FULL)
1826  @ingroup AVX2
1827 */
1828 BMFORCEINLINE
1829 bool avx2_test_all_eq_wave2(const void* ptr0, const void* ptr1)
1830 {
1831  __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1832  __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1833  w0 = _mm256_xor_si256(w0, w1);
1834  return _mm256_testz_si256(w0, w0);
1835 }
1836 
1837 /*!
1838  @brief block shift left by 1
1839  @ingroup AVX2
1840 */
1841 inline
1842 bool avx2_shift_l1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1843 {
1844  __m256i* block_end =
1845  (__m256i*)((bm::word_t*)(block) + bm::set_block_size);
1846 
1847  __m256i m1COshft, m2COshft;
1848  __m256i mAcc = _mm256_set1_epi32(0);
1849  __m256i mMask1 = _mm256_set1_epi32(1);
1850  __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
1851  unsigned co2;
1852 
1853  for (--block_end; block_end >= block; block_end -= 2)
1854  {
1855  __m256i m1A = _mm256_load_si256(block_end);
1856  __m256i m2A = _mm256_load_si256(block_end-1);
1857 
1858  __m256i m1CO = _mm256_and_si256(m1A, mMask1);
1859  __m256i m2CO = _mm256_and_si256(m2A, mMask1);
1860 
1861  co2 = _mm256_extract_epi32(m1CO, 0);
1862 
1863  m1A = _mm256_srli_epi32(m1A, 1); // (block[i] >> 1u)
1864  m2A = _mm256_srli_epi32(m2A, 1);
1865 
1866  // shift CO flags using -1 permute indexes, add CO to v[7]
1867  m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1868  m1COshft = _mm256_insert_epi32(m1COshft, co1, 7); // v[7] = co_flag
1869 
1870  co1 = co2;
1871 
1872  co2 = _mm256_extract_epi32(m2CO, 0);
1873 
1874  m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1875  m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);
1876 
1877  m1COshft = _mm256_slli_epi32(m1COshft, 31);
1878  m2COshft = _mm256_slli_epi32(m2COshft, 31);
1879 
1880  m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1881  m2A = _mm256_or_si256(m2A, m2COshft);
1882 
1883  _mm256_store_si256(block_end, m1A);
1884  _mm256_store_si256(block_end-1, m2A);
1885 
1886  mAcc = _mm256_or_si256(mAcc, m1A);
1887  mAcc = _mm256_or_si256(mAcc, m2A);
1888 
1889  co1 = co2;
1890 
1891  } // for
1892 
1893  *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1894  return co1;
1895 }
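
The permute/insert sequence above implements a word-level carry chain. A scalar sketch of the same left shift (toward lower bit positions), processed from the top of the block down, with co1 seeding the last word; the helper name is hypothetical and the empty_acc bookkeeping is omitted.

// Editorial illustration: scalar carry chain equivalent to avx2_shift_l1().
inline unsigned block_shift_l1_scalar(unsigned* w, unsigned nwords, unsigned co1)
{
    for (unsigned i = nwords; i-- > 0; )
    {
        unsigned co2 = w[i] & 1u;            // bit that falls off this word
        w[i] = (w[i] >> 1) | (co1 << 31);    // bring in carry from word i+1
        co1 = co2;
    }
    return co1;                               // carry-out of the whole block
}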
1896 
1897 
1898 /*!
1899  @brief block shift right by 1
1900  @ingroup AVX2
1901 */
1902 inline
1903 bool avx2_shift_r1(__m256i* block, bm::word_t* empty_acc, unsigned co1)
1904 {
1905  const __m256i* block_end =
1906  (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
1907 
1908  __m256i m1COshft, m2COshft;
1909  __m256i mAcc = _mm256_set1_epi32(0);
1910  __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1911  unsigned co2;
1912 
1913  for (;block < block_end; block+=2)
1914  {
1915  __m256i m1A = _mm256_load_si256(block);
1916  __m256i m2A = _mm256_load_si256(block+1);
1917 
1918  __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1919  __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1920 
1921  co2 = _mm256_extract_epi32(m1CO, 7);
1922 
1923  m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
1924  m2A = _mm256_slli_epi32(m2A, 1);
1925 
1926  // shift CO flags using +1 permute indexes, add CO to v[0]
1927  m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1928  m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
1929 
1930  co1 = co2;
1931 
1932  co2 = _mm256_extract_epi32(m2CO, 7);
1933  m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1934  m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1935 
1936  m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
1937  m2A = _mm256_or_si256(m2A, m2COshft);
1938 
1939  _mm256_store_si256(block, m1A);
1940  _mm256_store_si256(block+1, m2A);
1941 
1942  mAcc = _mm256_or_si256(mAcc, m1A);
1943  mAcc = _mm256_or_si256(mAcc, m2A);
1944 
1945  co1 = co2;
1946  } // for
1947 
1948  *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1949  return co1;
1950 }
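
And the mirror image for the right-shift kernel above: each word is shifted up by one bit and bit 31 of word i becomes bit 0 of word i+1, with co1 seeding word 0. A hypothetical scalar helper, illustration only.

// Editorial illustration: scalar carry chain equivalent to avx2_shift_r1().
inline unsigned block_shift_r1_scalar(unsigned* w, unsigned nwords, unsigned co1)
{
    for (unsigned i = 0; i < nwords; ++i)
    {
        unsigned co2 = w[i] >> 31;       // bit carried into word i+1
        w[i] = (w[i] << 1) | co1;        // bring in carry from word i-1
        co1 = co2;
    }
    return co1;                           // carry-out of the whole block
}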
1951 
1952 
1953 /*!
1954  @brief fused block shift right by 1 plus AND
1955  @ingroup AVX2
1956 */
1957 
1958 inline
1959 bool avx2_shift_r1_and(__m256i* BMRESTRICT block,
1960  bm::word_t co1,
1961  const __m256i* BMRESTRICT mask_block,
1962  bm::id64_t* BMRESTRICT digest)
1963 {
1964  BM_ASSERT(*digest);
1965 
1966  bm::word_t* wblock = (bm::word_t*) block;
1967  const bm::word_t* mblock = (const bm::word_t*) mask_block;
1968 
1969  __m256i m1COshft, m2COshft;
1970  __m256i mAcc = _mm256_set1_epi32(0);
1971  __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1972  unsigned co2;
1973 
1974  bm::id64_t d, wd;
1975  wd = d = *digest;
1976  unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d)); // get first set bit
1977  for (; di < 64 ; ++di)
1978  {
1979  const unsigned d_base = di * bm::set_block_digest_wave_size;
1980  const bm::id64_t dmask = (1ull << di);
1981  if (d & dmask) // digest stride NOT empty
1982  {
1983  mAcc = _mm256_xor_si256(mAcc, mAcc); // mAcc = 0
1984 
1985  mask_block = (__m256i*) &mblock[d_base];
1986  _mm_prefetch ((const char*)mask_block, _MM_HINT_NTA);
1987 
1988  block = (__m256i*) &wblock[d_base];
1989 
1990  for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
1991  {
1992  __m256i m1A = _mm256_load_si256(block);
1993  __m256i m2A = _mm256_load_si256(block+1);
1994 
1995  __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1996  __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1997 
1998  co2 = _mm256_extract_epi32(m1CO, 7);
1999 
2000  m1A = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2001  m2A = _mm256_slli_epi32(m2A, 1);
2002 
2003  __m256i m1M = _mm256_load_si256(mask_block);
2004  __m256i m2M = _mm256_load_si256(mask_block+1);
2005 
2006  // shift CO flags using +1 permute indexes, add CO to v[0]
2007  m1COshft = _mm256_insert_epi32(
2008  _mm256_permutevar8x32_epi32(m1CO, mCOidx),
2009  co1, 0); // v[0] = co_flag
2010 
2011  co1 = co2;
2012  co2 = _mm256_extract_epi32(m2CO, 7);
2013  m2COshft = _mm256_insert_epi32(
2014  _mm256_permutevar8x32_epi32(m2CO, mCOidx),
2015  co1, 0);
2016 
2017  m1A = _mm256_or_si256(m1A, m1COshft); // block[i] |= co_flag
2018  m2A = _mm256_or_si256(m2A, m2COshft);
2019 
2020  m1A = _mm256_and_si256(m1A, m1M); // block[i] &= mask_block[i]
2021  m2A = _mm256_and_si256(m2A, m2M);
2022 
2023  _mm256_store_si256(block, m1A);
2024  _mm256_store_si256(block+1, m2A);
2025 
2026  mAcc = _mm256_or_si256(mAcc, m1A);
2027  mAcc = _mm256_or_si256(mAcc, m2A);
2028 
2029  co1 = co2;
2030 
2031  } // for i
2032 
2033  if (_mm256_testz_si256(mAcc, mAcc)) // test if OR accum is zero
2034  d &= ~dmask; // clear the digest bit
2035 
2036  wd = _blsr_u64(wd); // wd &= wd - 1; // reset lowest set bit
2037  }
2038  else // stride is empty
2039  {
2040  if (co1)
2041  {
2042  BM_ASSERT(co1 == 1);
2043  BM_ASSERT(wblock[d_base] == 0);
2044 
2045  bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
2046  d |= (dmask & (w0 << di)); // update digest (branchless if (w0))
2047  co1 = 0;
2048  }
2049  if (!wd) // digest is empty, no CO -> exit
2050  break;
2051  }
2052  } // for di
2053 
2054  *digest = d;
2055  return co1;
2056 }
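
The digest argument above is a 64-bit summary of the 2048-word block: bit i is set when stride i (32 consecutive words) contains at least one non-zero word, which is what lets the fused shift-AND kernel skip or re-classify empty strides. A scalar sketch of how such a digest can be computed, assuming the usual 64-stride layout; the helper is hypothetical.

#include <cstdint>

// Editorial illustration: build a 64-bit digest where bit i flags a
// non-empty 32-word stride of a 2048-word bit-block.
inline uint64_t calc_block_digest_scalar(const unsigned* block)
{
    uint64_t digest = 0;
    for (unsigned i = 0; i < 64; ++i)
    {
        unsigned acc = 0;
        const unsigned* stride = block + i * 32;
        for (unsigned j = 0; j < 32; ++j)
            acc |= stride[j];
        digest |= (uint64_t(acc != 0) << i);
    }
    return digest;
}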
2057 
2058 
2059 
2060 /*
2061 inline
2062 void avx2_i32_shift()
2063 {
2064  unsigned shift_in = 80;
2065 
2066  __m256i mTest = _mm256_set_epi32(70, 60, 50, 40, 30, 20, 10, 100);
2067  __m256i mIdx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2068 
2069  __m256i m1shft = _mm256_permutevar8x32_epi32(mTest, mIdx);
2070  m1shft = _mm256_insert_epi32(m1shft, shift_in, 0);
2071 
2072  avx2_print256("m1shft=", m1shft);
2073 }
2074 */
2075 
2076 
2077 
2078 /*!
2079  AVX2 calculate number of bit changes from 0 to 1
2080  @ingroup AVX2
2081 */
2082 inline
2083 unsigned avx2_bit_block_calc_change(const __m256i* BMRESTRICT block,
2084  unsigned size)
2085 {
2086  BM_AVX2_POPCNT_PROLOG
2087 
2088  const __m256i* block_end =
2089  (const __m256i*)((bm::word_t*)(block) + size);
2090 
2091  __m256i m1COshft, m2COshft;
2092  __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2093  __m256i cntAcc = _mm256_setzero_si256();
2094 
2095  unsigned w0 = *((bm::word_t*)(block));
2096  unsigned count = 1;
2097 
2098  bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2099 
2100  unsigned co2, co1 = 0;
2101  for (;block < block_end; block+=2)
2102  {
2103  __m256i m1A = _mm256_load_si256(block);
2104  __m256i m2A = _mm256_load_si256(block+1);
2105 
2106  __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2107  __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2108 
2109  co2 = _mm256_extract_epi32(m1CO, 7);
2110 
2111  __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2112  __m256i m2As = _mm256_slli_epi32(m2A, 1);
2113 
2114  // shift CO flags using +1 permute indexes, add CO to v[0]
2115  m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2116  m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2117 
2118  co1 = co2;
2119 
2120  co2 = _mm256_extract_epi32(m2CO, 7);
2121  m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2122  m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2123 
2124  m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2125  m2As = _mm256_or_si256(m2As, m2COshft);
2126 
2127  co1 = co2;
2128 
2129  // we now have two shifted AVX2 regs with carry-over
2130  m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2131  m2A = _mm256_xor_si256(m2A, m2As);
2132 
2133  {
2134  BM_AVX2_BIT_COUNT(bc, m1A)
2135  cntAcc = _mm256_add_epi64(cntAcc, bc);
2136  BM_AVX2_BIT_COUNT(bc, m2A)
2137  cntAcc = _mm256_add_epi64(cntAcc, bc);
2138  }
2139  } // for
2140 
2141  // horizontal count sum
2142  _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2143  count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2144 
2145  count -= (w0 & 1u); // correct initial carry-in error
2146  return count;
2147 }
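
What the shifted-XOR/popcount loop above counts is the number of adjacent bit transitions in the block plus one (BitMagic's GAP count). A straightforward scalar reference, bit-by-bit for clarity rather than speed; the helper name is hypothetical.

// Editorial illustration: scalar reference for avx2_bit_block_calc_change().
inline unsigned bit_block_change_scalar(const unsigned* w, unsigned nwords)
{
    unsigned count = 1;                       // GAP count starts at 1
    unsigned prev = w[0] & 1u;                // very first bit of the block
    for (unsigned i = 0; i < nwords; ++i)
        for (unsigned j = 0; j < 32; ++j)
        {
            unsigned bit = (w[i] >> j) & 1u;
            count += (bit != prev);           // one more transition
            prev = bit;
        }
    return count;
}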
2148 
2149 /*!
2150  AVX2 calculate number of bit changes from 0 to 1 from a XOR product
2151  @ingroup AVX2
2152 */
2153 inline
2154 void avx2_bit_block_calc_xor_change(const __m256i* BMRESTRICT block,
2155  const __m256i* BMRESTRICT xor_block,
2156  unsigned size,
2157  unsigned* BMRESTRICT gcount,
2158  unsigned* BMRESTRICT bcount)
2159 {
2160  BM_AVX2_POPCNT_PROLOG
2161 
2162  const __m256i* BMRESTRICT block_end =
2163  (const __m256i*)((bm::word_t*)(block) + size);
2164 
2165  __m256i m1COshft, m2COshft;
2166  __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2167 
2168  __m256i cntAcc = _mm256_setzero_si256();
2169  __m256i cntAcc2 = _mm256_setzero_si256();
2170 
2171  unsigned w0 = *((bm::word_t*)(block));
2172  unsigned bit_count = 0;
2173  unsigned gap_count = 1;
2174 
2175  bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2176 
2177  unsigned co2, co1 = 0;
2178  for (;block < block_end; block+=2, xor_block+=2)
2179  {
2180  __m256i m1A = _mm256_load_si256(block);
2181  __m256i m2A = _mm256_load_si256(block+1);
2182  __m256i m1B = _mm256_load_si256(xor_block);
2183  __m256i m2B = _mm256_load_si256(xor_block+1);
2184 
2185  m1A = _mm256_xor_si256 (m1A, m1B);
2186  m2A = _mm256_xor_si256 (m2A, m2B);
2187 
2188  {
2189  BM_AVX2_BIT_COUNT(bc, m1A)
2190  cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2191  BM_AVX2_BIT_COUNT(bc, m2A)
2192  cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2193  }
2194 
2195  __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2196  __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2197 
2198  co2 = _mm256_extract_epi32(m1CO, 7);
2199 
2200  __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2201  __m256i m2As = _mm256_slli_epi32(m2A, 1);
2202 
2203  // shift CO flags using +1 permute indexes, add CO to v[0]
2204  m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2205  m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2206 
2207  co1 = co2;
2208 
2209  co2 = _mm256_extract_epi32(m2CO, 7);
2210  m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2211  m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2212 
2213  m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2214  m2As = _mm256_or_si256(m2As, m2COshft);
2215 
2216  co1 = co2;
2217 
2218  // we now have two shifted AVX2 regs with carry-over
2219  m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2220  m2A = _mm256_xor_si256(m2A, m2As);
2221 
2222  {
2223  BM_AVX2_BIT_COUNT(bc, m1A)
2224  cntAcc = _mm256_add_epi64(cntAcc, bc);
2225  BM_AVX2_BIT_COUNT(bc, m2A)
2226  cntAcc = _mm256_add_epi64(cntAcc, bc);
2227  }
2228  } // for
2229 
2230  // horizontal count sum
2231  _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2232  gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2233  gap_count -= (w0 & 1u); // correct initial carry-in error
2234  if (!gap_count)
2235  ++gap_count; // always >0
2236 
2237  _mm256_store_si256 ((__m256i*)cnt_v, cntAcc2);
2238  bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2239 
2240  *gcount = gap_count;
2241  *bcount = bit_count;
2242 }
2243 
2244 
2245 
2246 /*!
2247  AVX2 calculate number of bit changes from 0 to 1 and bitcount
2248  @ingroup AVX2
2249 */
2250 inline
2251 void avx2_bit_block_calc_change_bc(const __m256i* BMRESTRICT block,
2252  unsigned* gcount, unsigned* bcount)
2253 {
2254  BM_AVX2_POPCNT_PROLOG
2255 
2256  const __m256i* block_end =
2257  (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2258 
2259  __m256i m1COshft, m2COshft;
2260  __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2261  __m256i cntAcc = _mm256_setzero_si256();
2262 
2263  unsigned w0 = *((bm::word_t*)(block));
2264  unsigned bit_count = 0;
2265  unsigned gap_count = 1;
2266 
2267  bm::id64_t BM_ALIGN32 cnt_v[4] BM_ALIGN32ATTR;
2268 
2269  unsigned co2, co1 = 0;
2270  for (;block < block_end; block+=2)
2271  {
2272  __m256i m1A = _mm256_load_si256(block);
2273  __m256i m2A = _mm256_load_si256(block+1);
2274 
2275  // popcount
2276  {
2277  bm::id64_t* b64 = (bm::id64_t*)block;
2278 
2279  bit_count += (unsigned) (_mm_popcnt_u64(b64[0]) + _mm_popcnt_u64(b64[1]));
2280  bit_count += (unsigned)(_mm_popcnt_u64(b64[2]) + _mm_popcnt_u64(b64[3]));
2281 
2282  bit_count += (unsigned)(_mm_popcnt_u64(b64[4]) + _mm_popcnt_u64(b64[5]));
2283  bit_count += (unsigned)(_mm_popcnt_u64(b64[6]) + _mm_popcnt_u64(b64[7]));
2284  }
2285 
2286  __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2287  __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2288 
2289  co2 = _mm256_extract_epi32(m1CO, 7);
2290 
2291  __m256i m1As = _mm256_slli_epi32(m1A, 1); // (block[i] << 1u)
2292  __m256i m2As = _mm256_slli_epi32(m2A, 1);
2293 
2294  // shift CO flags using +1 permute indexes, add CO to v[0]
2295  m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2296  m1COshft = _mm256_insert_epi32(m1COshft, co1, 0); // v[0] = co_flag
2297 
2298  co1 = co2;
2299 
2300  co2 = _mm256_extract_epi32(m2CO, 7);
2301  m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2302  m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2303 
2304  m1As = _mm256_or_si256(m1As, m1COshft); // block[i] |= co_flag
2305  m2As = _mm256_or_si256(m2As, m2COshft);
2306 
2307  co1 = co2;
2308 
2309  // we now have two shifted AVX2 regs with carry-over
2310  m1A = _mm256_xor_si256(m1A, m1As); // w ^= (w >> 1);
2311  m2A = _mm256_xor_si256(m2A, m2As);
2312 
2313  {
2314  BM_AVX2_BIT_COUNT(bc, m1A)
2315  cntAcc = _mm256_add_epi64(cntAcc, bc);
2316  BM_AVX2_BIT_COUNT(bc, m2A)
2317  cntAcc = _mm256_add_epi64(cntAcc, bc);
2318  }
2319  } // for
2320 
2321  // horizontal count sum
2322  _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2323  gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2324  gap_count -= (w0 & 1u); // correct initial carry-in error
2325 
2326  *gcount = gap_count;
2327  *bcount = bit_count;
2328 }
2329 
2330 
2331 /*!
2332  \brief Find first bit which is different between two bit-blocks
2333  @ingroup AVX2
2334 */
2335 inline
2336 bool avx2_bit_find_first_diff(const __m256i* BMRESTRICT block1,
2337  const __m256i* BMRESTRICT block2,
2338  unsigned* pos)
2339 {
2340  unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2341 
2342  const __m256i* block1_end =
2343  (const __m256i*)((bm::word_t*)(block1) + bm::set_block_size);
2344  __m256i maskZ = _mm256_setzero_si256();
2345  __m256i mA, mB;
2346  unsigned simd_lane = 0;
2347  do
2348  {
2349  mA = _mm256_xor_si256(_mm256_load_si256(block1),
2350  _mm256_load_si256(block2));
2351  mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
2352  _mm256_load_si256(block2+1));
2353  __m256i mOR = _mm256_or_si256(mA, mB);
2354  if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2355  {
2356  if (!_mm256_testz_si256(mA, mA))
2357  {
2358  // invert to find (w != 0)
2359  unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2360  BM_ASSERT(mask);
2361  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
2362  _mm256_store_si256 ((__m256i*)simd_buf, mA);
2363  unsigned widx = bsf >> 2; // (bsf / 4);
2364  unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mA, widx);
2365  bsf = bm::bsf_asm32(w); // find first bit != 0
2366  *pos = (simd_lane * 256) + (widx * 32) + bsf;
2367  return true;
2368  }
2369  // invert to find (w != 0)
2370  unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2371  BM_ASSERT(mask);
2372  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
2373  _mm256_store_si256 ((__m256i*)simd_buf, mB);
2374  unsigned widx = bsf >> 2; // (bsf / 4);
2375  unsigned w = simd_buf[widx];// _mm256_extract_epi32 (mB, widx);
2376  bsf = bm::bsf_asm32(w); // find first bit != 0
2377  *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2378  return true;
2379  }
2380 
2381  simd_lane+=2;
2382  block1+=2; block2+=2;
2383 
2384  } while (block1 < block1_end);
2385  return false;
2386 }
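
A hedged usage sketch: comparing two 32-byte aligned 2048-word bit-blocks and reporting the first bit at which they differ. The wrapper below is an editorial illustration only, not part of bmavx2.h.

#include <cstdio>

// Editorial illustration: report the first differing bit of two bit-blocks.
inline void report_first_diff(const unsigned* blk_a, const unsigned* blk_b)
{
    unsigned pos;
    bool diff = bm::avx2_bit_find_first_diff((const __m256i*)blk_a,
                                             (const __m256i*)blk_b, &pos);
    if (diff)
        std::printf("blocks differ at bit %u\n", pos);
    else
        std::printf("blocks are identical\n");
}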
2387 
2388 
2389 /*!
2390  \brief Find first bit set
2391  @ingroup AVX2
2392 */
2393 inline
2394 bool avx2_bit_find_first(const __m256i* BMRESTRICT block, unsigned off, unsigned* pos)
2395 {
2396  unsigned BM_ALIGN32 simd_buf[8] BM_ALIGN32ATTR;
2397 
2398  block = (const __m256i*)((bm::word_t*)(block) + off);
2399  const __m256i* block_end =
2400  (const __m256i*)((bm::word_t*)(block) + bm::set_block_size);
2401  __m256i maskZ = _mm256_setzero_si256();
2402  __m256i mA, mB;
2403  unsigned simd_lane = 0;
2404  do
2405  {
2406  mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
2407  __m256i mOR = _mm256_or_si256(mA, mB);
2408  if (!_mm256_testz_si256(mOR, mOR)) // test 2x256 lanes
2409  {
2410  if (!_mm256_testz_si256(mA, mA))
2411  {
2412  // invert to find (w != 0)
2413  unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2414  BM_ASSERT(mask);
2415  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
2416  _mm256_store_si256 ((__m256i*)simd_buf, mA);
2417  unsigned widx = bsf >> 2; // (bsf / 4);
2418  unsigned w = simd_buf[widx];
2419  bsf = bm::bsf_asm32(w); // find first bit != 0
2420  *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;
2421  return true;
2422  }
2423  // invert to find (w != 0)
2424  unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2425  BM_ASSERT(mask);
2426  int bsf = bm::bsf_asm32(mask); // find first !=0 (could use lzcnt())
2427  _mm256_store_si256 ((__m256i*)simd_buf, mB);
2428  unsigned widx = bsf >> 2; // (bsf / 4);
2429  unsigned w = simd_buf[widx];
2430  bsf = bm::bsf_asm32(w); // find first bit != 0
2431  *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;
2432  return true;
2433  }
2434 
2435  simd_lane+=2;
2436  block+=2;
2437 
2438  } while (block < block_end);
2439  return false;
2440 }
2441 
2442 
2443 
2444 /* @brief Gap block population count (array sum) utility
2445  @param pbuf - unrolled, aligned to 1-start GAP buffer
2446  @param avx_vect_waves - number of AVX vector lines to process
2447  @param sum - result accumulator
2448  @return tail pointer
2449 
2450  @internal
2451 */
2452 inline
2453 const bm::gap_word_t* avx2_gap_sum_arr(const bm::gap_word_t* BMRESTRICT pbuf,
2454  unsigned avx_vect_waves,
2455  unsigned* sum)
2456 {
2457  __m256i xcnt = _mm256_setzero_si256();
2458 
2459  // accumulate odd and even elements of the vector; the result is
2460  // correct modulo 2^16 (max element value in gap blocks is 65535),
2461  // so overflow is not an issue here
2462  for (unsigned i = 0; i < avx_vect_waves; ++i)
2463  {
2464  __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
2465  __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
2466  __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
2467  xcnt = _mm256_add_epi16(xcnt, ymm_s2);
2468  pbuf += 32;
2469  }
2470  // odd minus even vector elements clears the result for 1111 blocks
2471  // bsrli - byte shifts the vector element by 2 bytes (1 short int)
2472  xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
2473 
2474  // horizontal sum of vector elements
2475  // cnt16[0] + cnt16[2] + cnt16[4] + cnt16[6] + cnt16[8] + cnt16[10] + cnt16[12] + cnt16[14];
2476  //
2477  xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
2478  xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
2479  __m128i xcnt2 = _mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
2480 
2481  // extract 32-bit word and mask to take first 16 bits
2482  *sum += _mm_cvtsi128_si32(xcnt2) & 0xffff;
2483  return pbuf;
2484 }
2485 
2486 
2487 /*!
2488  AVX2 index lookup to check what belongs to the same block (8 elements)
2489  \internal
2490 */
2491 inline
2492 unsigned avx2_idx_arr_block_lookup(const unsigned* idx, unsigned size,
2493  unsigned nb, unsigned start)
2494 {
2495  const unsigned unroll_factor = 16;
2496  const unsigned len = (size - start);
2497  const unsigned len_unr = len - (len % unroll_factor);
2498  unsigned k;
2499 
2500  idx += start;
2501 
2502  __m256i nbM = _mm256_set1_epi32(int(nb));
2503 
2504  for (k = 0; k < len_unr; k+=unroll_factor)
2505  {
2506  __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2507  __m256i nbA = _mm256_srli_epi32(idxA, bm::set_block_shift); // idx[k] >> bm::set_block_shift
2508 
2509  __m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
2510  if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
2511  break;
2512  __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
2513  __m256i nbB = _mm256_srli_epi32(idxB, bm::set_block_shift);
2514 
2515  __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
2516  if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
2517  break;
2518  } // for k
2519  for (; k < len; ++k)
2520  {
2521  if (nb != unsigned(idx[k] >> bm::set_block_shift))
2522  break;
2523  } // for k
2524  return start + k;
2525 }
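
A scalar sketch of the same lookup, assuming 64 Kbit blocks so that bm::set_block_shift == 16: starting at 'start', it returns the index of the first element of idx[] that no longer falls into block 'nb'. Hypothetical helper for illustration only.

// Editorial illustration: scalar reference for avx2_idx_arr_block_lookup().
inline unsigned idx_arr_block_lookup_scalar(const unsigned* idx, unsigned size,
                                            unsigned nb, unsigned start)
{
    unsigned k = start;
    for (; k < size; ++k)
        if (nb != (idx[k] >> 16))   // 16 == bm::set_block_shift (assumed)
            break;
    return k;
}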
2526 
2527 
2528 /*!
2529  AVX2 bulk bit set
2530  \internal
2531 */
2532 inline
2533 void avx2_set_block_bits(bm::word_t* BMRESTRICT block,
2534  const unsigned* BMRESTRICT idx,
2535  unsigned start, unsigned stop )
2536 {
2537  const unsigned unroll_factor = 8;
2538  const unsigned len = (stop - start);
2539  const unsigned len_unr = len - (len % unroll_factor);
2540 
2541  idx += start;
2542 
2543  __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2544  __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
2545  __m256i mask1 = _mm256_set1_epi32(1);
2546  __m256i mask_tmp;
2547 
2548  unsigned BM_ALIGN32 mask_v[8] BM_ALIGN32ATTR;
2549  unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
2550 
2551  unsigned k = 0, mask, w_idx;
2552  for (; k < len_unr; k+=unroll_factor)
2553  {
2554  __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2555  __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2556  __m256i nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
2557 
2558  nbitA = _mm256_and_si256 (nbitA, sw_mask); // nbit &= bm::set_word_mask;
2559 
2560  __m256i maskA = _mm256_sllv_epi32(mask1, nbitA); // (1 << nbit)
2561 
2562  _mm256_store_si256 ((__m256i*)mword_v, nwordA); // store block word idxs
2563 
2564  // shuffle + permute to prepare comparison vector
2565  mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
2566  mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2567  mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2568  if (mask == ~0u) // all idxs belong to the same word
2569  {
2570  w_idx = mword_v[0];
2571  mask_tmp = _mm256_xor_si256 (mask_tmp, mask_tmp); // zero bits
2572  mask_tmp = _mm256_or_si256 (mask_tmp, maskA); // set bits
2573 
2574  // horizontal OR via permutation of two 128-bit lanes
2575  // then byte-shifts + OR within the lower 128
2576  __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
2577  __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
2578  mask_tmp = _mm256_or_si256 (mtmp0, mtmp1);
2579  mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4); // shift R by 1 int
2580  mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2581  mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2582  mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2583 
2584  int u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR
2585  block[w_idx] |= u0;
2586  }
2587  else // whole 256-bit lane does NOT hit the same word...
2588  {
2589  _mm256_store_si256 ((__m256i*)mask_v, maskA);
2590 
2591  // compute horizontal OR of set bit mask over lo-hi 128-bit lanes
2592  // it is used later if lo or hi lanes hit the same word
2593  // (probabilistic speculation)
2594  //
2595  int u0, u4;
2596  {
2597  mask_tmp = _mm256_bsrli_epi128(maskA, 4); // shift R by 1 int
2598  mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2599  __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8); // shift R by 2 ints
2600  mask_tmp = _mm256_or_si256 (m0, mask_tmp);
2601 
2602  u0 = _mm256_extract_epi32(mask_tmp, 0); // final OR (128-lo)
2603  u4 = _mm256_extract_epi32(mask_tmp, 4); // final OR (128-hi)
2604  }
2605 
2606  // check the lo 128-lane
2607  {
2608  mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 0); // lo
2609  __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0); // copy simd[0]
2610  mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2611  if (mask == ~0u) // all idxs belong to the same word
2612  {
2613  w_idx = mword_v[0];
2614  block[w_idx] |= u0;
2615  }
2616  else // different block words: use "shotgun" OR
2617  {
2618  block[mword_v[0]] |= mask_v[0];
2619  block[mword_v[1]] |= mask_v[1];
2620  block[mword_v[2]] |= mask_v[2];
2621  block[mword_v[3]] |= mask_v[3];
2622 
2623  }
2624  }
2625 
2626  // check the hi 128-lane
2627  {
2628  mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 1); // hi
2629  __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2630  mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2631  if (mask == ~0u) // all idxs belong to the same word
2632  {
2633  w_idx = mword_v[4];
2634  block[w_idx] |= u4;
2635  }
2636  else
2637  {
2638  block[mword_v[4]] |= mask_v[4];
2639  block[mword_v[5]] |= mask_v[5];
2640  block[mword_v[6]] |= mask_v[6];
2641  block[mword_v[7]] |= mask_v[7];
2642  }
2643  }
2644  }
2645  } // for k
2646 
2647  for (; k < len; ++k)
2648  {
2649  unsigned n = idx[k];
2650  unsigned nbit = unsigned(n & bm::set_block_mask);
2651  unsigned nword = nbit >> bm::set_word_shift;
2652  nbit &= bm::set_word_mask;
2653  block[nword] |= (1u << nbit);
2654  } // for k
2655 }
2656 
2657 
2658 /** Set bits in an AVX target, by indexes (int4) from the source
2659  @internal
2660 */
2661 BMFORCEINLINE
2662 __m256i avx2_setbit_256(__m256i target, __m256i source)
2663 {
2664  __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2665  __m256i mask1 = _mm256_set1_epi32(1);
2666 
2667  __m256i v0, v1, acc1, acc2;
2668  v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0));
2669  v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
2670  v0 = _mm256_sub_epi32(v0, stride_idx);
2671  v1 = _mm256_sub_epi32(v1, stride_idx);
2672  v0 = _mm256_sllv_epi32(mask1, v0);
2673  v1 = _mm256_sllv_epi32(mask1, v1);
2674  acc1 = _mm256_or_si256(v1, v0);
2675  v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
2676  v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
2677  v0 = _mm256_sub_epi32(v0, stride_idx);
2678  v1 = _mm256_sub_epi32(v1, stride_idx);
2679  v0 = _mm256_sllv_epi32(mask1, v0);
2680  v1 = _mm256_sllv_epi32(mask1, v1);
2681  acc2 = _mm256_or_si256(v1, v0);
2682  target = _mm256_or_si256(target, acc1);
2683  v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
2684  v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
2685  v0 = _mm256_sub_epi32(v0, stride_idx);
2686  v1 = _mm256_sub_epi32(v1, stride_idx);
2687  v0 = _mm256_sllv_epi32(mask1, v0);
2688  v1 = _mm256_sllv_epi32(mask1, v1);
2689  acc1 = _mm256_or_si256(v1, v0);
2690  target = _mm256_or_si256(target, acc2);
2691  v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
2692  v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
2693  v0 = _mm256_sub_epi32(v0, stride_idx);
2694  v1 = _mm256_sub_epi32(v1, stride_idx);
2695  v0 = _mm256_sllv_epi32(mask1, v0);
2696  v1 = _mm256_sllv_epi32(mask1, v1);
2697  acc2 = _mm256_or_si256(v1, v0);
2698 
2699  target = _mm256_or_si256(target, acc1);
2700  target = _mm256_or_si256(target, acc2);
2701  return target;
2702 }
2703 
2704 
2705 /** Experimental code to set bits via AVX strides
2706  @internal
2707 */
2708 inline
2709 void avx2_set_block_bits2(bm::word_t* BMRESTRICT block,
2710  const unsigned* BMRESTRICT idx,
2711  unsigned start, unsigned stop )
2712 {
2713  __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2714  __m256i mask1 = _mm256_set1_epi32(1);
2715  __m256i* block_avx = (__m256i*)block;
2716 
2717  unsigned stride = 0;
2718  __m256i* avx_stride_p = block_avx + stride;
2719  __m256i blkA = _mm256_load_si256(avx_stride_p);
2720 
2721  for (unsigned i = start; i < stop; ++i)
2722  {
2723  unsigned n = idx[i];
2724  unsigned nbit = unsigned(n & bm::set_block_mask);
2725  unsigned new_stride = nbit >> 8; // (nbit / 256)
2726  unsigned stride_bit = nbit & 0xFF; // (nbit % 256)
2727  if (new_stride != stride)
2728  {
2729  _mm256_store_si256(avx_stride_p, blkA); // flush the avx2 accum
2730  stride = new_stride;
2731  avx_stride_p = block_avx + stride;
2732  blkA = _mm256_load_si256(avx_stride_p); // re-load the accum
2733  }
2734  // set avx2 stride bit
2735  __m256i v0 = _mm256_set1_epi32(stride_bit);
2736  __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
2737  __m256i k0 = _mm256_sllv_epi32(mask1, s0);
2738  blkA = _mm256_or_si256(blkA, k0);
2739  } // for i
2740 
2741  _mm256_store_si256(avx_stride_p, blkA);
2742 }
2743 
2744 /** Experimental code to set bits via AVX strides
2745  @internal
2746 */
2747 inline
2748 void avx2_set_block_bits3(bm::word_t* BMRESTRICT block,
2749  const unsigned* BMRESTRICT idx,
2750  unsigned start, unsigned stop )
2751 {
2752  const unsigned unroll_factor = 8;
2753  const unsigned len = (stop - start);
2754  const unsigned len_unr = len - (len % unroll_factor);
2755 
2756  idx += start;
2757 
2758  __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2759  __m256i mask1 = _mm256_set1_epi32(1);
2760 
2761  __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
2762  __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);
2763 
2764  unsigned BM_ALIGN32 mstride_v[8] BM_ALIGN32ATTR;
2765  int BM_ALIGN32 mstride_bit_v[8] BM_ALIGN32ATTR;
2766 
2767  // define the very first block stride based on index 0
2768  unsigned stride = unsigned(idx[0] & bm::set_block_mask) >> 8;
2769 
2770  __m256i* block_avx = (__m256i*)block;
2771  __m256i* avx_stride_p = block_avx + stride;
2772 
2773  __m256i blkA = _mm256_load_si256(avx_stride_p); // load the first accum
2774 
2775  unsigned k = 0, mask;
2776  for (; k < len_unr; k+=unroll_factor)
2777  {
2778  __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2779  __m256i nbitA = _mm256_and_si256 (idxA, sb_mask); // nbit = idx[k] & bm::set_block_mask
2780  __m256i strideA = _mm256_srli_epi32 (nbitA, 8); // new_stride = nbit >> 8
2781  __m256i strideBitA = _mm256_and_si256 (nbitA, stride_bit_mask); // stride_bit = nbit & 0xFF;
2782 
2783  // construct a cmp vector from broadcasted v[0]
2784  __m256i mask_tmp = _mm256_shuffle_epi32 (strideA, 0x0);
2785  mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2786  mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
2787  if (mask == ~0u) // all idxs belong to the same avx2 stride
2788  {
2789  unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
2790  if (new_stride != stride)
2791  {
2792  _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2793  stride = new_stride;
2794  avx_stride_p = block_avx + stride;
2795  blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2796  }
2797  // set 8 bits all at once
2798  blkA = bm::avx2_setbit_256(blkA, strideBitA);
2799  }
2800  else // stride mix here, process one by one
2801  {
2802  _mm256_store_si256 ((__m256i*)mstride_bit_v, strideBitA); // store block stride-bit idxs
2803  _mm256_store_si256 ((__m256i*)mstride_v, strideA);
2804  for (unsigned j = 0; j < 8; ++j)
2805  {
2806  unsigned new_stride = mstride_v[j];
2807  if (new_stride != stride)
2808  {
2809  _mm256_store_si256(avx_stride_p, blkA); // flush avx2 accum
2810  stride = new_stride;
2811  avx_stride_p = block_avx + stride;
2812  blkA = _mm256_load_si256(avx_stride_p); // re-load accum
2813  }
2814  // set avx2 bits one by one
2815  mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
2816  mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
2817  mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
2818  blkA = _mm256_or_si256(blkA, mask_tmp);
2819  } // for j
2820  }
2821  } // for k
2822  _mm256_store_si256(avx_stride_p, blkA);
2823 
2824  // set the tail bits conventionally
2825  for (; k < len; ++k)
2826  {
2827  unsigned n = idx[k];
2828  unsigned nbit = unsigned(n & bm::set_block_mask);
2829  unsigned nword = nbit >> bm::set_word_shift;
2830  nbit &= bm::set_word_mask;
2831  block[nword] |= (1u << nbit);
2832  } // for k
2833 }
2834 
2835 
2836 /**
2837  Experimental. Set number of bits in AVX register from 0 to i
2838  [ 000000 00000 0000000 00011 11111 ] - i = 7
2839 */
2840 inline
2841 __m256i avx2_setbit_to256(unsigned i)
2842 {
2843  __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2844  __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
2845  __m256i maskFF = _mm256_set1_epi32(-1);
2846  __m256i maskZ = _mm256_setzero_si256();
2847 
2848  __m256i v0 = _mm256_set1_epi32(i);
2849  __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
2850  __m256i k1 = _mm256_sllv_epi32(maskFF, s0);
2851 
2852  {
2853  __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
2854  cmp_eq = _mm256_xor_si256(maskFF, cmp_eq); // invert: != 0 mask
2855  k1 = _mm256_xor_si256(k1, cmp_eq); // [ 0 0 0 0 0 0 3 0 ]
2856  }
2857 
2858  __m256i cmp_gt = _mm256_cmpgt_epi32 (stride_idx2, v0);
2859  cmp_gt = _mm256_xor_si256(maskFF, cmp_gt); // invert as GT == LT|EQ (LE)
2860  __m256i r = _mm256_xor_si256(k1, cmp_gt); // invert all full words (right)
2861 
2862  return r;
2863 }
2864 
2865 
2866 
2867 /**
2868  Experimental (test) function to do SIMD vector search (lower bound)
2869  in sorted, growing array
2870  @ingroup AVX2
2871 
2872  \internal
2873 */
2874 inline
2875 int avx2_cmpge_u32(__m256i vect8, unsigned value)
2876 {
2877  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
2878  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
2879  //
2880  __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2881  __m256i mm_val = _mm256_set1_epi32(value);
2882 
2883  __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8); // (signed) vect4 - 0x80000000
2884  __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8); // (signed) mm_val - 0x80000000
2885 
2886  __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
2887  __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);
2888 
2889  __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
2890  int mask = _mm256_movemask_epi8(cmp_mask_ge);
2891  if (mask)
2892  {
2893  int bsf = bm::bsf_asm32(mask); // could use lzcnt()
2894  return bsf / 4;
2895  }
2896  return -1;
2897 }
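
A hedged usage sketch for the signed-comparison trick above: search one 8-element window of a sorted unsigned array for the first element >= value. The wrapper name is hypothetical; illustration only.

#include <immintrin.h>

// Editorial illustration: lane index of the first element >= value,
// or -1 if all 8 elements are smaller.
inline int first_ge_in_window8(const unsigned* sorted8, unsigned value)
{
    __m256i v = _mm256_loadu_si256((const __m256i*)sorted8);
    return bm::avx2_cmpge_u32(v, value);
}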
2898 
2899 /**
2900  Experimental (test) function to do SIMD vector search
2901  in sorted, growing array
2902  @ingroup AVX2
2903 
2904  \internal
2905 */
2906 inline
2907 int avx2_cmpge_u16(__m256i vect16, unsigned short value)
2908 {
2909  __m256i mZ = _mm256_setzero_si256();
2910  __m256i mVal = _mm256_set1_epi16(value);
2911 
2912  // subs_epu16 - unsigned subtraction with saturation, gives 0u if (a - b) < 0
2913  __m256i mSub = _mm256_subs_epu16(mVal, vect16);
2914  __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2915  unsigned mask = _mm256_movemask_epi8(mge_mask);
2916  if (mask)
2917  {
2918  int lz = _tzcnt_u32(mask);
2919  return lz / 2;
2920  }
2921  return -1;
2922 }
2923 
2924 
2925 /**
2926  Hybrid binary search, starts as binary, then switches to scan
2927 
2928  NOTE: AVX code uses _mm256_subs_epu16 - saturated subtraction,
2929  which gives 0 when A < B (instead of a negative value).
2930 
2931  \param buf - GAP buffer pointer.
2932  \param pos - index of the element.
2933  \param is_set - output. GAP value (0 or 1).
2934  \return GAP index OR bit-test
2935 
2936  @ingroup AVX2
2937 */
2938 template<bool RET_TEST=false>
2939 unsigned avx2_gap_bfind(const unsigned short* BMRESTRICT buf,
2940  unsigned pos, unsigned* BMRESTRICT is_set)
2941 {
2942  BM_ASSERT(is_set || RET_TEST);
2943 
2944  const unsigned linear_cutoff = 64;//48;
2945  const unsigned unroll_factor = 16;
2946 
2947  BM_ASSERT(pos < bm::gap_max_bits);
2948 
2949  unsigned res;
2950  unsigned start = 1;
2951  unsigned end = ((*buf) >> 3);
2952 
2953  const unsigned arr_end = end + 1;
2954  if (end <= unroll_factor) // too small for a full AVX stride
2955  {
2956  for (; true; ++start)
2957  if (buf[start] >= pos)
2958  goto ret;
2959  BM_ASSERT(0);
2960  }
2961 
2962  do
2963  {
2964  unsigned dsize = end - start;
2965  for (; dsize >= 64; dsize = end - start)
2966  {
2967  unsigned mid = (start + end) >> 1;
2968  if (buf[mid] < pos)
2969  start = mid+1;
2970  else
2971  end = mid;
2972  if (buf[mid = (start + end) >> 1] < pos)
2973  start = mid+1;
2974  else
2975  end = mid;
2976  if (buf[mid = (start + end) >> 1] < pos)
2977  start = mid+1;
2978  else
2979  end = mid;
2980  if (buf[mid = (start + end) >> 1] < pos)
2981  start = mid+1;
2982  else
2983  end = mid;
2984  BM_ASSERT(buf[end] >= pos);
2985  } // for
2986 
2987  dsize = end - start + 1;
2988  if (dsize < linear_cutoff)
2989  {
2990  // set wider scan window to possibly over-read the range,
2991  // but stay within allocated block memory
2992  //
2993  dsize = arr_end - start;
2994 
2995  __m256i mZ = _mm256_setzero_si256();
2996  __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
2997  __m256i vect16, mSub, mge_mask;
2998 
2999  for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
3000  start < len_unr; start += unroll_factor)
3001  {
3002  vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
3003  mSub = _mm256_subs_epu16(mPos, vect16);
3004  mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3005  if (int mask = _mm256_movemask_epi8(mge_mask); mask)
3006  {
3007  int lz = _tzcnt_u32(mask);
3008  start += (lz >> 1);
3009  goto ret;
3010  }
3011  } // for
3012 // if (unsigned tail = unroll_factor-(end-start); start > tail+1)
3013  {
3014  start = end - 15;
3015  BM_ASSERT(buf[start + 15] >= pos);
3016  vect16 = _mm256_loadu_si256((__m256i*)(&buf[start])); //16x u16s
3017  mSub = _mm256_subs_epu16(mPos, vect16);
3018  mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3019  int mask = _mm256_movemask_epi8(mge_mask);
3020  BM_ASSERT(mask); // the result MUST be here at this point
3021  int lz = _tzcnt_u32(mask);
3022  start += (lz >> 1);
3023  goto ret;
3024  }
3025  for (; true; ++start)
3026  if (buf[start] >= pos)
3027  goto ret;
3028  BM_ASSERT(0);
3029  }
3030 
3031  if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3032  start = mid + 1;
3033  else
3034  end = mid;
3035  if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3036  start = mid + 1;
3037  else
3038  end = mid;
3039  } while (1);
3040 ret:
3041  res = ((*buf) & 1) ^ ((start-1) & 1);
3042  if constexpr(RET_TEST)
3043  return res;
3044  else
3045  {
3046  *is_set = res;
3047  return start;
3048  }
3049 }
3050 
3051 
3052 /**
3053  Hybrid binary search, starts as binary, then switches to scan
3054  @ingroup AVX2
3055 */
3056 inline
3057 unsigned avx2_gap_test(const unsigned short* BMRESTRICT buf, unsigned pos)
3058 {
3059  return bm::avx2_gap_bfind<true>(buf, pos, 0);
3060 }
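
For reference, the scalar logic both GAP search routines above boil down to: scan the ascending run-end positions for the first element >= pos, then derive the bit value from the run's parity and the block's start bit stored in bit 0 of the header word. A linear-scan sketch with a hypothetical helper name; the real code is a hybrid binary/vector search, and a valid GAP block always ends with 65535 so the scan terminates.

// Editorial illustration: scalar reference for avx2_gap_test()/avx2_gap_bfind().
inline unsigned gap_test_scalar(const unsigned short* buf, unsigned pos)
{
    unsigned start = 1;                 // buf[0] is the header word
    while (buf[start] < pos)            // run end positions are ascending
        ++start;
    return ((*buf) & 1u) ^ ((start - 1) & 1u);   // parity gives the bit value
}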
3061 
3062 /**
3063  lower bound (greater or equal) linear scan in ascending order sorted array
3064  @ingroup AVX2
3065  \internal
3066 */
3067 inline
3068 unsigned avx2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
3069  unsigned target,
3070  unsigned from,
3071  unsigned to)
3072 {
3073  // a > b (unsigned, 32-bit) is the same as (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
3074  // see more at:
3075  // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
3076 
3077  const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base
3078 
3079  unsigned unroll_factor = 8;
3080  unsigned len = to - from + 1;
3081  unsigned len_unr = len - (len % unroll_factor);
3082 
3083  __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
3084  __m256i vect_target = _mm256_set1_epi32(target);
3085  __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000
3086 
3087  int mask;
3088  __m256i vect80, norm_vect80, cmp_mask_ge;
3089 
3090  unsigned k = 0;
3091  for (; k < len_unr; k += unroll_factor)
3092  {
3093  vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k])); // 8 u32s
3094  norm_vect80 = _mm256_sub_epi32(vect80, mask0x8); // (signed) vect4 - 0x80000000
3095 
3096  cmp_mask_ge = _mm256_or_si256( // GT | EQ
3097  _mm256_cmpgt_epi32(norm_vect80, norm_target),
3098  _mm256_cmpeq_epi32(vect80, vect_target)
3099  );
3100  mask = _mm256_movemask_epi8(cmp_mask_ge);
3101  if (mask)
3102  {
3103  int bsf = bm::bsf_asm32(mask); //_bit_scan_forward(mask);
3104  return from + k + (bsf / 4);
3105  }
3106  } // for
3107 
3108  for (; k < len; ++k)
3109  {
3110  if (arr_base[k] >= target)
3111  return from + k;
3112  }
3113  return to + 1;
3114 }
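
Functionally this is std::lower_bound over the inclusive range arr[from..to], returning an absolute index, or to+1 when no element is >= target. A minimal equivalence sketch with a hypothetical helper name.

#include <algorithm>

// Editorial illustration: standard-library equivalent of the scan above.
inline unsigned lower_bound_scan_ref(const unsigned* arr, unsigned target,
                                     unsigned from, unsigned to)
{
    const unsigned* first = arr + from;
    const unsigned* last  = arr + to + 1;          // inclusive 'to'
    return unsigned(std::lower_bound(first, last, target) - arr); // to+1 if none
}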
3115 
3116 
3117 /*!
3118  AVX2 bit block gather-scatter
3119 
3120  @param arr - destination array to set bits
3121  @param blk - source bit-block
3122  @param idx - gather index array
3123  @param size - gather array size
3124  @param start - gather start index
3125  @param bit_idx - bit to set in the target array
3126 
3127  \internal
3128 
3129  C algorithm:
3130 
3131  for (unsigned k = start; k < size; ++k)
3132  {
3133  nbit = unsigned(idx[k] & bm::set_block_mask);
3134  nword = unsigned(nbit >> bm::set_word_shift);
3135  mask0 = 1u << (nbit & bm::set_word_mask);
3136  arr[k] |= TRGW(bool(blk[nword] & mask0) << bit_idx);
3137  }
3138 
3139 */
3140 inline
3141 void avx2_bit_block_gather_scatter(unsigned* BMRESTRICT arr,
3142  const unsigned* BMRESTRICT blk,
3143  const unsigned* BMRESTRICT idx,
3144  unsigned size,
3145  unsigned start,
3146  unsigned bit_idx)
3147 {
3148  const unsigned unroll_factor = 8;
3149  const unsigned len = (size - start);
3150  const unsigned len_unr = len - (len % unroll_factor);
3151 
3152  __m256i sb_mask = _mm256_set1_epi32(bm::set_block_mask);
3153  __m256i sw_mask = _mm256_set1_epi32(bm::set_word_mask);
3154  __m256i maskFF = _mm256_set1_epi32(~0u);
3155 
3156  __m256i mask_tmp, mask_0;
3157 
3158  unsigned BM_ALIGN32 mword_v[8] BM_ALIGN32ATTR;
3159 
3160  unsigned k = 0, mask, w_idx;
3161  for (; k < len_unr; k+=unroll_factor)
3162  {
3163  __m256i nbitA, nwordA;
3164  const unsigned base = start + k;
3165  __m256i* idx_ptr = (__m256i*)(idx+base); // idx[base]
3166 
3167  nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask); // nbit = idx[base] & bm::set_block_mask
3168  nwordA = _mm256_srli_epi32 (nbitA, bm::set_word_shift); // nword = nbit >> bm::set_word_shift
3169 
3170  // shuffle + permute to prepare comparison vector
3171  mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
3172  mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
3173  mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
3174  _mm256_store_si256((__m256i*)mword_v, nwordA);
3175 
3176  if (mask == ~0u) // all idxs belong to the same word: avoid (costly) gather
3177  {
3178  w_idx = mword_v[0];
3179  mask_tmp = _mm256_set1_epi32(blk[w_idx]); // use broadcast
3180  }
3181  else // gather for: blk[nword] (.. & mask0 )
3182  {
3183  mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
3184  blk[mword_v[5]], blk[mword_v[4]],
3185  blk[mword_v[3]], blk[mword_v[2]],
3186  blk[mword_v[1]], blk[mword_v[0]]);
3187  }
3188 
3189  // mask0 = 1u << (nbit & bm::set_word_mask);
3190  //
3191  __m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
3192  __m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
3193  mask_0 = _mm256_sllv_epi32(mask1, shiftA);
3194 
3195  mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
3196  if (!_mm256_testz_si256(mask_tmp, mask_tmp)) // AND tests empty
3197  {
3198  __m256i* target_ptr = (__m256i*)(arr+base); // arr[base]
3199  // bool(blk[nword] ... )
3200  __m256i maskZ = _mm256_xor_si256(maskFF, maskFF); // all zero
3201  mask1 = _mm256_slli_epi32(mask1, bit_idx); // << bit_idx
3202  mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ); // set 0xFF if==0
3203  mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF); // invert
3204  mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
3205  _mm256_storeu_si256 (target_ptr, // arr[base] |= MASK_EXPR
3206  _mm256_or_si256 (mask_tmp,
3207  _mm256_loadu_si256(target_ptr)));
3208  }
3209 
3210  } // for
3211 
3212  for (; k < len; ++k)
3213  {
3214  const unsigned base = start + k;
3215  unsigned nbit = unsigned(idx[base] & bm::set_block_mask);
3216  arr[base] |= unsigned(bool(blk[nbit >> bm::set_word_shift] & (1u << (nbit & bm::set_word_mask))) << bit_idx);
3217  }
3218 
3219 }
3220 
3221 /**
3222  Convert bit block to GAP block
3223  @ingroup AVX2
3224  \internal
3225 */
3226 inline
3227 unsigned avx2_bit_to_gap(gap_word_t* BMRESTRICT dest,
3228  const unsigned* BMRESTRICT block,
3229  unsigned dest_len)
3230 {
3231  const unsigned* BMRESTRICT block_end = block + bm::set_block_size;
3232  gap_word_t* BMRESTRICT pcurr = dest;
3233  gap_word_t* BMRESTRICT end = dest + dest_len; (void)end;
3234 
3235  unsigned bitval = (*block) & 1u;
3236  *pcurr++ = bm::gap_word_t(bitval);
3237  *pcurr = 0;
3238  unsigned bit_idx = 0;
3239 
3240  const unsigned vCAP = 64; // 64-bit system
3241  __m256i maskZ = _mm256_set1_epi32(0);
3242 
3243  for (; block < block_end; block += 8)
3244  {
3245  unsigned k = 0;
3246  if (!bitval)
3247  {
3248  // check number of trailing 64-bit words using AVX compare
3249  __m256i accA = _mm256_load_si256((__m256i*)block); // 4x u64s
3250  __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
3251  unsigned mask = ~_mm256_movemask_epi8(cmpA);
3252  if (!mask)
3253  {
3254  bit_idx += 256;
3255  continue;
3256  }
3257  unsigned w64_idx = _tzcnt_u32(mask);
3258  k = w64_idx / 8; // 8 byte word offset
3259  bit_idx += k * vCAP;
3260  }
3261 
3262  for (; k < 4; ++k)
3263  {
3264  bm::id64_t val = (((bm::id64_t*)block)[k]);
3265 
3266  if (!val || val == ~0ull)
3267  {
3268  // branchless if
3269  bool cmp = (bool(bitval) != bool(val));
3270  unsigned mask = ~(cmp - 1u);
3271  *pcurr = mask & (gap_word_t)(bit_idx-cmp);
3272  bitval ^= unsigned(cmp);
3273  unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3274  pcu += mask & sizeof(gap_word_t);
3275  pcurr = reinterpret_cast<gap_word_t*>(pcu);
3276  bit_idx += vCAP;
3277  continue;
3278  } // if
3279 
3280 
3281  // process "0100011" word
3282  //
3283  unsigned bits_consumed = 0;
3284  do
3285  {
3286  unsigned tz = 1u;
3287  if (bitval != (val & tz))
3288  {
3289  bitval ^= tz;
3290  *pcurr++ = (gap_word_t)(bit_idx-tz);
3291 
3292  BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3293  BM_ASSERT(pcurr != end);
3294  }
3295  else // match, find the next idx
3296  {
3297  tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);
3298  }
3299 
3300  bool cmp = ((bits_consumed+=tz) < vCAP);
3301  bit_idx += tz;
3302  val >>= tz;
3303 
3304  if (!val)
3305  {
3306  tz = ~(cmp - 1u); // generate 0xFFFF or 0x0000 mask
3307  *pcurr = tz & (gap_word_t)(bit_idx-cmp);
3308  bitval ^= unsigned(cmp);
3309  bit_idx += tz & (vCAP - bits_consumed);
3310  unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3311  pcu += tz & sizeof(gap_word_t);
3312  pcurr = reinterpret_cast<gap_word_t*>(pcu);
3313 
3314  BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3315  BM_ASSERT(pcurr != end);
3316  break;
3317  }
3318  } while (1);
3319  } // for k
3320 
3321  } // for block < end
3322 
3323  *pcurr = (gap_word_t)(bit_idx-1);
3324  unsigned len = (unsigned)(pcurr - dest);
3325  *dest = (gap_word_t)((*dest & 7) + (len << 3));
3326  return len;
3327 }
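
The conversion above emits, for every run of equal bits, the position of the run's last bit; dest[0] keeps the starting bit value in bit 0 and the resulting length in bits 3 and up, and the final element is always the last bit index of the block. A simplified scalar sketch without the capacity checks of the real code; the helper name is hypothetical.

// Editorial illustration: scalar reference for avx2_bit_to_gap().
inline unsigned bit_to_gap_scalar(unsigned short* dest,
                                  const unsigned* block,
                                  unsigned nbits)      // e.g. 65536
{
    unsigned short* p = dest;
    unsigned bitval = block[0] & 1u;
    *p++ = (unsigned short)bitval;                     // header placeholder
    for (unsigned i = 1; i < nbits; ++i)
    {
        unsigned b = (block[i >> 5] >> (i & 31u)) & 1u;
        if (b != bitval)
        {
            *p++ = (unsigned short)(i - 1);            // close the previous run
            bitval = b;
        }
    }
    *p = (unsigned short)(nbits - 1);                  // closing sentinel
    unsigned len = unsigned(p - dest);
    *dest = (unsigned short)((*dest & 7u) + (len << 3)); // keep flags, store len
    return len;
}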
3328 
3329 /**
3330  Build partial XOR product of 2 bit-blocks using digest mask
3331 
3332  @param target_block - target := block ^ xor_block
3333  @param block - arg1
3334  @param xor_block - arg2
3335  @param digest - mask for each block wave to XOR (1) or just copy (0)
3336 
3337  @ingroup AVX2
3338  @internal
3339 */
3340 inline
3341 void avx2_bit_block_xor(bm::word_t* target_block,
3342  const bm::word_t* block, const bm::word_t* xor_block,
3343  bm::id64_t digest)
3344 {
3345  for (unsigned i = 0; i < bm::block_waves; ++i)
3346  {
3347  const bm::id64_t mask = (1ull << i);
3348  unsigned off = (i * bm::set_block_digest_wave_size);
3349  const __m256i* sub_block = (__m256i*) (block + off);
3350  __m256i* t_sub_block = (__m256i*)(target_block + off);
3351 
3352  if (digest & mask) // XOR filtered sub-block
3353  {
3354  const __m256i* xor_sub_block = (__m256i*) (xor_block + off);
3355  __m256i mA, mB, mC, mD;
3356  mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3357  _mm256_load_si256(xor_sub_block));
3358  mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3359  _mm256_load_si256(xor_sub_block+1));
3360  mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3361  _mm256_load_si256(xor_sub_block+2));
3362  mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3363  _mm256_load_si256(xor_sub_block+3));
3364 
3365  _mm256_store_si256(t_sub_block, mA);
3366  _mm256_store_si256(t_sub_block+1, mB);
3367  _mm256_store_si256(t_sub_block+2, mC);
3368  _mm256_store_si256(t_sub_block+3, mD);
3369  }
3370  else // just copy source
3371  {
3372  _mm256_store_si256(t_sub_block , _mm256_load_si256(sub_block));
3373  _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
3374  _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
3375  _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
3376  }
3377  } // for i
3378 }
3379 
3380 
3381 /**
3382  Build partial XOR product of 2 bit-blocks using digest mask
3383 
3384  @param target_block - target ^= xor_block
3385  @param xor_block - arg1
3386  @param digest - mask for each block wave to XOR (1)
3387 
3388  @ingroup AVX2
3389  @internal
3390 */
3391 inline
3392 void avx2_bit_block_xor_2way(bm::word_t* target_block,
3393  const bm::word_t* xor_block,
3394  bm::id64_t digest) BMNOEXCEPT
3395 {
3396  while (digest)
3397  {
3398  bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
3399  unsigned wave = (unsigned)_mm_popcnt_u64(t - 1);
3400  unsigned off = wave * bm::set_block_digest_wave_size;
3401 
3402  const __m256i* sub_block = (const __m256i*) (xor_block + off);
3403  __m256i* t_sub_block = (__m256i*)(target_block + off);
3404 
3405  __m256i mA, mB, mC, mD;
3406  mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3407  _mm256_load_si256(t_sub_block));
3408  mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3409  _mm256_load_si256(t_sub_block+1));
3410  mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3411  _mm256_load_si256(t_sub_block+2));
3412  mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3413  _mm256_load_si256(t_sub_block+3));
3414 
3415  _mm256_store_si256(t_sub_block, mA);
3416  _mm256_store_si256(t_sub_block+1, mB);
3417  _mm256_store_si256(t_sub_block+2, mC);
3418  _mm256_store_si256(t_sub_block+3, mD);
3419 
3420  digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
3421  } // while
3422 
3423 }
3424 
3425 
3426 
3427 #ifdef __GNUG__
3428 #pragma GCC diagnostic pop
3429 #endif
3430 
3431 
3432 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
3433  avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3434 
3435 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
3436  avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3437 
3438 #define VECT_BITCOUNT(first, last) \
3439  avx2_bit_count((__m256i*) (first), (__m256i*) (last))
3440 
3441 #define VECT_BITCOUNT_AND(first, last, mask) \
3442  avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3443 
3444 #define VECT_BITCOUNT_OR(first, last, mask) \
3445  avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3446 
3447 #define VECT_BITCOUNT_XOR(first, last, mask) \
3448  avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3449 
3450 #define VECT_BITCOUNT_SUB(first, last, mask) \
3451  avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3452 
3453 #define VECT_INVERT_BLOCK(first) \
3454  avx2_invert_block((__m256i*)first);
3455 
3456 #define VECT_AND_BLOCK(dst, src) \
3457  avx2_and_block((__m256i*) dst, (const __m256i*) (src))
3458 
3459 #define VECT_AND_DIGEST(dst, src) \
3460  avx2_and_digest((__m256i*) dst, (const __m256i*) (src))
3461 
3462 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
3463  avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3464 
3465 #define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
3466  avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3467 
3468 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3469  avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3470 
3471 #define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
3472  avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3473 
3474 #define VECT_OR_BLOCK(dst, src) \
3475  avx2_or_block((__m256i*) dst, (__m256i*) (src))
3476 
3477 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3478  avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3479 
3480 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
3481  avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3482 
3483 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3484  avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3485 
3486 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
3487  avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))
3488 
3489 #define VECT_SUB_BLOCK(dst, src) \
3490  avx2_sub_block((__m256i*) dst, (__m256i*) (src))
3491 
3492 #define VECT_SUB_DIGEST(dst, src) \
3493  avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))
3494 
3495 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
3496  avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3497 
3498 #define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3499  avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3500 
3501 #define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
3502  avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3503 
3504 #define VECT_XOR_BLOCK(dst, src) \
3505  avx2_xor_block((__m256i*) dst, (__m256i*) (src))
3506 
3507 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
3508  avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3509 
3510 #define VECT_COPY_BLOCK(dst, src) \
3511  avx2_copy_block((__m256i*) dst, (__m256i*) (src))
3512 
3513 #define VECT_COPY_BLOCK_UNALIGN(dst, src) \
3514  avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))
3515 
3516 #define VECT_STREAM_BLOCK(dst, src) \
3517  avx2_stream_block((__m256i*) dst, (__m256i*) (src))
3518 
3519 #define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
3520  avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))
3521 
3522 #define VECT_SET_BLOCK(dst, value) \
3523  avx2_set_block((__m256i*) dst, (value))
3524 
3525 #define VECT_IS_ZERO_BLOCK(dst) \
3526  avx2_is_all_zero((__m256i*) dst)
3527 
3528 #define VECT_IS_ONE_BLOCK(dst) \
3529  avx2_is_all_one((__m256i*) dst)
3530 
3531 #define VECT_IS_DIGEST_ZERO(start) \
3532  avx2_is_digest_zero((__m256i*)start)
3533 
3534 #define VECT_BLOCK_SET_DIGEST(dst, val) \
3535  avx2_block_set_digest((__m256i*)dst, val)
3536 
3537 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
3538  avx2_lower_bound_scan_u32(arr, target, from, to)
3539 
3540 #define VECT_SHIFT_L1(b, acc, co) \
3541  avx2_shift_l1((__m256i*)b, acc, co)
3542 
3543 #define VECT_SHIFT_R1(b, acc, co) \
3544  avx2_shift_r1((__m256i*)b, acc, co)
3545 
3546 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
3547  avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)
3548 
3549 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
3550  avx2_idx_arr_block_lookup(idx, size, nb, start)
3551 
3552 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
3553  avx2_set_block_bits3(block, idx, start, stop)
3554 
3555 #define VECT_BLOCK_CHANGE(block, size) \
3556  avx2_bit_block_calc_change((__m256i*)block, size)
3557 
3558 #define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
3559  avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)
3560 
3561 #define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
3562  avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)
3563 
3564 #define VECT_BIT_TO_GAP(dest, src, dest_len) \
3565  avx2_bit_to_gap(dest, src, dest_len)
3566 
3567 #define VECT_BIT_FIND_FIRST(src1, off, pos) \
3568  avx2_bit_find_first((__m256i*) src1, off, pos)
3569 
3570 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
3571  avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)
3572 
3573 #define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
3574  avx2_bit_block_xor(t, src, src_xor, d)
3575 
3576 #define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
3577  avx2_bit_block_xor_2way(t, src_xor, d)
3578 
3579 #define VECT_GAP_BFIND(buf, pos, is_set) \
3580  avx2_gap_bfind(buf, pos, is_set)
3581 
3582 #define VECT_GAP_TEST(buf, pos) \
3583  avx2_gap_test(buf, pos)
3584 
3585 
3586 #define VECT_BIT_COUNT_DIGEST(blk, d) \
3587  avx2_bit_block_count(blk, d)
3588 
3589 
3590 } // namespace
3591 
3592 
3593 
3594 
3595 #endif
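The VECT_* macros above form the dispatch layer the library's generic bit-block algorithms compile against when AVX2 optimization is active. The sketch below drives one of the underlying kernels through that layer; it is an illustration only, assuming the usual BitMagic block geometry (bm::set_block_size 32-bit words per block, 32-byte alignment) and that the boolean return of the 2-way OR reports an all-ones result block, as the reference entries below suggest.

    // Minimal sketch: OR two bit-blocks through the kernel behind VECT_OR_BLOCK_2WAY.
    // Compile with AVX2/BMI2 enabled (e.g. -mavx2 -mbmi2); the scaffolding is hypothetical.
    #include <cstdio>
    #include "bmavx2.h"

    int main()
    {
        alignas(32) static bm::word_t a[bm::set_block_size]   = {0};
        alignas(32) static bm::word_t b[bm::set_block_size]   = {0};
        alignas(32) static bm::word_t dst[bm::set_block_size] = {0};

        a[0]  = 1u;           // bit 0
        b[10] = 0x80000000u;  // bit 351 (word 10, bit 31)

        // dst = a | b; the return value is assumed to report an all-ones block
        bool all_one = VECT_OR_BLOCK_2WAY(dst, a, b);
        std::printf("all_one=%d dst[0]=%u dst[10]=%08x\n",
                    (int)all_one, dst[0], dst[10]);
        return 0;
    }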
#define BM_AVX2_POPCNT_PROLOG
Definition: bmavx2.h:140
#define BM_CSA256(h, l, a, b, c)
Definition: bmavx2.h:117
#define BM_AVX2_BIT_COUNT(ret, v)
Definition: bmavx2.h:124
Definitions (internal)
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BM_ALIGN32
Definition: bmdef.h:306
#define BMFORCEINLINE
Definition: bmdef.h:213
#define BM_ASSERT
Definition: bmdef.h:139
#define BM_ALIGN32ATTR
Definition: bmdef.h:307
Bit manipulation primitives (internal)
void avx2_copy_block(__m256i *dst, const __m256i *src)
AVX2 block copy dst = *src.
Definition: bmavx2.h:1503
unsigned avx2_and_block(__m256i *dst, const __m256i *src)
AND array elements against another array dst &= *src.
Definition: bmavx2.h:496
void avx2_xor_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
XOR array elements to specified mask dst = *src ^ mask.
Definition: bmavx2.h:447
bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
Definition: bmavx2.h:1816
void avx2_bit_block_calc_change_bc(const __m256i *block, unsigned *gcount, unsigned *bcount)
Definition: bmavx2.h:2251
unsigned avx2_gap_test(const unsigned short *buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
Definition: bmavx2.h:3057
bool avx2_is_all_one(const __m256i *block)
check if block is all one bits
Definition: bmavx2.h:1767
bool avx2_and_or_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
Definition: bmavx2.h:604
bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
OR array elements against another unaligned array dst |= *src.
Definition: bmavx2.h:888
bool avx2_or_block_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR array elements against two other arrays: dst |= *src1 | *src2.
Definition: bmavx2.h:987
bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all the same (NULL or FULL)
Definition: bmavx2.h:1829
unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
AND array elements against another array (unaligned) dst &= *src.
Definition: bmavx2.h:777
bool avx2_and_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride 2 way dst = *src1 & *src2.
Definition: bmavx2.h:573
bool avx2_or_block(__m256i *dst, const __m256i *src)
OR array elements against another array dst |= *src.
Definition: bmavx2.h:835
unsigned avx2_xor_block(__m256i *dst, const __m256i *src)
XOR block against another dst ^= *src.
Definition: bmavx2.h:1108
bool avx2_sub_digest(__m256i *dst, const __m256i *src)
SUB (AND NOT) block digest stride dst &= ~*src.
Definition: bmavx2.h:1250
bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND NOT bit count for two aligned bit-blocks.
Definition: bmavx2.h:413
unsigned avx2_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set)
Hybrid binary search, starts as binary, then switches to scan.
Definition: bmavx2.h:2939
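The "binary, then scan" strategy named here can be pictured with a plain scalar sketch: narrow the range by binary search until it is small, then finish with a linear pass, which is the part the AVX2 code vectorizes (e.g. via avx2_cmpge_u16). This illustrates the idea only, not the GAP-buffer code itself; the cutoff value is an assumption.

    // Illustrative scalar hybrid lower-bound over a sorted array (half-open [from, to)).
    static unsigned hybrid_lower_bound(const unsigned short* arr,
                                       unsigned short target,
                                       unsigned from, unsigned to)
    {
        const unsigned linear_cutoff = 16;          // window where scanning wins (assumed)
        while (to - from > linear_cutoff)           // phase 1: binary search
        {
            unsigned mid = from + (to - from) / 2;
            if (arr[mid] < target)
                from = mid + 1;
            else
                to = mid;
        }
        for (; from < to; ++from)                   // phase 2: linear (SIMD-friendly) scan
            if (arr[from] >= target)
                break;
        return from;
    }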
unsigned avx2_bit_to_gap(gap_word_t *dest, const unsigned *block, unsigned dest_len)
Convert bit block to GAP block.
Definition: bmavx2.h:3227
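A hedged usage sketch of the bit-to-GAP conversion; the return-value convention (resulting GAP length, 0 when dest_len is too small) and the buffer sizes are assumptions made for illustration.

    alignas(32) static bm::word_t bit_blk[bm::set_block_size] = {0};
    bit_blk[0]   = 0x3u;    // bits 0..1
    bit_blk[100] = 0x10u;   // bit 3204

    bm::gap_word_t gap_buf[128];
    unsigned glen = bm::avx2_bit_to_gap(gap_buf, bit_blk, 128);
    if (!glen)
    {
        // assumed overflow signal: GAP form does not fit, keep the block as bits
    }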
bool avx2_sub_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
SUB block digest stride.
Definition: bmavx2.h:1310
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
Definition: bmavx2.h:1903
bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
Definition: bmavx2.h:1805
unsigned avx2_bit_block_calc_change(const __m256i *block, unsigned size)
Definition: bmavx2.h:2083
bool avx2_sub_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
Definition: bmavx2.h:1280
bool avx2_sub_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
SUB block digest stride.
Definition: bmavx2.h:1392
void avx2_andnot_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
Invert array elements and AND them with the specified mask: dst = ~*src & mask.
Definition: bmavx2.h:472
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
Definition: bmavx2.h:1752
bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
XOR bit count for two aligned bit-blocks.
Definition: bmavx2.h:368
bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 Instructions".
Definition: bmavx2.h:156
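A short usage sketch of the Harley-Seal block popcount; the block geometry (bm::set_block_size 32-bit words, i.e. set_block_size/8 256-bit lanes) comes from bmconst.h, everything else is illustrative.

    alignas(32) static bm::word_t blk[bm::set_block_size] = {0};
    blk[0]  = 0xFFu;        // 8 bits
    blk[42] = 0x80000001u;  // 2 bits

    const __m256i* w   = (const __m256i*) blk;
    const __m256i* end = w + (bm::set_block_size / 8);  // 8 words per 256-bit lane
    bm::id_t cnt = bm::avx2_bit_count(w, end);          // expected: 10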
bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND bit count for two aligned bit-blocks.
Definition: bmavx2.h:290
void avx2_stream_block(__m256i *dst, const __m256i *src)
AVX2 block copy with non-temporal (stream) stores: dst = *src.
Definition: bmavx2.h:1589
void avx2_copy_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) dst = *src.
Definition: bmavx2.h:1545
void avx2_invert_block(__m256i *dst)
Invert bit-block in place: dst = ~*dst (i.e. XOR with all-ones).
Definition: bmavx2.h:1677
void avx2_bit_block_calc_xor_change(const __m256i *block, const __m256i *xor_block, unsigned size, unsigned *gcount, unsigned *bcount)
Definition: bmavx2.h:2154
bool avx2_shift_r1_and(__m256i *block, bm::word_t co1, const __m256i *mask_block, bm::id64_t *digest)
fused block shift right by 1 plus AND
Definition: bmavx2.h:1959
bool avx2_bit_find_first_diff(const __m256i *block1, const __m256i *block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
Definition: bmavx2.h:2336
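Usage sketch for the first-difference scan; it assumes that pos receives the bit index (0..65535) of the first differing bit and that the return value reports whether any difference was found.

    alignas(32) static bm::word_t x[bm::set_block_size] = {0};
    alignas(32) static bm::word_t y[bm::set_block_size] = {0};
    y[3] = 0x4u;                            // bit 98 differs (word 3, bit 2)

    unsigned pos = 0;
    bool diff = bm::avx2_bit_find_first_diff((const __m256i*) x,
                                             (const __m256i*) y, &pos);
    // expected under the stated assumption: diff == true, pos == 98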
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
Definition: bmavx2.h:232
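The digest variant inspects only the 1024-bit waves whose bits are set in the digest mask (64 waves per block, per bmconst.h). The sketch below assumes that convention, so an all-ones digest reproduces the full popcount.

    alignas(32) static bm::word_t blk2[bm::set_block_size] = {0};
    blk2[0] = 0x0Fu;                       // 4 bits, all in wave 0
    bm::id64_t digest = ~0ull;             // select all 64 waves (assumption)
    bm::id_t c = bm::avx2_bit_block_count(blk2, digest);   // expected: 4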
bool avx2_and_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride.
Definition: bmavx2.h:727
bool avx2_and_digest(__m256i *dst, const __m256i *src)
AND block digest stride dst &= *src.
Definition: bmavx2.h:543
unsigned avx2_sub_block(__m256i *dst, const __m256i *src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
Definition: bmavx2.h:1204
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmavx2.h:3341
void avx2_set_block(__m256i *dst, bm::word_t value)
AVX2 block memset dst = value.
Definition: bmavx2.h:1477
bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFFF... (full-block marker)
Definition: bmavx2.h:1791
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
Definition: bmavx2.h:3392
bool avx2_or_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR 2 arrays and copy to the destination: dst = *src1 | *src2.
Definition: bmavx2.h:941
unsigned avx2_xor_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
3-operand XOR: dst = *src1 ^ *src2
Definition: bmavx2.h:1154
bool avx2_is_digest_zero(const __m256i *block)
check if digest stride is all zero bits
Definition: bmavx2.h:1738
bool avx2_bit_find_first(const __m256i *block, unsigned off, unsigned *pos)
Find first bit set.
Definition: bmavx2.h:2394
void avx2_stream_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) with non-temporal (stream) stores: dst = *src.
Definition: bmavx2.h:1631
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
Definition: bmavx2.h:1842
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in a sorted, growing array.
Definition: bmavx2.h:2907
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
Definition: bmavx2.h:2875
bool avx2_or_block_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
OR array elements against four other arrays: dst |= *src1 | *src2 | *src3 | *src4.
Definition: bmavx2.h:1039
bool avx2_is_all_zero(const __m256i *block)
check if block is all zero bits
Definition: bmavx2.h:1708
bool avx2_and_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
AND block digest stride.
Definition: bmavx2.h:659
unsigned avx2_lower_bound_scan_u32(const unsigned *arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
Definition: bmavx2.h:3068
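Usage sketch for the SIMD lower-bound scan; whether 'to' is an inclusive or exclusive bound is an assumption here, but the example is chosen so the answer is the same either way.

    static const unsigned sorted_arr[] = { 2, 5, 7, 11, 11, 19, 23, 42 };
    unsigned idx = bm::avx2_lower_bound_scan_u32(sorted_arr, 11, 0, 7);
    // expected: idx == 3 -- the first element >= 11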
namespace bm
Definition: bm.h:78
const unsigned set_block_digest_wave_size
Definition: bmconst.h:67
unsigned int word_t
Definition: bmconst.h:39
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
Definition: bmavx2.h:2492
__m256i avx2_setbit_to256(unsigned i)
Experimental.
Definition: bmavx2.h:2841
const unsigned set_block_mask
Definition: bmconst.h:57
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
Definition: bmavx2.h:2453
bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
Definition: bmavx2.h:337
void avx2_set_block_bits2(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition: bmavx2.h:2709
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
Definition: bmutil.h:335
void avx2_set_block_bits(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Definition: bmavx2.h:2533
void avx2_set_block_bits3(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
Definition: bmavx2.h:2748
const unsigned set_word_shift
Definition: bmconst.h:72
const unsigned set_block_size
Definition: bmconst.h:55
unsigned long long int id64_t
Definition: bmconst.h:35
const unsigned block_waves
Definition: bmconst.h:66
unsigned int id_t
Definition: bmconst.h:38
__m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX target by indexes (int4) from the source.
Definition: bmavx2.h:2662
unsigned short gap_word_t
Definition: bmconst.h:78
void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)
Definition: bmavx2.h:3141
const unsigned gap_max_bits
Definition: bmconst.h:81
const unsigned set_block_shift
Definition: bmconst.h:56
const unsigned set_word_mask
Definition: bmconst.h:73
unsigned long long bmi_blsi_u64(unsigned long long w)
Definition: bmutil.h:345