1 #ifndef BMAVX2__H__INCLUDED__
2 #define BMAVX2__H__INCLUDED__
79 void avx2_print256_u32(const char* prefix, const __m256i & value)
81 const size_t n = sizeof(__m256i) / sizeof(unsigned);
84 std::cout << prefix << " [ ";
85 for (int i = n-1; 1; --i)
91 std::cout << "]" << std::endl;
95 void avx2_print256_u16(const char* prefix, const __m256i & value)
97 const size_t n = sizeof(__m256i) / sizeof(unsigned short);
100 std::cout << prefix << " [ ";
101 for (int i = n-1; 1; --i)
107 std::cout << "]" << std::endl;
112 #pragma GCC diagnostic push
113 #pragma GCC diagnostic ignored "-Wconversion"
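// The two macros below are the standard AVX2 popcount building blocks used in this header:
// BM_CSA256 is a carry-save adder (h receives the carry bits, l the sum bits), and
// BM_AVX2_BIT_COUNT produces per-64-bit-lane bit counts via two 4-bit lookup tables
// (lookup1[x] = popcount(x) + 4, lookup2[x] = 4 - popcount(x)); _mm256_sad_epu8 then sums the
// absolute byte differences, which equals popcount(low nibble) + popcount(high nibble).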
117 #define BM_CSA256(h, l, a, b, c) \
119 __m256i u = _mm256_xor_si256(a, b); \
120 h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c)); \
121 l = _mm256_xor_si256(u, c); \
124 #define BM_AVX2_BIT_COUNT(ret, v) \
126 __m256i lo = _mm256_and_si256(v, low_mask); \
127 __m256i hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), low_mask); \
128 __m256i cnt1 = _mm256_shuffle_epi8(lookup1, lo); \
129 __m256i cnt2 = _mm256_shuffle_epi8(lookup2, hi); \
130 ret = _mm256_sad_epu8(cnt1, cnt2); \
133 #define BM_AVX2_DECL_LOOKUP1 \
134 __m256i lookup1 = _mm256_setr_epi8(4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, \
135 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8);
136 #define BM_AVX2_DECL_LOOKUP2 \
137 __m256i lookup2 = _mm256_setr_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, \
138 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
140 #define BM_AVX2_POPCNT_PROLOG \
141 BM_AVX2_DECL_LOOKUP1 \
142 BM_AVX2_DECL_LOOKUP2 \
143 __m256i low_mask = _mm256_set1_epi8(0x0f); \
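// Illustrative sketch (assumption: the helper name below is not part of the original header)
// showing how the prolog and BM_AVX2_BIT_COUNT compose for a single 256-bit vector; the
// block-level functions that follow unroll the same idea over whole blocks:
//
//   inline unsigned avx2_popcnt256_sketch(__m256i v)
//   {
//       BM_AVX2_POPCNT_PROLOG
//       __m256i bc;
//       BM_AVX2_BIT_COUNT(bc, v)
//       alignas(32) unsigned long long cnt64[4];
//       _mm256_store_si256((__m256i*)cnt64, bc); // four partial 64-bit lane counts
//       return unsigned(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
//   }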
159 __m256i cnt = _mm256_setzero_si256();
160 __m256i ones = _mm256_setzero_si256();
161 __m256i twos = _mm256_setzero_si256();
162 __m256i fours = _mm256_setzero_si256();
163 __m256i eights = _mm256_setzero_si256();
164 __m256i sixteens = _mm256_setzero_si256();
165 __m256i twosA, twosB, foursA, foursB, eightsA, eightsB;
173 b = _mm256_load_si256(block+0); c = _mm256_load_si256(block+1);
176 b = _mm256_load_si256(block+2); c = _mm256_load_si256(block+3);
178 BM_CSA256(foursA, twos, twos, twosA, twosB);
180 b = _mm256_load_si256(block+4); c = _mm256_load_si256(block+5);
183 b = _mm256_load_si256(block+6); c = _mm256_load_si256(block+7);
185 BM_CSA256(foursB, twos, twos, twosA, twosB);
186 BM_CSA256(eightsA, fours, fours, foursA, foursB);
188 b = _mm256_load_si256(block+8); c = _mm256_load_si256(block+9);
191 b = _mm256_load_si256(block+10); c = _mm256_load_si256(block+11);
193 BM_CSA256(foursA, twos, twos, twosA, twosB);
195 b = _mm256_load_si256(block+12); c = _mm256_load_si256(block+13);
198 b = _mm256_load_si256(block+14); c = _mm256_load_si256(block+15);
200 BM_CSA256(foursB, twos, twos, twosA, twosB);
201 BM_CSA256(eightsB, fours, fours, foursA, foursB);
202 BM_CSA256(sixteens, eights, eights, eightsA, eightsB);
205 cnt = _mm256_add_epi64(cnt, bc);
208 } while (block < block_end);
210 cnt = _mm256_slli_epi64(cnt, 4);
212 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 3));
214 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 2));
216 cnt = _mm256_add_epi64(cnt, _mm256_slli_epi64(bc, 1));
218 cnt = _mm256_add_epi64(cnt, bc);
222 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
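// The fragment below uses the same LUT popcount but walks the block in 4 x 256-bit "waves"
// and skips counting vectors that _mm256_testz_si256 reports as all zero.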
238 __m256i cnt = _mm256_setzero_si256();
246 const __m256i* BMRESTRICT wave_src = (__m256i*)&block[off];
248 __m256i m1A, m1B, m1C, m1D;
249 m1A = _mm256_load_si256(wave_src);
250 m1B = _mm256_load_si256(wave_src+1);
251 if (!_mm256_testz_si256(m1A, m1A))
254 cnt = _mm256_add_epi64(cnt, bc);
256 if (!_mm256_testz_si256(m1B, m1B))
259 cnt = _mm256_add_epi64(cnt, bc);
262 m1C = _mm256_load_si256(wave_src+2);
263 m1D = _mm256_load_si256(wave_src+3);
264 if (!_mm256_testz_si256(m1C, m1C))
267 cnt = _mm256_add_epi64(cnt, bc);
269 if (!_mm256_testz_si256(m1D, m1D))
272 cnt = _mm256_add_epi64(cnt, bc);
278 count = (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
296 __m256i cnt = _mm256_setzero_si256();
302 ymm0 = _mm256_load_si256(block);
303 ymm1 = _mm256_load_si256(mask_block);
304 ymm0 = _mm256_and_si256(ymm0, ymm1);
305 ++block; ++mask_block;
307 cnt = _mm256_add_epi64(cnt, bc);
309 ymm0 = _mm256_load_si256(block);
310 ymm1 = _mm256_load_si256(mask_block);
311 ymm0 = _mm256_and_si256(ymm0, ymm1);
312 ++block; ++mask_block;
314 cnt = _mm256_add_epi64(cnt, bc);
316 ymm0 = _mm256_load_si256(block);
317 ymm1 = _mm256_load_si256(mask_block);
318 ymm0 = _mm256_and_si256(ymm0, ymm1);
319 ++block; ++mask_block;
321 cnt = _mm256_add_epi64(cnt, bc);
323 ymm0 = _mm256_load_si256(block);
324 ymm1 = _mm256_load_si256(mask_block);
325 ymm0 = _mm256_and_si256(ymm0, ymm1);
326 ++block; ++mask_block;
328 cnt = _mm256_add_epi64(cnt, bc);
330 } while (block < block_end);
333 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
343 __m256i cnt = _mm256_setzero_si256();
346 __m256i tmp0 = _mm256_load_si256(block);
347 __m256i tmp1 = _mm256_load_si256(mask_block);
352 cnt = _mm256_add_epi64(cnt, bc);
354 ++block; ++mask_block;
356 } while (block < block_end);
359 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
374 __m256i cnt = _mm256_setzero_si256();
375 __m256i mA, mB, mC, mD;
378 mA = _mm256_xor_si256(_mm256_load_si256(block+0),
379 _mm256_load_si256(mask_block+0));
381 cnt = _mm256_add_epi64(cnt, bc);
383 mB = _mm256_xor_si256(_mm256_load_si256(block+1),
384 _mm256_load_si256(mask_block+1));
386 cnt = _mm256_add_epi64(cnt, bc);
388 mC = _mm256_xor_si256(_mm256_load_si256(block+2),
389 _mm256_load_si256(mask_block+2));
391 cnt = _mm256_add_epi64(cnt, bc);
393 mD = _mm256_xor_si256(_mm256_load_si256(block+3),
394 _mm256_load_si256(mask_block+3));
396 cnt = _mm256_add_epi64(cnt, bc);
398 block += 4; mask_block += 4;
400 } while (block < block_end);
403 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
419 __m256i cnt = _mm256_setzero_si256();
422 __m256i tmp0 = _mm256_load_si256(block);
423 __m256i tmp1 = _mm256_load_si256(mask_block);
428 cnt = _mm256_add_epi64(cnt, bc);
430 ++block; ++mask_block;
432 } while (block < block_end);
435 return (unsigned)(cnt64[0] + cnt64[1] + cnt64[2] + cnt64[3]);
452 __m256i yM = _mm256_set1_epi32(int(mask));
455 _mm256_store_si256(dst+0, _mm256_xor_si256(_mm256_load_si256(src+0), yM));
456 _mm256_store_si256(dst+1, _mm256_xor_si256(_mm256_load_si256(src+1), yM));
457 _mm256_store_si256(dst+2, _mm256_xor_si256(_mm256_load_si256(src+2), yM));
458 _mm256_store_si256(dst+3, _mm256_xor_si256(_mm256_load_si256(src+3), yM));
461 } while (src < src_end);
477 __m256i yM = _mm256_set1_epi32(int(mask));
480 _mm256_store_si256(dst+0, _mm256_andnot_si256(_mm256_load_si256(src+0), yM));
481 _mm256_store_si256(dst+1, _mm256_andnot_si256(_mm256_load_si256(src+1), yM));
482 _mm256_store_si256(dst+2, _mm256_andnot_si256(_mm256_load_si256(src+2), yM));
483 _mm256_store_si256(dst+3, _mm256_andnot_si256(_mm256_load_si256(src+3), yM));
486 } while (src < src_end);
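// The block AND/OR/XOR/SUB routines that follow share one pattern: compute four 256-bit
// results per iteration, store them back to dst, and fold them into OR- (or AND-) accumulators
// so that a single _mm256_testz_si256 or movemask at the end can report "any bits left" or
// "block became all ones" without a second pass over the data.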
499 __m256i m1A, m1B, m1C, m1D;
500 __m256i accA, accB, accC, accD;
505 accA = accB = accC = accD = _mm256_setzero_si256();
509 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
510 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
511 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
512 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
514 _mm256_store_si256(dst+0, m1A);
515 _mm256_store_si256(dst+1, m1B);
516 _mm256_store_si256(dst+2, m1C);
517 _mm256_store_si256(dst+3, m1D);
519 accA = _mm256_or_si256(accA, m1A);
520 accB = _mm256_or_si256(accB, m1B);
521 accC = _mm256_or_si256(accC, m1C);
522 accD = _mm256_or_si256(accD, m1D);
526 } while (src < src_end);
528 accA = _mm256_or_si256(accA, accB);
529 accC = _mm256_or_si256(accC, accD);
530 accA = _mm256_or_si256(accA, accC);
532 return !_mm256_testz_si256(accA, accA);
546 __m256i m1A, m1B, m1C, m1D;
548 m1A = _mm256_and_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
549 m1B = _mm256_and_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
550 m1C = _mm256_and_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
551 m1D = _mm256_and_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
553 _mm256_store_si256(dst+0, m1A);
554 _mm256_store_si256(dst+1, m1B);
555 _mm256_store_si256(dst+2, m1C);
556 _mm256_store_si256(dst+3, m1D);
558 m1A = _mm256_or_si256(m1A, m1B);
559 m1C = _mm256_or_si256(m1C, m1D);
560 m1A = _mm256_or_si256(m1A, m1C);
562 return _mm256_testz_si256(m1A, m1A);
577 __m256i m1A, m1B, m1C, m1D;
579 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
580 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
581 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
582 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
584 _mm256_store_si256(dst+0, m1A);
585 _mm256_store_si256(dst+1, m1B);
586 _mm256_store_si256(dst+2, m1C);
587 _mm256_store_si256(dst+3, m1D);
589 m1A = _mm256_or_si256(m1A, m1B);
590 m1C = _mm256_or_si256(m1C, m1D);
591 m1A = _mm256_or_si256(m1A, m1C);
593 return _mm256_testz_si256(m1A, m1A);
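// AND-OR fragment below: the destination stripe is first tested for being all ones (AND of its
// four vectors, XORed with maskF, checked with testz), presumably to skip work when the target
// is already saturated; otherwise (src1 & src2) is computed and OR-ed into dst.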
608 const __m256i maskF = _mm256_set1_epi32(~0u);
610 __m256i m1A, m1B, m1C, m1D;
612 __m256i mSA, mSB, mSC, mSD;
615 mSA = _mm256_load_si256(dst+0);
616 mSB = _mm256_load_si256(dst+1);
617 mACC1 = _mm256_and_si256(mSA, mSB);
619 mSC = _mm256_load_si256(dst+2);
620 mSD = _mm256_load_si256(dst+3);
622 mACC1 = _mm256_and_si256(mACC1, _mm256_and_si256(mSC, mSD));
624 mACC1 = _mm256_xor_si256(mACC1, maskF);
625 if (_mm256_testz_si256(mACC1, mACC1))
629 m1A = _mm256_and_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
630 m1B = _mm256_and_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
631 m1C = _mm256_and_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
632 m1D = _mm256_and_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
635 _mm256_or_si256(_mm256_or_si256(m1A, m1B), _mm256_or_si256(m1C, m1D));
636 bool all_z = _mm256_testz_si256(mACC1, mACC1);
640 m1A = _mm256_or_si256(mSA, m1A);
641 m1B = _mm256_or_si256(mSB, m1B);
642 m1C = _mm256_or_si256(mSC, m1C);
643 m1D = _mm256_or_si256(mSD, m1D);
645 _mm256_store_si256(dst+0, m1A);
646 _mm256_store_si256(dst+1, m1B);
647 _mm256_store_si256(dst+2, m1C);
648 _mm256_store_si256(dst+3, m1D);
665 __m256i m1A, m1B, m1C, m1D;
666 __m256i m1E, m1F, m1G, m1H;
669 __m256i s1_0, s2_0, s1_1, s2_1;
671 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
672 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
673 m1A = _mm256_and_si256(s1_0, s2_0);
674 m1B = _mm256_and_si256(s1_1, s2_1);
675 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
676 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
677 m1C = _mm256_and_si256(s1_0, s2_0);
678 m1D = _mm256_and_si256(s1_1, s2_1);
681 __m256i s3_0, s4_0, s3_1, s4_1;
683 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
684 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
685 m1E = _mm256_and_si256(s3_0, s4_0);
686 m1F = _mm256_and_si256(s3_1, s4_1);
688 m1A = _mm256_and_si256(m1A, m1E);
689 m1B = _mm256_and_si256(m1B, m1F);
691 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
692 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
693 m1G = _mm256_and_si256(s3_0, s4_0);
694 m1H = _mm256_and_si256(s3_1, s4_1);
698 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
700 m1C = _mm256_and_si256(m1C, m1G);
701 m1D = _mm256_and_si256(m1D, m1H);
702 m1A = _mm256_and_si256(m1A, dst0);
703 m1B = _mm256_and_si256(m1B, dst1);
705 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
707 m1C = _mm256_and_si256(m1C, dst0);
708 m1D = _mm256_and_si256(m1D, dst1);
710 _mm256_store_si256(dst + 0, m1A);
711 _mm256_store_si256(dst + 1, m1B);
712 _mm256_store_si256(dst + 2, m1C);
713 _mm256_store_si256(dst + 3, m1D);
715 m1A = _mm256_or_si256(m1A, m1B);
716 m1C = _mm256_or_si256(m1C, m1D);
717 m1A = _mm256_or_si256(m1A, m1C);
719 return _mm256_testz_si256(m1A, m1A);
731 __m256i m1A, m1B, m1C, m1D;
734 __m256i s1_0, s2_0, s1_1, s2_1;
736 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
737 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
738 m1A = _mm256_and_si256(s1_0, s2_0);
739 m1B = _mm256_and_si256(s1_1, s2_1);
740 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
741 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
742 m1C = _mm256_and_si256(s1_0, s2_0);
743 m1D = _mm256_and_si256(s1_1, s2_1);
747 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
749 m1A = _mm256_and_si256(m1A, dst0);
750 m1B = _mm256_and_si256(m1B, dst1);
752 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
754 m1C = _mm256_and_si256(m1C, dst0);
755 m1D = _mm256_and_si256(m1D, dst1);
757 _mm256_store_si256(dst + 0, m1A);
758 _mm256_store_si256(dst + 1, m1B);
759 _mm256_store_si256(dst + 2, m1C);
760 _mm256_store_si256(dst + 3, m1D);
762 m1A = _mm256_or_si256(m1A, m1B);
763 m1C = _mm256_or_si256(m1C, m1D);
764 m1A = _mm256_or_si256(m1A, m1C);
766 return _mm256_testz_si256(m1A, m1A);
781 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
782 __m256i accA, accB, accC, accD;
784 accA = _mm256_setzero_si256();
785 accB = _mm256_setzero_si256();
786 accC = _mm256_setzero_si256();
787 accD = _mm256_setzero_si256();
791 m1A = _mm256_loadu_si256(src+0);
792 m2A = _mm256_load_si256(dst+0);
793 m1A = _mm256_and_si256(m1A, m2A);
794 _mm256_store_si256(dst+0, m1A);
795 accA = _mm256_or_si256(accA, m1A);
797 m1B = _mm256_loadu_si256(src+1);
798 m2B = _mm256_load_si256(dst+1);
799 m1B = _mm256_and_si256(m1B, m2B);
800 _mm256_store_si256(dst+1, m1B);
801 accB = _mm256_or_si256(accB, m1B);
803 m1C = _mm256_loadu_si256(src+2);
804 m2C = _mm256_load_si256(dst+2);
805 m1C = _mm256_and_si256(m1C, m2C);
806 _mm256_store_si256(dst+2, m1C);
807 accC = _mm256_or_si256(accC, m1C);
809 m1D = _mm256_loadu_si256(src+3);
810 m2D = _mm256_load_si256(dst+3);
811 m1D = _mm256_and_si256(m1D, m2D);
812 _mm256_store_si256(dst+3, m1D);
813 accD = _mm256_or_si256(accD, m1D);
817 } while (src < src_end);
819 accA = _mm256_or_si256(accA, accB);
820 accC = _mm256_or_si256(accC, accD);
821 accA = _mm256_or_si256(accA, accC);
823 return !_mm256_testz_si256(accA, accA);
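// OR with non-temporal stores (_mm256_stream_si256); mAccF0/mAccF1 AND-accumulate the results
// so that the final cmpeq/movemask against an all-ones vector detects that the block is full.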
838 __m256i m1A, m1B, m1C, m1D;
840 __m256i mAccF0 = _mm256_set1_epi32(~0u);
841 __m256i mAccF1 = _mm256_set1_epi32(~0u);
851 m1A = _mm256_or_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
852 m1B = _mm256_or_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
853 mAccF0 = _mm256_and_si256(mAccF0, m1A);
854 mAccF0 = _mm256_and_si256(mAccF0, m1B);
856 _mm256_stream_si256(dst, m1A);
857 _mm256_stream_si256(dst+1, m1B);
861 m1C = _mm256_or_si256(_mm256_load_si256(src2), _mm256_load_si256(dst2));
862 m1D = _mm256_or_si256(_mm256_load_si256(src2+1), _mm256_load_si256(dst2+1));
863 mAccF1 = _mm256_and_si256(mAccF1, m1C);
864 mAccF1 = _mm256_and_si256(mAccF1, m1D);
866 _mm256_stream_si256(dst2, m1C);
867 _mm256_stream_si256(dst2+1, m1D);
869 src2 += 2; dst2 += 2;
870 } while (src2 < src_end);
872 __m256i maskF = _mm256_set1_epi32(~0u);
873 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
874 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
875 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
876 return (maskA == ~0u);
892 __m256i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
893 __m256i mAccF0 = _mm256_set1_epi32(~0u);
894 __m256i mAccF1 = _mm256_set1_epi32(~0u);
897 m1A = _mm256_loadu_si256(src+0);
898 m2A = _mm256_load_si256(dst+0);
899 m1A = _mm256_or_si256(m1A, m2A);
900 _mm256_store_si256(dst+0, m1A);
902 m1B = _mm256_loadu_si256(src+1);
903 m2B = _mm256_load_si256(dst+1);
904 m1B = _mm256_or_si256(m1B, m2B);
905 _mm256_store_si256(dst+1, m1B);
907 m1C = _mm256_loadu_si256(src+2);
908 m2C = _mm256_load_si256(dst+2);
909 m1C = _mm256_or_si256(m1C, m2C);
910 _mm256_store_si256(dst+2, m1C);
912 m1D = _mm256_loadu_si256(src+3);
913 m2D = _mm256_load_si256(dst+3);
914 m1D = _mm256_or_si256(m1D, m2D);
915 _mm256_store_si256(dst+3, m1D);
917 mAccF1 = _mm256_and_si256(mAccF1, m1C);
918 mAccF1 = _mm256_and_si256(mAccF1, m1D);
919 mAccF0 = _mm256_and_si256(mAccF0, m1A);
920 mAccF0 = _mm256_and_si256(mAccF0, m1B);
924 } while (src < src_end);
926 __m256i maskF = _mm256_set1_epi32(~0u);
927 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
928 __m256i wcmpA = _mm256_cmpeq_epi8(mAccF0, maskF);
929 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
930 return (maskA == ~0u);
945 __m256i m1A, m1B, m1C, m1D;
946 __m256i mAccF0 = _mm256_set1_epi32(~0u);
947 __m256i mAccF1 = _mm256_set1_epi32(~0u);
953 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(src2+0));
954 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(src2+1));
955 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(src2+2));
956 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(src2+3));
958 _mm256_store_si256(dst+0, m1A);
959 _mm256_store_si256(dst+1, m1B);
960 _mm256_store_si256(dst+2, m1C);
961 _mm256_store_si256(dst+3, m1D);
963 mAccF1 = _mm256_and_si256(mAccF1, m1C);
964 mAccF1 = _mm256_and_si256(mAccF1, m1D);
965 mAccF0 = _mm256_and_si256(mAccF0, m1A);
966 mAccF0 = _mm256_and_si256(mAccF0, m1B);
968 src1 += 4; src2 += 4; dst += 4;
970 } while (src1 < src_end1);
972 __m256i maskF = _mm256_set1_epi32(~0u);
973 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
974 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
975 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
976 return (maskA == ~0u);
991 __m256i m1A, m1B, m1C, m1D;
992 __m256i mAccF0 = _mm256_set1_epi32(~0u);
993 __m256i mAccF1 = _mm256_set1_epi32(~0u);
999 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1000 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1001 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1002 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1004 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1005 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1006 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1007 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1009 _mm256_store_si256(dst+0, m1A);
1010 _mm256_store_si256(dst+1, m1B);
1011 _mm256_store_si256(dst+2, m1C);
1012 _mm256_store_si256(dst+3, m1D);
1014 mAccF1 = _mm256_and_si256(mAccF1, m1C);
1015 mAccF1 = _mm256_and_si256(mAccF1, m1D);
1016 mAccF0 = _mm256_and_si256(mAccF0, m1A);
1017 mAccF0 = _mm256_and_si256(mAccF0, m1B);
1019 src1 += 4; src2 += 4; dst += 4;
1021 } while (src1 < src_end1);
1023 __m256i maskF = _mm256_set1_epi32(~0u);
1024 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1025 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1026 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1027 return (maskA == ~0u);
1045 __m256i m1A, m1B, m1C, m1D;
1046 __m256i mAccF0 = _mm256_set1_epi32(~0u);
1047 __m256i mAccF1 = _mm256_set1_epi32(~0u);
1054 m1A = _mm256_or_si256(_mm256_load_si256(src1+0), _mm256_load_si256(dst+0));
1055 m1B = _mm256_or_si256(_mm256_load_si256(src1+1), _mm256_load_si256(dst+1));
1056 m1C = _mm256_or_si256(_mm256_load_si256(src1+2), _mm256_load_si256(dst+2));
1057 m1D = _mm256_or_si256(_mm256_load_si256(src1+3), _mm256_load_si256(dst+3));
1059 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src2+0));
1060 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src2+1));
1061 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src2+2));
1062 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src2+3));
1064 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src3+0));
1065 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src3+1));
1066 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src3+2));
1067 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src3+3));
1069 m1A = _mm256_or_si256(m1A, _mm256_load_si256(src4+0));
1070 m1B = _mm256_or_si256(m1B, _mm256_load_si256(src4+1));
1071 m1C = _mm256_or_si256(m1C, _mm256_load_si256(src4+2));
1072 m1D = _mm256_or_si256(m1D, _mm256_load_si256(src4+3));
1074 _mm256_stream_si256(dst+0, m1A);
1075 _mm256_stream_si256(dst+1, m1B);
1076 _mm256_stream_si256(dst+2, m1C);
1077 _mm256_stream_si256(dst+3, m1D);
1079 mAccF1 = _mm256_and_si256(mAccF1, m1C);
1080 mAccF1 = _mm256_and_si256(mAccF1, m1D);
1081 mAccF0 = _mm256_and_si256(mAccF0, m1A);
1082 mAccF0 = _mm256_and_si256(mAccF0, m1B);
1084 src1 += 4; src2 += 4;
1085 src3 += 4; src4 += 4;
1091 } while (src1 < src_end1);
1093 __m256i maskF = _mm256_set1_epi32(~0u);
1094 mAccF0 = _mm256_and_si256(mAccF0, mAccF1);
1095 __m256i wcmpA= _mm256_cmpeq_epi8(mAccF0, maskF);
1096 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1097 return (maskA == ~0u);
1111 __m256i m1A, m1B, m1C, m1D;
1112 __m256i accA, accB, accC, accD;
1117 accA = accB = accC = accD = _mm256_setzero_si256();
1121 m1A = _mm256_xor_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1122 m1B = _mm256_xor_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1123 m1C = _mm256_xor_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1124 m1D = _mm256_xor_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1126 _mm256_store_si256(dst+0, m1A);
1127 _mm256_store_si256(dst+1, m1B);
1128 _mm256_store_si256(dst+2, m1C);
1129 _mm256_store_si256(dst+3, m1D);
1131 accA = _mm256_or_si256(accA, m1A);
1132 accB = _mm256_or_si256(accB, m1B);
1133 accC = _mm256_or_si256(accC, m1C);
1134 accD = _mm256_or_si256(accD, m1D);
1138 } while (src < src_end);
1140 accA = _mm256_or_si256(accA, accB);
1141 accC = _mm256_or_si256(accC, accD);
1142 accA = _mm256_or_si256(accA, accC);
1144 return !_mm256_testz_si256(accA, accA);
1158 __m256i m1A, m1B, m1C, m1D;
1159 __m256i accA, accB, accC, accD;
1164 accA = accB = accC = accD = _mm256_setzero_si256();
1168 m1A = _mm256_xor_si256(_mm256_load_si256(src1 + 0), _mm256_load_si256(src2 + 0));
1169 m1B = _mm256_xor_si256(_mm256_load_si256(src1 + 1), _mm256_load_si256(src2 + 1));
1170 m1C = _mm256_xor_si256(_mm256_load_si256(src1 + 2), _mm256_load_si256(src2 + 2));
1171 m1D = _mm256_xor_si256(_mm256_load_si256(src1 + 3), _mm256_load_si256(src2 + 3));
1173 _mm256_store_si256(dst + 0, m1A);
1174 _mm256_store_si256(dst + 1, m1B);
1175 _mm256_store_si256(dst + 2, m1C);
1176 _mm256_store_si256(dst + 3, m1D);
1178 accA = _mm256_or_si256(accA, m1A);
1179 accB = _mm256_or_si256(accB, m1B);
1180 accC = _mm256_or_si256(accC, m1C);
1181 accD = _mm256_or_si256(accD, m1D);
1183 src1 += 4; src2 += 4; dst += 4;
1185 } while (src1 < src1_end);
1187 accA = _mm256_or_si256(accA, accB);
1188 accC = _mm256_or_si256(accC, accD);
1189 accA = _mm256_or_si256(accA, accC);
1191 return !_mm256_testz_si256(accA, accA);
1207 __m256i m1A, m1B, m1C, m1D;
1208 __m256i accA, accB, accC, accD;
1210 accA = accB = accC = accD = _mm256_setzero_si256();
1217 m1A = _mm256_andnot_si256(_mm256_load_si256(src), _mm256_load_si256(dst));
1218 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1219 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1220 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1222 _mm256_store_si256(dst+2, m1C);
1223 _mm256_store_si256(dst+3, m1D);
1224 _mm256_store_si256(dst+0, m1A);
1225 _mm256_store_si256(dst+1, m1B);
1227 accA = _mm256_or_si256(accA, m1A);
1228 accB = _mm256_or_si256(accB, m1B);
1229 accC = _mm256_or_si256(accC, m1C);
1230 accD = _mm256_or_si256(accD, m1D);
1233 } while (src < src_end);
1235 accA = _mm256_or_si256(accA, accB);
1236 accC = _mm256_or_si256(accC, accD);
1237 accA = _mm256_or_si256(accA, accC);
1239 return !_mm256_testz_si256(accA, accA);
1253 __m256i m1A, m1B, m1C, m1D;
1255 m1A = _mm256_andnot_si256(_mm256_load_si256(src+0), _mm256_load_si256(dst+0));
1256 m1B = _mm256_andnot_si256(_mm256_load_si256(src+1), _mm256_load_si256(dst+1));
1257 m1C = _mm256_andnot_si256(_mm256_load_si256(src+2), _mm256_load_si256(dst+2));
1258 m1D = _mm256_andnot_si256(_mm256_load_si256(src+3), _mm256_load_si256(dst+3));
1260 _mm256_store_si256(dst+0, m1A);
1261 _mm256_store_si256(dst+1, m1B);
1262 _mm256_store_si256(dst+2, m1C);
1263 _mm256_store_si256(dst+3, m1D);
1265 m1A = _mm256_or_si256(m1A, m1B);
1266 m1C = _mm256_or_si256(m1C, m1D);
1267 m1A = _mm256_or_si256(m1A, m1C);
1269 return _mm256_testz_si256(m1A, m1A);
1284 __m256i m1A, m1B, m1C, m1D;
1286 m1A = _mm256_andnot_si256(_mm256_load_si256(src2+0), _mm256_load_si256(src1+0));
1287 m1B = _mm256_andnot_si256(_mm256_load_si256(src2+1), _mm256_load_si256(src1+1));
1288 m1C = _mm256_andnot_si256(_mm256_load_si256(src2+2), _mm256_load_si256(src1+2));
1289 m1D = _mm256_andnot_si256(_mm256_load_si256(src2+3), _mm256_load_si256(src1+3));
1291 _mm256_store_si256(dst+0, m1A);
1292 _mm256_store_si256(dst+1, m1B);
1293 _mm256_store_si256(dst+2, m1C);
1294 _mm256_store_si256(dst+3, m1D);
1296 m1A = _mm256_or_si256(m1A, m1B);
1297 m1C = _mm256_or_si256(m1C, m1D);
1298 m1A = _mm256_or_si256(m1A, m1C);
1300 return _mm256_testz_si256(m1A, m1A);
1316 __m256i m1A, m1B, m1C, m1D;
1317 __m256i m1E, m1F, m1G, m1H;
1318 const __m256i maskF = _mm256_set1_epi32(~0u);
1321 __m256i s1_0, s2_0, s1_1, s2_1;
1323 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1324 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1325 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1326 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1328 m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1330 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1331 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1332 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1333 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1335 m1C = _mm256_and_si256(s1_0, s2_0);
1336 m1D = _mm256_and_si256(s1_1, s2_1);
1339 __m256i s3_0, s4_0, s3_1, s4_1;
1341 s3_0 = _mm256_load_si256(src3 + 0); s4_0 = _mm256_load_si256(src4 + 0);
1342 s3_1 = _mm256_load_si256(src3 + 1); s4_1 = _mm256_load_si256(src4 + 1);
1343 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1344 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1346 m1E = _mm256_and_si256(s3_0, s4_0);
1347 m1F = _mm256_and_si256(s3_1, s4_1);
1349 m1A = _mm256_and_si256(m1A, m1E);
1350 m1B = _mm256_and_si256(m1B, m1F);
1352 s3_0 = _mm256_load_si256(src3 + 2); s4_0 = _mm256_load_si256(src4 + 2);
1353 s3_1 = _mm256_load_si256(src3 + 3); s4_1 = _mm256_load_si256(src4 + 3);
1354 s3_0 = _mm256_xor_si256(s3_0, maskF);s4_0 = _mm256_xor_si256(s4_0, maskF);
1355 s3_1 = _mm256_xor_si256(s3_1, maskF);s4_1 = _mm256_xor_si256(s4_1, maskF);
1357 m1G = _mm256_and_si256(s3_0, s4_0);
1358 m1H = _mm256_and_si256(s3_1, s4_1);
1362 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1364 m1C = _mm256_and_si256(m1C, m1G);
1365 m1D = _mm256_and_si256(m1D, m1H);
1366 m1A = _mm256_and_si256(m1A, dst0);
1367 m1B = _mm256_and_si256(m1B, dst1);
1369 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1371 m1C = _mm256_and_si256(m1C, dst0);
1372 m1D = _mm256_and_si256(m1D, dst1);
1374 _mm256_store_si256(dst + 0, m1A);
1375 _mm256_store_si256(dst + 1, m1B);
1376 _mm256_store_si256(dst + 2, m1C);
1377 _mm256_store_si256(dst + 3, m1D);
1379 m1A = _mm256_or_si256(m1A, m1B);
1380 m1C = _mm256_or_si256(m1C, m1D);
1381 m1A = _mm256_or_si256(m1A, m1C);
1383 return _mm256_testz_si256(m1A, m1A);
1396 __m256i m1A, m1B, m1C, m1D;
1398 const __m256i maskF = _mm256_set1_epi32(~0u);
1401 __m256i s1_0, s2_0, s1_1, s2_1;
1403 s1_0 = _mm256_load_si256(src1 + 0); s2_0 = _mm256_load_si256(src2 + 0);
1404 s1_1 = _mm256_load_si256(src1 + 1); s2_1 = _mm256_load_si256(src2 + 1);
1405 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1406 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1408 m1A = _mm256_and_si256(s1_0, s2_0); m1B = _mm256_and_si256(s1_1, s2_1);
1410 s1_0 = _mm256_load_si256(src1 + 2); s2_0 = _mm256_load_si256(src2 + 2);
1411 s1_1 = _mm256_load_si256(src1 + 3); s2_1 = _mm256_load_si256(src2 + 3);
1412 s1_0 = _mm256_xor_si256(s1_0, maskF);s2_0 = _mm256_xor_si256(s2_0, maskF);
1413 s1_1 = _mm256_xor_si256(s1_1, maskF);s2_1 = _mm256_xor_si256(s2_1, maskF);
1415 m1C = _mm256_and_si256(s1_0, s2_0);
1416 m1D = _mm256_and_si256(s1_1, s2_1);
1444 dst0 = _mm256_load_si256(dst + 0); dst1 = _mm256_load_si256(dst + 1);
1448 m1A = _mm256_and_si256(m1A, dst0);
1449 m1B = _mm256_and_si256(m1B, dst1);
1451 dst0 = _mm256_load_si256(dst + 2); dst1 = _mm256_load_si256(dst + 3);
1453 m1C = _mm256_and_si256(m1C, dst0);
1454 m1D = _mm256_and_si256(m1D, dst1);
1456 _mm256_store_si256(dst + 0, m1A);
1457 _mm256_store_si256(dst + 1, m1B);
1458 _mm256_store_si256(dst + 2, m1C);
1459 _mm256_store_si256(dst + 3, m1D);
1461 m1A = _mm256_or_si256(m1A, m1B);
1462 m1C = _mm256_or_si256(m1C, m1D);
1463 m1A = _mm256_or_si256(m1A, m1C);
1465 return _mm256_testz_si256(m1A, m1A);
1482 __m256i ymm0 = _mm256_set1_epi32(int(value));
1485 _mm256_store_si256(dst, ymm0);
1486 _mm256_store_si256(dst+1, ymm0);
1487 _mm256_store_si256(dst+2, ymm0);
1488 _mm256_store_si256(dst+3, ymm0);
1491 } while (dst < dst_end);
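// The block copy variants below differ only in the load flavor (aligned _mm256_load_si256 vs
// unaligned _mm256_loadu_si256) and the store flavor (cached _mm256_store_si256 vs
// non-temporal _mm256_stream_si256).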
1506 __m256i ymm0, ymm1, ymm2, ymm3;
1513 ymm0 = _mm256_load_si256(src+0);
1514 ymm1 = _mm256_load_si256(src+1);
1515 ymm2 = _mm256_load_si256(src+2);
1516 ymm3 = _mm256_load_si256(src+3);
1518 _mm256_store_si256(dst+0, ymm0);
1519 _mm256_store_si256(dst+1, ymm1);
1520 _mm256_store_si256(dst+2, ymm2);
1521 _mm256_store_si256(dst+3, ymm3);
1523 ymm0 = _mm256_load_si256(src+4);
1524 ymm1 = _mm256_load_si256(src+5);
1525 ymm2 = _mm256_load_si256(src+6);
1526 ymm3 = _mm256_load_si256(src+7);
1528 _mm256_store_si256(dst+4, ymm0);
1529 _mm256_store_si256(dst+5, ymm1);
1530 _mm256_store_si256(dst+6, ymm2);
1531 _mm256_store_si256(dst+7, ymm3);
1535 } while (src < src_end);
1548 __m256i ymm0, ymm1, ymm2, ymm3;
1555 ymm0 = _mm256_loadu_si256(src+0);
1556 ymm1 = _mm256_loadu_si256(src+1);
1557 ymm2 = _mm256_loadu_si256(src+2);
1558 ymm3 = _mm256_loadu_si256(src+3);
1560 _mm256_store_si256(dst+0, ymm0);
1561 _mm256_store_si256(dst+1, ymm1);
1562 _mm256_store_si256(dst+2, ymm2);
1563 _mm256_store_si256(dst+3, ymm3);
1565 ymm0 = _mm256_loadu_si256(src+4);
1566 ymm1 = _mm256_loadu_si256(src+5);
1567 ymm2 = _mm256_loadu_si256(src+6);
1568 ymm3 = _mm256_loadu_si256(src+7);
1570 _mm256_store_si256(dst+4, ymm0);
1571 _mm256_store_si256(dst+5, ymm1);
1572 _mm256_store_si256(dst+6, ymm2);
1573 _mm256_store_si256(dst+7, ymm3);
1577 } while (src < src_end);
1592 __m256i ymm0, ymm1, ymm2, ymm3;
1599 ymm0 = _mm256_load_si256(src+0);
1600 ymm1 = _mm256_load_si256(src+1);
1601 ymm2 = _mm256_load_si256(src+2);
1602 ymm3 = _mm256_load_si256(src+3);
1604 _mm256_stream_si256(dst+0, ymm0);
1605 _mm256_stream_si256(dst+1, ymm1);
1606 _mm256_stream_si256(dst+2, ymm2);
1607 _mm256_stream_si256(dst+3, ymm3);
1609 ymm0 = _mm256_load_si256(src+4);
1610 ymm1 = _mm256_load_si256(src+5);
1611 ymm2 = _mm256_load_si256(src+6);
1612 ymm3 = _mm256_load_si256(src+7);
1614 _mm256_stream_si256(dst+4, ymm0);
1615 _mm256_stream_si256(dst+5, ymm1);
1616 _mm256_stream_si256(dst+6, ymm2);
1617 _mm256_stream_si256(dst+7, ymm3);
1621 } while (src < src_end);
1634 __m256i ymm0, ymm1, ymm2, ymm3;
1641 ymm0 = _mm256_loadu_si256(src+0);
1642 ymm1 = _mm256_loadu_si256(src+1);
1643 ymm2 = _mm256_loadu_si256(src+2);
1644 ymm3 = _mm256_loadu_si256(src+3);
1646 _mm256_stream_si256(dst+0, ymm0);
1647 _mm256_stream_si256(dst+1, ymm1);
1648 _mm256_stream_si256(dst+2, ymm2);
1649 _mm256_stream_si256(dst+3, ymm3);
1651 ymm0 = _mm256_loadu_si256(src+4);
1652 ymm1 = _mm256_loadu_si256(src+5);
1653 ymm2 = _mm256_loadu_si256(src+6);
1654 ymm3 = _mm256_loadu_si256(src+7);
1656 _mm256_stream_si256(dst+4, ymm0);
1657 _mm256_stream_si256(dst+5, ymm1);
1658 _mm256_stream_si256(dst+6, ymm2);
1659 _mm256_stream_si256(dst+7, ymm3);
1663 } while (src < src_end);
1679 __m256i maskFF = _mm256_set1_epi32(-1);
1686 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+0), maskFF);
1687 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+1), maskFF);
1689 _mm256_store_si256(dst+0, ymm0);
1690 _mm256_store_si256(dst+1, ymm1);
1692 ymm0 = _mm256_xor_si256(_mm256_load_si256(dst+2), maskFF);
1693 ymm1 = _mm256_xor_si256(_mm256_load_si256(dst+3), maskFF);
1695 _mm256_store_si256(dst+2, ymm0);
1696 _mm256_store_si256(dst+3, ymm1);
1700 } while (dst < dst_end);
1715 __m256i w0 = _mm256_load_si256(block+0);
1716 __m256i w1 = _mm256_load_si256(block+1);
1718 __m256i wA = _mm256_or_si256(w0, w1);
1720 __m256i w2 = _mm256_load_si256(block+2);
1721 __m256i w3 = _mm256_load_si256(block+3);
1723 __m256i wB = _mm256_or_si256(w2, w3);
1724 wA = _mm256_or_si256(wA, wB);
1726 if (!_mm256_testz_si256(wA, wA))
1729 } while (block < block_end);
1740 __m256i wA = _mm256_or_si256(_mm256_load_si256(block+0), _mm256_load_si256(block+1));
1741 __m256i wB = _mm256_or_si256(_mm256_load_si256(block+2), _mm256_load_si256(block+3));
1742 wA = _mm256_or_si256(wA, wB);
1744 return _mm256_testz_si256(wA, wA);
1754 __m256i mV = _mm256_set1_epi32(int(value));
1755 _mm256_store_si256(dst, mV);
1756 _mm256_store_si256(dst + 1, mV);
1757 _mm256_store_si256(dst + 2, mV);
1758 _mm256_store_si256(dst + 3, mV);
1769 const __m256i maskF = _mm256_set1_epi32(~0u);
1774 __m256i m1A = _mm256_load_si256(block+0);
1775 __m256i m1B = _mm256_load_si256(block+1);
1776 m1A = _mm256_xor_si256(m1A, maskF);
1777 m1B = _mm256_xor_si256(m1B, maskF);
1778 m1A = _mm256_or_si256(m1A, m1B);
1779 if (!_mm256_testz_si256(m1A, m1A))
1782 } while (block < block_end);
1793 __m256i maskF = _mm256_set1_epi32(~0u);
1794 __m256i wcmpA = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)ptr), maskF);
1795 unsigned maskA = unsigned(_mm256_movemask_epi8(wcmpA));
1796 return (maskA == ~0u);
1807 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr);
1808 return _mm256_testz_si256(w0, w0);
1818 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1819 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1820 w0 = _mm256_or_si256(w0, w1);
1821 return _mm256_testz_si256(w0, w0);
1831 __m256i w0 = _mm256_loadu_si256((__m256i*)ptr0);
1832 __m256i w1 = _mm256_loadu_si256((__m256i*)ptr1);
1833 w0 = _mm256_xor_si256(w0, w1);
1834 return _mm256_testz_si256(w0, w0);
1844 __m256i* block_end =
1847 __m256i m1COshft, m2COshft;
1848 __m256i mAcc = _mm256_set1_epi32(0);
1849 __m256i mMask1 = _mm256_set1_epi32(1);
1850 __m256i mCOidx = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
1853 for (--block_end; block_end >= block; block_end -= 2)
1855 __m256i m1A = _mm256_load_si256(block_end);
1856 __m256i m2A = _mm256_load_si256(block_end-1);
1858 __m256i m1CO = _mm256_and_si256(m1A, mMask1);
1859 __m256i m2CO = _mm256_and_si256(m2A, mMask1);
1861 co2 = _mm256_extract_epi32(m1CO, 0);
1863 m1A = _mm256_srli_epi32(m1A, 1);
1864 m2A = _mm256_srli_epi32(m2A, 1);
1867 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1868 m1COshft = _mm256_insert_epi32(m1COshft, co1, 7);
1872 co2 = _mm256_extract_epi32(m2CO, 0);
1874 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1875 m2COshft = _mm256_insert_epi32(m2COshft, co1, 7);
1877 m1COshft = _mm256_slli_epi32(m1COshft, 31);
1878 m2COshft = _mm256_slli_epi32(m2COshft, 31);
1880 m1A = _mm256_or_si256(m1A, m1COshft);
1881 m2A = _mm256_or_si256(m2A, m2COshft);
1883 _mm256_store_si256(block_end, m1A);
1884 _mm256_store_si256(block_end-1, m2A);
1886 mAcc = _mm256_or_si256(mAcc, m1A);
1887 mAcc = _mm256_or_si256(mAcc, m2A);
1893 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1905 const __m256i* block_end =
1908 __m256i m1COshft, m2COshft;
1909 __m256i mAcc = _mm256_set1_epi32(0);
1910 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1913 for (;block < block_end; block+=2)
1915 __m256i m1A = _mm256_load_si256(block);
1916 __m256i m2A = _mm256_load_si256(block+1);
1918 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1919 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1921 co2 = _mm256_extract_epi32(m1CO, 7);
1923 m1A = _mm256_slli_epi32(m1A, 1);
1924 m2A = _mm256_slli_epi32(m2A, 1);
1927 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
1928 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
1932 co2 = _mm256_extract_epi32(m2CO, 7);
1933 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
1934 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
1936 m1A = _mm256_or_si256(m1A, m1COshft);
1937 m2A = _mm256_or_si256(m2A, m2COshft);
1939 _mm256_store_si256(block, m1A);
1940 _mm256_store_si256(block+1, m2A);
1942 mAcc = _mm256_or_si256(mAcc, m1A);
1943 mAcc = _mm256_or_si256(mAcc, m2A);
1948 *empty_acc = !_mm256_testz_si256(mAcc, mAcc);
1969 __m256i m1COshft, m2COshft;
1970 __m256i mAcc = _mm256_set1_epi32(0);
1971 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
1976 unsigned di = co1 ? 0 : unsigned(_tzcnt_u64(d));
1977 for (; di < 64 ; ++di)
1983 mAcc = _mm256_xor_si256(mAcc, mAcc);
1985 mask_block = (__m256i*) &mblock[d_base];
1988 block = (__m256i*) &wblock[d_base];
1990 for (unsigned i = 0; i < 2; ++i, block += 2, mask_block += 2)
1992 __m256i m1A = _mm256_load_si256(block);
1993 __m256i m2A = _mm256_load_si256(block+1);
1995 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
1996 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
1998 co2 = _mm256_extract_epi32(m1CO, 7);
2000 m1A = _mm256_slli_epi32(m1A, 1);
2001 m2A = _mm256_slli_epi32(m2A, 1);
2003 __m256i m1M = _mm256_load_si256(mask_block);
2004 __m256i m2M = _mm256_load_si256(mask_block+1);
2007 m1COshft = _mm256_insert_epi32(
2008 _mm256_permutevar8x32_epi32(m1CO, mCOidx),
2012 co2 = _mm256_extract_epi32(m2CO, 7);
2013 m2COshft = _mm256_insert_epi32(
2014 _mm256_permutevar8x32_epi32(m2CO, mCOidx),
2017 m1A = _mm256_or_si256(m1A, m1COshft);
2018 m2A = _mm256_or_si256(m2A, m2COshft);
2020 m1A = _mm256_and_si256(m1A, m1M);
2021 m2A = _mm256_and_si256(m2A, m2M);
2023 _mm256_store_si256(block, m1A);
2024 _mm256_store_si256(block+1, m2A);
2026 mAcc = _mm256_or_si256(mAcc, m1A);
2027 mAcc = _mm256_or_si256(mAcc, m2A);
2033 if (_mm256_testz_si256(mAcc, mAcc))
2045 bm::id64_t w0 = wblock[d_base] = (co1 & mblock[d_base]);
2046 d |= (dmask & (w0 << di));
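// Run (GAP) counting below: each word is XORed with its own 1-bit-shifted image, with the
// carry chained across lanes as above, so the set bits mark 0/1 transitions; popcounting the
// result gives the number of runs in the block.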
2088 const __m256i* block_end =
2091 __m256i m1COshft, m2COshft;
2092 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2093 __m256i cntAcc = _mm256_setzero_si256();
2100 unsigned co2, co1 = 0;
2101 for (;block < block_end; block+=2)
2103 __m256i m1A = _mm256_load_si256(block);
2104 __m256i m2A = _mm256_load_si256(block+1);
2106 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2107 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2109 co2 = _mm256_extract_epi32(m1CO, 7);
2111 __m256i m1As = _mm256_slli_epi32(m1A, 1);
2112 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2115 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2116 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2120 co2 = _mm256_extract_epi32(m2CO, 7);
2121 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2122 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2124 m1As = _mm256_or_si256(m1As, m1COshft);
2125 m2As = _mm256_or_si256(m2As, m2COshft);
2130 m1A = _mm256_xor_si256(m1A, m1As);
2131 m2A = _mm256_xor_si256(m2A, m2As);
2135 cntAcc = _mm256_add_epi64(cntAcc, bc);
2137 cntAcc = _mm256_add_epi64(cntAcc, bc);
2142 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2143 count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2165 __m256i m1COshft, m2COshft;
2166 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2168 __m256i cntAcc = _mm256_setzero_si256();
2169 __m256i cntAcc2 = _mm256_setzero_si256();
2172 unsigned bit_count = 0;
2173 unsigned gap_count = 1;
2177 unsigned co2, co1 = 0;
2178 for (;block < block_end; block+=2, xor_block+=2)
2180 __m256i m1A = _mm256_load_si256(block);
2181 __m256i m2A = _mm256_load_si256(block+1);
2182 __m256i m1B = _mm256_load_si256(xor_block);
2183 __m256i m2B = _mm256_load_si256(xor_block+1);
2185 m1A = _mm256_xor_si256 (m1A, m1B);
2186 m2A = _mm256_xor_si256 (m2A, m2B);
2190 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2192 cntAcc2 = _mm256_add_epi64(cntAcc2, bc);
2195 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2196 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2198 co2 = _mm256_extract_epi32(m1CO, 7);
2200 __m256i m1As = _mm256_slli_epi32(m1A, 1);
2201 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2204 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2205 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2209 co2 = _mm256_extract_epi32(m2CO, 7);
2210 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2211 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2213 m1As = _mm256_or_si256(m1As, m1COshft);
2214 m2As = _mm256_or_si256(m2As, m2COshft);
2219 m1A = _mm256_xor_si256(m1A, m1As);
2220 m2A = _mm256_xor_si256(m2A, m2As);
2224 cntAcc = _mm256_add_epi64(cntAcc, bc);
2226 cntAcc = _mm256_add_epi64(cntAcc, bc);
2231 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2232 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2233 gap_count -= (w0 & 1u);
2237 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc2);
2238 bit_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2240 *gcount = gap_count;
2241 *bcount = bit_count;
2252 unsigned* gcount, unsigned* bcount)
2256 const __m256i* block_end =
2259 __m256i m1COshft, m2COshft;
2260 __m256i mCOidx = _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 0);
2261 __m256i cntAcc = _mm256_setzero_si256();
2264 unsigned bit_count = 0;
2265 unsigned gap_count = 1;
2269 unsigned co2, co1 = 0;
2270 for (;block < block_end; block+=2)
2272 __m256i m1A = _mm256_load_si256(block);
2273 __m256i m2A = _mm256_load_si256(block+1);
2286 __m256i m1CO = _mm256_srli_epi32(m1A, 31);
2287 __m256i m2CO = _mm256_srli_epi32(m2A, 31);
2289 co2 = _mm256_extract_epi32(m1CO, 7);
2291 __m256i m1As = _mm256_slli_epi32(m1A, 1);
2292 __m256i m2As = _mm256_slli_epi32(m2A, 1);
2295 m1COshft = _mm256_permutevar8x32_epi32(m1CO, mCOidx);
2296 m1COshft = _mm256_insert_epi32(m1COshft, co1, 0);
2300 co2 = _mm256_extract_epi32(m2CO, 7);
2301 m2COshft = _mm256_permutevar8x32_epi32(m2CO, mCOidx);
2302 m2COshft = _mm256_insert_epi32(m2COshft, co1, 0);
2304 m1As = _mm256_or_si256(m1As, m1COshft);
2305 m2As = _mm256_or_si256(m2As, m2COshft);
2310 m1A = _mm256_xor_si256(m1A, m1As);
2311 m2A = _mm256_xor_si256(m2A, m2As);
2315 cntAcc = _mm256_add_epi64(cntAcc, bc);
2317 cntAcc = _mm256_add_epi64(cntAcc, bc);
2322 _mm256_store_si256 ((__m256i*)cnt_v, cntAcc);
2323 gap_count += (unsigned)(cnt_v[0] + cnt_v[1] + cnt_v[2] + cnt_v[3]);
2324 gap_count -= (w0 & 1u);
2326 *gcount = gap_count;
2327 *bcount = bit_count;
2342 const __m256i* block1_end =
2344 __m256i maskZ = _mm256_setzero_si256();
2346 unsigned simd_lane = 0;
2349 mA = _mm256_xor_si256(_mm256_load_si256(block1),
2350 _mm256_load_si256(block2));
2351 mB = _mm256_xor_si256(_mm256_load_si256(block1+1),
2352 _mm256_load_si256(block2+1));
2353 __m256i mOR = _mm256_or_si256(mA, mB);
2354 if (!_mm256_testz_si256(mOR, mOR))
2356 if (!_mm256_testz_si256(mA, mA))
2359 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2361 int bsf = bm::bsf_asm32(mask);
2362 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2363 unsigned widx = bsf >> 2;
2364 unsigned w = simd_buf[widx];
2365 bsf = bm::bsf_asm32(w);
2366 *pos = (simd_lane * 256) + (widx * 32) + bsf;
2370 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2372 int bsf = bm::bsf_asm32(mask);
2373 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2374 unsigned widx = bsf >> 2;
2375 unsigned w = simd_buf[widx];
2376 bsf = bm::bsf_asm32(w);
2377 *pos = ((++simd_lane) * 256) + (widx * 32) + bsf;
2382 block1+=2; block2+=2;
2384 } while (block1 < block1_end);
2398 block = (const __m256i*)((bm::word_t*)(block) + off);
2399 const __m256i* block_end =
2401 __m256i maskZ = _mm256_setzero_si256();
2403 unsigned simd_lane = 0;
2406 mA = _mm256_load_si256(block); mB = _mm256_load_si256(block+1);
2407 __m256i mOR = _mm256_or_si256(mA, mB);
2408 if (!_mm256_testz_si256(mOR, mOR))
2410 if (!_mm256_testz_si256(mA, mA))
2413 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mA, maskZ));
2415 int bsf = bm::bsf_asm32(mask);
2416 _mm256_store_si256 ((__m256i*)simd_buf, mA);
2417 unsigned widx = bsf >> 2;
2418 unsigned w = simd_buf[widx];
2419 bsf = bm::bsf_asm32(w);
2420 *pos = (off * 32) + (simd_lane * 256) + (widx * 32) + bsf;
2424 unsigned mask = ~_mm256_movemask_epi8(_mm256_cmpeq_epi32(mB, maskZ));
2426 int bsf = bm::bsf_asm32(mask);
2427 _mm256_store_si256 ((__m256i*)simd_buf, mB);
2428 unsigned widx = bsf >> 2;
2429 unsigned w = simd_buf[widx];
2430 bsf = bm::bsf_asm32(w);
2431 *pos = (off * 32) + ((++simd_lane) * 256) + (widx * 32) + bsf;
2438 } while (block < block_end);
2454 unsigned avx_vect_waves,
2457 __m256i xcnt = _mm256_setzero_si256();
2462 for (unsigned i = 0; i < avx_vect_waves; ++i)
2464 __m256i ymm0 = _mm256_loadu_si256((__m256i*)(pbuf - 1));
2465 __m256i ymm1 = _mm256_loadu_si256((__m256i*)(pbuf + 16 - 1));
2466 __m256i ymm_s2 = _mm256_add_epi16(ymm1, ymm0);
2467 xcnt = _mm256_add_epi16(xcnt, ymm_s2);
2472 xcnt = _mm256_sub_epi16(_mm256_bsrli_epi128(xcnt, 2), xcnt);
2477 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 4), xcnt);
2478 xcnt = _mm256_add_epi16(_mm256_bsrli_epi128(xcnt, 8), xcnt);
2479 __m128i xcnt2 =
_mm_add_epi16(_mm256_extracti128_si256(xcnt, 1), _mm256_extracti128_si256(xcnt, 0));
2493 unsigned nb, unsigned start)
2495 const unsigned unroll_factor = 16;
2496 const unsigned len = (size - start);
2497 const unsigned len_unr = len - (len % unroll_factor);
2502 __m256i nbM = _mm256_set1_epi32(int(nb));
2504 for (k = 0; k < len_unr; k+=unroll_factor)
2506 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2509 __m256i wcmpA= _mm256_cmpeq_epi8(nbM, nbA);
2510 if (~0u != unsigned(_mm256_movemask_epi8(wcmpA)))
2512 __m256i idxB = _mm256_loadu_si256((__m256i*)(idx+k+8));
2515 __m256i wcmpB = _mm256_cmpeq_epi8(nbM, nbB);
2516 if (~0u != unsigned(_mm256_movemask_epi8(wcmpB)))
2519 for (; k < len; ++k)
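// The scatter below sets bits taken from an index array: _mm256_sllv_epi32 builds per-word
// one-bit masks, and a movemask over _mm256_cmpeq_epi32 detects the case where all eight
// indices land in the same 32-bit word so their masks can be OR-folded in registers.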
2535 unsigned start, unsigned stop )
2537 const unsigned unroll_factor = 8;
2538 const unsigned len = (stop - start);
2539 const unsigned len_unr = len - (len % unroll_factor);
2545 __m256i mask1 = _mm256_set1_epi32(1);
2551 unsigned k = 0, mask, w_idx;
2552 for (; k < len_unr; k+=unroll_factor)
2554 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2555 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask);
2558 nbitA = _mm256_and_si256 (nbitA, sw_mask);
2560 __m256i maskA = _mm256_sllv_epi32(mask1, nbitA);
2562 _mm256_store_si256 ((__m256i*)mword_v, nwordA);
2565 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
2566 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2567 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
2571 mask_tmp = _mm256_xor_si256 (mask_tmp, mask_tmp);
2572 mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2576 __m256i mtmp0 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 0);
2577 __m256i mtmp1 = _mm256_permute2x128_si256(mask_tmp, mask_tmp, 1);
2578 mask_tmp = _mm256_or_si256 (mtmp0, mtmp1);
2579 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 4);
2580 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2581 mtmp0 = _mm256_bsrli_epi128(mask_tmp, 8);
2582 mask_tmp = _mm256_or_si256 (mtmp0, mask_tmp);
2584 int u0 = _mm256_extract_epi32(mask_tmp, 0);
2589 _mm256_store_si256 ((__m256i*)mask_v, maskA);
2597 mask_tmp = _mm256_bsrli_epi128(maskA, 4);
2598 mask_tmp = _mm256_or_si256 (mask_tmp, maskA);
2599 __m256i m0 = _mm256_bsrli_epi128(mask_tmp, 8);
2600 mask_tmp = _mm256_or_si256 (m0, mask_tmp);
2602 u0 = _mm256_extract_epi32(mask_tmp, 0);
2603 u4 = _mm256_extract_epi32(mask_tmp, 4);
2608 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 0);
2609 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2610 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2618 block[mword_v[0]] |= mask_v[0];
2619 block[mword_v[1]] |= mask_v[1];
2620 block[mword_v[2]] |= mask_v[2];
2621 block[mword_v[3]] |= mask_v[3];
2628 mask_tmp = _mm256_permute2x128_si256 (nwordA, nwordA, 1);
2629 __m256i m0 = _mm256_shuffle_epi32(mask_tmp, 0x0);
2630 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, m0));
2638 block[mword_v[4]] |= mask_v[4];
2639 block[mword_v[5]] |= mask_v[5];
2640 block[mword_v[6]] |= mask_v[6];
2641 block[mword_v[7]] |= mask_v[7];
2647 for (; k < len; ++k)
2649 unsigned n = idx[k];
2653 block[nword] |= (1u << nbit);
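// Below: eight bit indices (relative to one 256-bit stride) are turned into a single 256-bit
// mask by broadcasting each index, subtracting the per-lane base offsets (0, 32, ..., 224) and
// shifting 1 left by the result with _mm256_sllv_epi32; out-of-lane shift counts yield zero.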
2664 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2665 __m256i mask1 = _mm256_set1_epi32(1);
2667 __m256i v0, v1, acc1, acc2;
2668 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(0));
2669 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(1));
2670 v0 = _mm256_sub_epi32(v0, stride_idx);
2671 v1 = _mm256_sub_epi32(v1, stride_idx);
2672 v0 = _mm256_sllv_epi32(mask1, v0);
2673 v1 = _mm256_sllv_epi32(mask1, v1);
2674 acc1 = _mm256_or_si256(v1, v0);
2675 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(2));
2676 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(3));
2677 v0 = _mm256_sub_epi32(v0, stride_idx);
2678 v1 = _mm256_sub_epi32(v1, stride_idx);
2679 v0 = _mm256_sllv_epi32(mask1, v0);
2680 v1 = _mm256_sllv_epi32(mask1, v1);
2681 acc2 = _mm256_or_si256(v1, v0);
2682 target = _mm256_or_si256(target, acc1);
2683 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(4));
2684 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(5));
2685 v0 = _mm256_sub_epi32(v0, stride_idx);
2686 v1 = _mm256_sub_epi32(v1, stride_idx);
2687 v0 = _mm256_sllv_epi32(mask1, v0);
2688 v1 = _mm256_sllv_epi32(mask1, v1);
2689 acc1 = _mm256_or_si256(v1, v0);
2690 target = _mm256_or_si256(target, acc2);
2691 v0 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(6));
2692 v1 = _mm256_permutevar8x32_epi32(source, _mm256_set1_epi32(7));
2693 v0 = _mm256_sub_epi32(v0, stride_idx);
2694 v1 = _mm256_sub_epi32(v1, stride_idx);
2695 v0 = _mm256_sllv_epi32(mask1, v0);
2696 v1 = _mm256_sllv_epi32(mask1, v1);
2697 acc2 = _mm256_or_si256(v1, v0);
2699 target = _mm256_or_si256(target, acc1);
2700 target = _mm256_or_si256(target, acc2);
2711 unsigned start, unsigned stop )
2713 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2714 __m256i mask1 = _mm256_set1_epi32(1);
2715 __m256i* block_avx = (__m256i*)block;
2717 unsigned stride = 0;
2718 __m256i* avx_stride_p = block_avx + stride;
2719 __m256i blkA = _mm256_load_si256(avx_stride_p);
2721 for (unsigned i = start; i < stop; ++i)
2723 unsigned n = idx[i];
2725 unsigned new_stride = nbit >> 8;
2726 unsigned stride_bit = nbit & 0xFF;
2727 if (new_stride != stride)
2729 _mm256_store_si256(avx_stride_p, blkA);
2730 stride = new_stride;
2731 avx_stride_p = block_avx + stride;
2732 blkA = _mm256_load_si256(avx_stride_p);
2735 __m256i v0 = _mm256_set1_epi32(stride_bit);
2736 __m256i s0 = _mm256_sub_epi32(v0, stride_idx);
2737 __m256i k0 = _mm256_sllv_epi32(mask1, s0);
2738 blkA = _mm256_or_si256(blkA, k0);
2741 _mm256_store_si256(avx_stride_p, blkA);
2750 unsigned start, unsigned stop )
2752 const unsigned unroll_factor = 8;
2753 const unsigned len = (stop - start);
2754 const unsigned len_unr = len - (len % unroll_factor);
2758 __m256i stride_idx = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2759 __m256i mask1 = _mm256_set1_epi32(1);
2762 __m256i stride_bit_mask = _mm256_set1_epi32(0xFF);
2770 __m256i* block_avx = (__m256i*)block;
2771 __m256i* avx_stride_p = block_avx + stride;
2773 __m256i blkA = _mm256_load_si256(avx_stride_p);
2775 unsigned k = 0, mask;
2776 for (; k < len_unr; k+=unroll_factor)
2778 __m256i idxA = _mm256_loadu_si256((__m256i*)(idx+k));
2779 __m256i nbitA = _mm256_and_si256 (idxA, sb_mask);
2780 __m256i strideA = _mm256_srli_epi32 (nbitA, 8);
2781 __m256i strideBitA = _mm256_and_si256 (nbitA, stride_bit_mask);
2784 __m256i mask_tmp = _mm256_shuffle_epi32 (strideA, 0x0);
2785 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
2786 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, strideA));
2789 unsigned new_stride = (unsigned)_mm256_extract_epi32(strideA, 0);
2790 if (new_stride != stride)
2792 _mm256_store_si256(avx_stride_p, blkA);
2793 stride = new_stride;
2794 avx_stride_p = block_avx + stride;
2795 blkA = _mm256_load_si256(avx_stride_p);
2802 _mm256_store_si256 ((__m256i*)mstride_bit_v, strideBitA);
2803 _mm256_store_si256 ((__m256i*)mstride_v, strideA);
2804 for (unsigned j = 0; j < 8; ++j)
2806 unsigned new_stride = mstride_v[j];
2807 if (new_stride != stride)
2809 _mm256_store_si256(avx_stride_p, blkA);
2810 stride = new_stride;
2811 avx_stride_p = block_avx + stride;
2812 blkA = _mm256_load_si256(avx_stride_p);
2815 mask_tmp = _mm256_set1_epi32(mstride_bit_v[j]);
2816 mask_tmp = _mm256_sub_epi32(mask_tmp, stride_idx);
2817 mask_tmp = _mm256_sllv_epi32(mask1, mask_tmp);
2818 blkA = _mm256_or_si256(blkA, mask_tmp);
2822 _mm256_store_si256(avx_stride_p, blkA);
2825 for (; k < len; ++k)
2827 unsigned n = idx[k];
2831 block[nword] |= (1u << nbit);
2843 __m256i stride_idx1 = _mm256_set_epi32(224, 192, 160, 128, 96, 64, 32, 0);
2844 __m256i stride_idx2 = _mm256_add_epi32(stride_idx1, _mm256_set1_epi32(32));
2845 __m256i maskFF = _mm256_set1_epi32(-1);
2846 __m256i maskZ = _mm256_setzero_si256();
2848 __m256i v0 = _mm256_set1_epi32(i);
2849 __m256i s0 = _mm256_sub_epi32(v0, stride_idx1);
2850 __m256i k1 = _mm256_sllv_epi32(maskFF, s0);
2853 __m256i cmp_eq = _mm256_cmpeq_epi32(k1, maskZ);
2854 cmp_eq = _mm256_xor_si256(maskFF, cmp_eq);
2855 k1 = _mm256_xor_si256(k1, cmp_eq);
2858 __m256i cmp_gt = _mm256_cmpgt_epi32 (stride_idx2, v0);
2859 cmp_gt = _mm256_xor_si256(maskFF, cmp_gt);
2860 __m256i r = _mm256_xor_si256(k1, cmp_gt);
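// Unsigned >= search below uses the usual sign-flip trick: bias both sides by 0x80000000 so
// signed _mm256_cmpgt_epi32 orders them as unsigned, OR in the equality mask, then
// movemask + BSF/4 gives the index of the first element >= value.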
2880 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
2881 __m256i mm_val = _mm256_set1_epi32(value);
2883 __m256i norm_vect8 = _mm256_sub_epi32(vect8, mask0x8);
2884 __m256i norm_val = _mm256_sub_epi32(mm_val, mask0x8);
2886 __m256i cmp_mask_gt = _mm256_cmpgt_epi32(norm_vect8, norm_val);
2887 __m256i cmp_mask_eq = _mm256_cmpeq_epi32(mm_val, vect8);
2889 __m256i cmp_mask_ge = _mm256_or_si256(cmp_mask_gt, cmp_mask_eq);
2890 int mask = _mm256_movemask_epi8(cmp_mask_ge);
2893 int bsf = bm::bsf_asm32(mask);
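// 16-bit variant: _mm256_subs_epu16(value, v) saturates to zero exactly when v >= value, so
// comparing the difference against zero and taking TZCNT of the movemask locates the first
// element >= value.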
2909 __m256i mZ = _mm256_setzero_si256();
2910 __m256i mVal = _mm256_set1_epi16(value);
2913 __m256i mSub = _mm256_subs_epu16(mVal, vect16);
2914 __m256i mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
2915 unsigned mask = _mm256_movemask_epi8(mge_mask);
2918 int lz = _tzcnt_u32(mask);
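// avx2_gap_bfind below mixes a few unrolled binary-search probes with a SIMD linear scan in
// 16-element chunks (same subs_epu16 trick) once the window is small; the RET_TEST template
// flag selects between returning the found index and the bit-test result.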
2938 template<bool RET_TEST=false>
2944 const unsigned linear_cutoff = 64;
2945 const unsigned unroll_factor = 16;
2951 unsigned end = ((*buf) >> 3);
2953 const unsigned arr_end = end + 1;
2954 if (end <= unroll_factor)
2956 for (; true; ++start)
2957 if (buf[start] >= pos)
2964 unsigned dsize = end - start;
2965 for (; dsize >= 64; dsize = end - start)
2967 unsigned mid = (start + end) >> 1;
2972 if (buf[mid = (start + end) >> 1] < pos)
2976 if (buf[mid = (start + end) >> 1] < pos)
2980 if (buf[mid = (start + end) >> 1] < pos)
2987 dsize = end - start + 1;
2988 if (dsize < linear_cutoff)
2993 dsize = arr_end - start;
2995 __m256i mZ = _mm256_setzero_si256();
2996 __m256i mPos = _mm256_set1_epi16((unsigned short)pos);
2997 __m256i vect16, mSub, mge_mask;
2999 for (unsigned len_unr = start + (dsize - (dsize % unroll_factor));
3000 start < len_unr; start += unroll_factor)
3002 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
3003 mSub = _mm256_subs_epu16(mPos, vect16);
3004 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3005 if (int mask = _mm256_movemask_epi8(mge_mask); mask)
3007 int lz = _tzcnt_u32(mask);
3016 vect16 = _mm256_loadu_si256((__m256i*)(&buf[start]));
3017 mSub = _mm256_subs_epu16(mPos, vect16);
3018 mge_mask = _mm256_cmpeq_epi16(mSub, mZ);
3019 int mask = _mm256_movemask_epi8(mge_mask);
3021 int lz = _tzcnt_u32(mask);
3025 for (; true; ++start)
3026 if (buf[start] >= pos)
3031 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3035 if (unsigned mid = (start + end) >> 1; buf[mid] < pos)
3041 res = ((*buf) & 1) ^ ((start-1) & 1);
3042 if constexpr(RET_TEST)
3059 return bm::avx2_gap_bfind<true>(buf, pos, 0);
3079 unsigned unroll_factor = 8;
3080 unsigned len = to - from + 1;
3081 unsigned len_unr = len - (len % unroll_factor);
3083 __m256i mask0x8 = _mm256_set1_epi32(0x80000000);
3084 __m256i vect_target = _mm256_set1_epi32(target);
3085 __m256i norm_target = _mm256_sub_epi32(vect_target, mask0x8);
3088 __m256i vect80, norm_vect80, cmp_mask_ge;
3091 for (; k < len_unr; k += unroll_factor)
3093 vect80 = _mm256_loadu_si256((__m256i*)(&arr_base[k]));
3094 norm_vect80 = _mm256_sub_epi32(vect80, mask0x8);
3096 cmp_mask_ge = _mm256_or_si256(
3097                   _mm256_cmpgt_epi32(norm_vect80, norm_target),
3098                   _mm256_cmpeq_epi32(vect80, vect_target));
3100 mask = _mm256_movemask_epi8(cmp_mask_ge);
3103 int bsf = bm::bsf_asm32(mask);
3104 return from + k + (bsf / 4);
3108 for (; k < len; ++k)
3110 if (arr_base[k] >= target)
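A hedged usage sketch of the lower-bound scan; the array content is invented, and the expected result follows from the greater-or-equal contract stated in the reference section below ('to' is inclusive, matching len = to - from + 1):

    // Illustrative only.
    inline unsigned lower_bound_demo()
    {
        static const unsigned arr[16] = {1, 5, 9, 12, 40, 41, 42, 100,
                                         200, 300, 400, 500, 600, 700, 800, 900};
        return bm::avx2_lower_bound_scan_u32(arr, 41u, 0u, 15u); // expected: 5
    }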
3148 const unsigned unroll_factor = 8;
3149 const unsigned len = (size - start);
3150 const unsigned len_unr = len - (len % unroll_factor);
3154 __m256i maskFF = _mm256_set1_epi32(~0u);
3156 __m256i mask_tmp, mask_0;
3160 unsigned k = 0, mask, w_idx;
3161 for (; k < len_unr; k+=unroll_factor)
3163 __m256i nbitA, nwordA;
3164 const unsigned base = start + k;
3165 __m256i* idx_ptr = (__m256i*)(idx+base);
3167 nbitA = _mm256_and_si256 (_mm256_loadu_si256(idx_ptr), sb_mask);
3171 mask_tmp = _mm256_shuffle_epi32 (nwordA, _MM_SHUFFLE(1,1,1,1));
3172 mask_tmp = _mm256_permute2x128_si256 (mask_tmp, mask_tmp, 0);
3173 mask = _mm256_movemask_epi8(_mm256_cmpeq_epi32(mask_tmp, nwordA));
3174 _mm256_store_si256((__m256i*)mword_v, nwordA);
3179 mask_tmp = _mm256_set1_epi32(blk[w_idx]);
3183 mask_tmp = _mm256_set_epi32(blk[mword_v[7]], blk[mword_v[6]],
3184 blk[mword_v[5]], blk[mword_v[4]],
3185 blk[mword_v[3]], blk[mword_v[2]],
3186 blk[mword_v[1]], blk[mword_v[0]]);
3191 __m256i shiftA = _mm256_and_si256 (nbitA, sw_mask);
3192 __m256i mask1 = _mm256_srli_epi32 (maskFF, 31);
3193 mask_0 = _mm256_sllv_epi32(mask1, shiftA);
3195 mask_tmp = _mm256_and_si256(mask_tmp, mask_0);
3196 if (!_mm256_testz_si256(mask_tmp, mask_tmp))
3198 __m256i* target_ptr = (__m256i*)(arr+base);
3200 __m256i maskZ = _mm256_xor_si256(maskFF, maskFF);
3201 mask1 = _mm256_slli_epi32(mask1, bit_idx);
3202 mask_tmp = _mm256_cmpeq_epi32 (mask_tmp, maskZ);
3203 mask_tmp = _mm256_xor_si256 (mask_tmp, maskFF);
3204 mask_tmp = _mm256_and_si256 (mask_tmp, mask1);
3205 _mm256_storeu_si256 (target_ptr,
3206 _mm256_or_si256 (mask_tmp,
3207 _mm256_loadu_si256(target_ptr)));
3212 for (; k < len; ++k)
3214 const unsigned base = start + k;
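The serial tail finishes the same work one index at a time. For reference, one gather-scatter wave amounts to the following scalar steps (a sketch; the masks and shifts are the bm:: block/word constants listed in the reference section, and the variable names mirror the vector code above):

    // Scalar sketch of one wave of 8 elements: locate the bit addressed by
    // idx[base + j] inside the source bit-block 'blk' and, if it is set,
    // raise bit 'bit_idx' in the corresponding target word arr[base + j].
    for (unsigned j = 0; j < 8; ++j)
    {
        unsigned i     = idx[base + j];
        unsigned nbit  = i & bm::set_block_mask;      // bit offset inside the block
        unsigned nword = nbit >> bm::set_word_shift;  // 32-bit word holding that bit
        if (blk[nword] & (1u << (nbit & bm::set_word_mask)))
            arr[base + j] |= (1u << bit_idx);
    }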
3235 unsigned bitval = (*block) & 1u;
3238 unsigned bit_idx = 0;
3240 const unsigned vCAP = 64;
3241 __m256i maskZ = _mm256_set1_epi32(0);
3243 for (; block < block_end; block += 8)
3249 __m256i accA = _mm256_load_si256((__m256i*)block);
3250 __m256i cmpA = _mm256_cmpeq_epi8(accA, maskZ);
3251 unsigned mask = ~_mm256_movemask_epi8(cmpA);
3257 unsigned w64_idx = _tzcnt_u32(mask);
3259 bit_idx += k * vCAP;
3266 if (!val || val == ~0ull)
3272 bitval ^= unsigned(cmp);
3273 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3283 unsigned bits_consumed = 0;
3287 if (bitval != (val & tz))
3292 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3297 tz = (unsigned)_tzcnt_u64(bitval ? ~val : val);
3300 bool cmp = ((bits_consumed+=tz) < vCAP);
3308 bitval ^= unsigned(cmp);
3309 bit_idx += tz & (vCAP - bits_consumed);
3310 unsigned long long pcu = reinterpret_cast<unsigned long long>(pcurr);
3314 BM_ASSERT((pcurr-1) == (dest+1) || *(pcurr-1) > *(pcurr-2));
3324 unsigned len = (unsigned)(pcurr - dest);
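The scalar part of the conversion measures run lengths with a trailing-zero count: _tzcnt_u64(bitval ? ~val : val) tells how many of the remaining low-order bits still equal the current run value. A self-contained sketch of that run-splitting idea (illustrative; the real routine also pre-skips all-zero/all-one words with the vectorized scan above and writes GAP run-end positions rather than counting runs):

    #include <cstdint>
    #include <immintrin.h>   // _tzcnt_u64 (BMI)

    // Count 0/1 runs inside one 64-bit word with the same tzcnt trick.
    inline unsigned count_runs_u64(uint64_t val)
    {
        unsigned bitval = (unsigned)(val & 1u);   // value of the current run
        unsigned consumed = 0, runs = 0;
        while (consumed < 64)
        {
            uint64_t w = (bitval ? ~val : val) >> consumed; // zeros = current value
            unsigned tz = (unsigned)_tzcnt_u64(w);          // how far the run continues
            if (tz > 64 - consumed)
                tz = 64 - consumed;                         // run reaches the end of the word
            consumed += tz;
            ++runs;
            bitval ^= 1u;                                   // the next run flips the value
        }
        return runs;
    }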
3349 const __m256i* sub_block = (__m256i*) (block + off);
3350 __m256i* t_sub_block = (__m256i*)(target_block + off);
3354 const __m256i* xor_sub_block = (__m256i*) (xor_block + off);
3355 __m256i mA, mB, mC, mD;
3356 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3357 _mm256_load_si256(xor_sub_block));
3358 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3359 _mm256_load_si256(xor_sub_block+1));
3360 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3361 _mm256_load_si256(xor_sub_block+2));
3362 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3363 _mm256_load_si256(xor_sub_block+3));
3365 _mm256_store_si256(t_sub_block, mA);
3366 _mm256_store_si256(t_sub_block+1, mB);
3367 _mm256_store_si256(t_sub_block+2, mC);
3368 _mm256_store_si256(t_sub_block+3, mD);
3372 _mm256_store_si256(t_sub_block , _mm256_load_si256(sub_block));
3373 _mm256_store_si256(t_sub_block+1, _mm256_load_si256(sub_block+1));
3374 _mm256_store_si256(t_sub_block+2, _mm256_load_si256(sub_block+2));
3375 _mm256_store_si256(t_sub_block+3, _mm256_load_si256(sub_block+3));
3402 const __m256i* sub_block = (const __m256i*) (xor_block + off);
3403 __m256i* t_sub_block = (__m256i*)(target_block + off);
3405 __m256i mA, mB, mC, mD;
3406 mA = _mm256_xor_si256(_mm256_load_si256(sub_block),
3407 _mm256_load_si256(t_sub_block));
3408 mB = _mm256_xor_si256(_mm256_load_si256(sub_block+1),
3409 _mm256_load_si256(t_sub_block+1));
3410 mC = _mm256_xor_si256(_mm256_load_si256(sub_block+2),
3411 _mm256_load_si256(t_sub_block+2));
3412 mD = _mm256_xor_si256(_mm256_load_si256(sub_block+3),
3413 _mm256_load_si256(t_sub_block+3));
3415 _mm256_store_si256(t_sub_block, mA);
3416 _mm256_store_si256(t_sub_block+1, mB);
3417 _mm256_store_si256(t_sub_block+2, mC);
3418 _mm256_store_si256(t_sub_block+3, mD);
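Both XOR kernels are driven by a 64-bit digest in which every set bit selects one stride of the block that actually needs work; the off offset used above is derived from the position of that bit. A sketch of the iteration pattern (assumes off = wave * bm::set_block_digest_wave_size in words, consistent with the word-offset arithmetic in this file; the stride processing itself is elided):

    #include <cstdint>
    #include <immintrin.h>   // _tzcnt_u64

    // Visit every stride selected by the digest mask, lowest bit first.
    inline void for_each_digest_wave(uint64_t digest)
    {
        while (digest)
        {
            unsigned wave = (unsigned)_tzcnt_u64(digest);           // lowest set bit index
            unsigned off  = wave * bm::set_block_digest_wave_size;  // word offset of the stride
            (void)off;                    // ... process the four __m256i at block + off ...
            digest &= digest - 1;         // clear the lowest set bit (BLSR)
        }
    }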
3428 #pragma GCC diagnostic pop
3432 #define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
3433 avx2_xor_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3435 #define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
3436 avx2_andnot_arr_2_mask((__m256i*)(dst), (__m256i*)(src), (__m256i*)(src_end), (bm::word_t)mask)
3438 #define VECT_BITCOUNT(first, last) \
3439 avx2_bit_count((__m256i*) (first), (__m256i*) (last))
3441 #define VECT_BITCOUNT_AND(first, last, mask) \
3442 avx2_bit_count_and((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3444 #define VECT_BITCOUNT_OR(first, last, mask) \
3445 avx2_bit_count_or((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3447 #define VECT_BITCOUNT_XOR(first, last, mask) \
3448 avx2_bit_count_xor((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3450 #define VECT_BITCOUNT_SUB(first, last, mask) \
3451 avx2_bit_count_sub((__m256i*) (first), (__m256i*) (last), (__m256i*) (mask))
3453 #define VECT_INVERT_BLOCK(first) \
3454 avx2_invert_block((__m256i*)first);
3456 #define VECT_AND_BLOCK(dst, src) \
3457 avx2_and_block((__m256i*) dst, (const __m256i*) (src))
3459 #define VECT_AND_DIGEST(dst, src) \
3460 avx2_and_digest((__m256i*) dst, (const __m256i*) (src))
3462 #define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
3463 avx2_and_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3465 #define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
3466 avx2_and_or_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3468 #define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3469 avx2_and_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3471 #define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
3472 avx2_and_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3474 #define VECT_OR_BLOCK(dst, src) \
3475 avx2_or_block((__m256i*) dst, (__m256i*) (src))
3477 #define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
3478 avx2_or_block_3way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3480 #define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
3481 avx2_or_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3486 #define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
3487 avx2_or_block_5way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2), (__m256i*) (src3), (__m256i*) (src4))
3489 #define VECT_SUB_BLOCK(dst, src) \
3490 avx2_sub_block((__m256i*) dst, (__m256i*) (src))
3492 #define VECT_SUB_DIGEST(dst, src) \
3493 avx2_sub_digest((__m256i*) dst, (const __m256i*) (src))
3495 #define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
3496 avx2_sub_digest_2way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3498 #define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
3499 avx2_sub_digest_5way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2), (const __m256i*) (src3), (const __m256i*) (src4))
3501 #define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
3502 avx2_sub_digest_3way((__m256i*) dst, (const __m256i*) (src1), (const __m256i*) (src2))
3504 #define VECT_XOR_BLOCK(dst, src) \
3505 avx2_xor_block((__m256i*) dst, (__m256i*) (src))
3507 #define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
3508 avx2_xor_block_2way((__m256i*) dst, (__m256i*) (src1), (__m256i*) (src2))
3510 #define VECT_COPY_BLOCK(dst, src) \
3511 avx2_copy_block((__m256i*) dst, (__m256i*) (src))
3513 #define VECT_COPY_BLOCK_UNALIGN(dst, src) \
3514 avx2_copy_block_unalign((__m256i*) dst, (__m256i*) (src))
3516 #define VECT_STREAM_BLOCK(dst, src) \
3517 avx2_stream_block((__m256i*) dst, (__m256i*) (src))
3519 #define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
3520 avx2_stream_block_unalign((__m256i*) dst, (__m256i*) (src))
3522 #define VECT_SET_BLOCK(dst, value) \
3523 avx2_set_block((__m256i*) dst, (value))
3525 #define VECT_IS_ZERO_BLOCK(dst) \
3526 avx2_is_all_zero((__m256i*) dst)
3528 #define VECT_IS_ONE_BLOCK(dst) \
3529 avx2_is_all_one((__m256i*) dst)
3531 #define VECT_IS_DIGEST_ZERO(start) \
3532 avx2_is_digest_zero((__m256i*)start)
3534 #define VECT_BLOCK_SET_DIGEST(dst, val) \
3535 avx2_block_set_digest((__m256i*)dst, val)
3537 #define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
3538 avx2_lower_bound_scan_u32(arr, target, from, to)
3540 #define VECT_SHIFT_L1(b, acc, co) \
3541 avx2_shift_l1((__m256i*)b, acc, co)
3543 #define VECT_SHIFT_R1(b, acc, co) \
3544 avx2_shift_r1((__m256i*)b, acc, co)
3546 #define VECT_SHIFT_R1_AND(b, co, m, digest) \
3547 avx2_shift_r1_and((__m256i*)b, co, (__m256i*)m, digest)
3549 #define VECT_ARR_BLOCK_LOOKUP(idx, size, nb, start) \
3550 avx2_idx_arr_block_lookup(idx, size, nb, start)
3552 #define VECT_SET_BLOCK_BITS(block, idx, start, stop) \
3553 avx2_set_block_bits3(block, idx, start, stop)
3555 #define VECT_BLOCK_CHANGE(block, size) \
3556 avx2_bit_block_calc_change((__m256i*)block, size)
3558 #define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc) \
3559 avx2_bit_block_calc_xor_change((__m256i*)block, (__m256i*)xor_block, size, gc, bc)
3561 #define VECT_BLOCK_CHANGE_BC(block, gc, bc) \
3562 avx2_bit_block_calc_change_bc((__m256i*)block, gc, bc)
3564 #define VECT_BIT_TO_GAP(dest, src, dest_len) \
3565 avx2_bit_to_gap(dest, src, dest_len)
3567 #define VECT_BIT_FIND_FIRST(src1, off, pos) \
3568 avx2_bit_find_first((__m256i*) src1, off, pos)
3570 #define VECT_BIT_FIND_DIFF(src1, src2, pos) \
3571 avx2_bit_find_first_diff((__m256i*) src1, (__m256i*) (src2), pos)
3573 #define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
3574 avx2_bit_block_xor(t, src, src_xor, d)
3576 #define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
3577 avx2_bit_block_xor_2way(t, src_xor, d)
3579 #define VECT_GAP_BFIND(buf, pos, is_set) \
3580 avx2_gap_bfind(buf, pos, is_set)
3582 #define VECT_GAP_TEST(buf, pos) \
3583 avx2_gap_test(buf, pos)
3586 #define VECT_BIT_COUNT_DIGEST(blk, d) \
3587 avx2_bit_block_count(blk, d)
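These VECT_* names are what the portable layers of the library compile against; each definition simply forwards to the AVX2 kernel with the required pointer casts. A hedged usage sketch (assumes the header is pulled in through the library's normal AVX2 build configuration; the meaning of the return value is not documented here, so it is ignored):

    // Illustrative only: call one of the dispatch macros directly on two
    // 32-byte aligned bit-blocks of bm::set_block_size words.
    alignas(32) static bm::word_t dst_blk[bm::set_block_size];
    alignas(32) static bm::word_t src_blk[bm::set_block_size];

    inline void and_blocks_demo()
    {
        // expands to avx2_and_block((__m256i*) dst_blk, (const __m256i*) (src_blk))
        (void) VECT_AND_BLOCK(dst_blk, src_blk);
    }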
#define BM_AVX2_POPCNT_PROLOG
#define BM_CSA256(h, l, a, b, c)
#define BM_AVX2_BIT_COUNT(ret, v)
Bit manipulation primitives (internal)
void avx2_copy_block(__m256i *dst, const __m256i *src)
AVX2 block copy dst = *src.
unsigned avx2_and_block(__m256i *dst, const __m256i *src)
AND array elements against another array dst &= *src.
void avx2_xor_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
XOR array elements to specified mask dst = *src ^ mask.
bool avx2_test_all_zero_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all NULL
void avx2_bit_block_calc_change_bc(const __m256i *block, unsigned *gcount, unsigned *bcount)
unsigned avx2_gap_test(const unsigned short *buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to scan.
bool avx2_is_all_one(const __m256i *block)
check if block is all one bits
bool avx2_and_or_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND-OR block digest stride 2 way dst |= *src1 & *src2.
bool avx2_or_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
OR array elements against another unaligned array dst |= *src.
bool avx2_or_block_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR array elements against 2 other arrays dst |= *src1 | *src2.
bool avx2_test_all_eq_wave2(const void *ptr0, const void *ptr1)
check if 2 waves of pointers are all the same (NULL or FULL)
unsigned avx2_and_arr_unal(__m256i *dst, const __m256i *src, const __m256i *src_end)
AND array elements against another array (unaligned) dst &= *src.
bool avx2_and_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride 2 way dst = *src1 & *src2.
bool avx2_or_block(__m256i *dst, const __m256i *src)
OR array elements against another array dst |= *src.
unsigned avx2_xor_block(__m256i *dst, const __m256i *src)
XOR block against another dst ^= *src.
bool avx2_sub_digest(__m256i *dst, const __m256i *src)
SUB (AND NOT) block digest stride dst &= ~*src.
bm::id_t avx2_bit_count_sub(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND NOT bit count for two aligned bit-blocks.
unsigned avx2_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set)
Hybrid binary search, starts as binary, then switches to scan.
unsigned avx2_bit_to_gap(gap_word_t *dest, const unsigned *block, unsigned dest_len)
Convert bit block to GAP block.
bool avx2_sub_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
SUB block digest stride.
bool avx2_shift_r1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift right by 1
bool avx2_test_all_zero_wave(const void *ptr)
check if wave of pointers is all NULL
unsigned avx2_bit_block_calc_change(const __m256i *block, unsigned size)
bool avx2_sub_digest_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
2-operand SUB (AND NOT) block digest stride dst = *src1 & ~*src2
bool avx2_sub_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
SUB block digest stride.
void avx2_andnot_arr_2_mask(__m256i *dst, const __m256i *src, const __m256i *src_end, bm::word_t mask)
Inverts array elements and ANDs them with the specified mask dst = ~*src & mask.
void avx2_block_set_digest(__m256i *dst, unsigned value)
set digest stride to 0xFF.. or 0x0 value
bm::id_t avx2_bit_count_xor(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
XOR bit count for two aligned bit-blocks.
bm::id_t avx2_bit_count(const __m256i *block, const __m256i *block_end)
AVX2 Harley-Seal popcount. The algorithm is based on the paper "Faster Population Counts using AVX2 In...
bm::id_t avx2_bit_count_and(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
AND bit count for two aligned bit-blocks.
void avx2_stream_block(__m256i *dst, const __m256i *src)
AVX2 block copy dst = *src.
void avx2_copy_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) dst = *src.
void avx2_invert_block(__m256i *dst)
Invert bit-block dst = ~*dst or dst ^= *dst.
void avx2_bit_block_calc_xor_change(const __m256i *block, const __m256i *xor_block, unsigned size, unsigned *gcount, unsigned *bcount)
bool avx2_shift_r1_and(__m256i *block, bm::word_t co1, const __m256i *mask_block, bm::id64_t *digest)
fused block shift right by 1 plus AND
bool avx2_bit_find_first_diff(const __m256i *block1, const __m256i *block2, unsigned *pos)
Find first bit which is different between two bit-blocks.
bm::id_t avx2_bit_block_count(const bm::word_t *const block, bm::id64_t digest)
Calculate population count based on digest.
bool avx2_and_digest_3way(__m256i *dst, const __m256i *src1, const __m256i *src2)
AND block digest stride.
bool avx2_and_digest(__m256i *dst, const __m256i *src)
AND block digest stride dst &= *src.
unsigned avx2_sub_block(__m256i *dst, const __m256i *src)
AND-NOT (SUB) array elements against another array dst &= ~*src.
void avx2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest)
Build partial XOR product of 2 bit-blocks using digest mask.
void avx2_set_block(__m256i *dst, bm::word_t value)
AVX2 block memset dst = value.
bool avx2_test_all_one_wave(const void *ptr)
check if wave of pointers is all 0xFFF
void avx2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
bool avx2_or_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
OR 2 arrays and copy to the destination dst = *src1 | *src2.
unsigned avx2_xor_block_2way(__m256i *dst, const __m256i *src1, const __m256i *src2)
3-operand XOR dst = *src1 ^ *src2.
bool avx2_is_digest_zero(const __m256i *block)
check if digest stride is all zero bits
bool avx2_bit_find_first(const __m256i *block, unsigned off, unsigned *pos)
Find first bit set.
void avx2_stream_block_unalign(__m256i *dst, const __m256i *src)
AVX2 block copy (unaligned SRC) dst = *src.
bool avx2_shift_l1(__m256i *block, bm::word_t *empty_acc, unsigned co1)
block shift left by 1
int avx2_cmpge_u16(__m256i vect16, unsigned short value)
Experimental (test) function to do SIMD vector search in a sorted, growing array.
int avx2_cmpge_u32(__m256i vect8, unsigned value)
Experimental (test) function to do SIMD vector search (lower bound) in a sorted, growing array.
bool avx2_or_block_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
OR array elements against 4 other arrays dst |= *src1 | *src2 | *src3 | *src4.
bool avx2_is_all_zero(const __m256i *block)
check if block is all zero bits
bool avx2_and_digest_5way(__m256i *dst, const __m256i *src1, const __m256i *src2, const __m256i *src3, const __m256i *src4)
AND block digest stride.
unsigned avx2_lower_bound_scan_u32(const unsigned *arr, unsigned target, unsigned from, unsigned to)
lower bound (greater or equal) linear scan in an ascending-order sorted array
const unsigned set_block_digest_wave_size
unsigned avx2_idx_arr_block_lookup(const unsigned *idx, unsigned size, unsigned nb, unsigned start)
__m256i avx2_setbit_to256(unsigned i)
Experimental.
const unsigned set_block_mask
const bm::gap_word_t * avx2_gap_sum_arr(const bm::gap_word_t *pbuf, unsigned avx_vect_waves, unsigned *sum)
bm::id_t avx2_bit_count_or(const __m256i *block, const __m256i *block_end, const __m256i *mask_block)
void avx2_set_block_bits2(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
void avx2_set_block_bits(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
void avx2_set_block_bits3(bm::word_t *block, const unsigned *idx, unsigned start, unsigned stop)
Experimental code to set bits via AVX strides.
const unsigned set_word_shift
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
__m256i avx2_setbit_256(__m256i target, __m256i source)
Set bits in an AVX target by indexes (int4) from the source.
unsigned short gap_word_t
void avx2_bit_block_gather_scatter(unsigned *arr, const unsigned *blk, const unsigned *idx, unsigned size, unsigned start, unsigned bit_idx)
const unsigned gap_max_bits
const unsigned set_block_shift
const unsigned set_word_mask
unsigned long long bmi_blsi_u64(unsigned long long w)
static int _mm_cvtsi128_si32(__m128i a)
static __m128i _mm_add_epi16(__m128i a, __m128i b)
static void _mm_prefetch(const void *p, int i)
static int64_t _mm_popcnt_u64(uint64_t a)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
MACRO for shuffle parameter for _mm_shuffle_ps().