1 #ifndef BMSSE2__H__INCLUDED__
2 #define BMSSE2__H__INCLUDED__
25 #if !defined(__arm64__) && !defined(__arm__)
38 #pragma GCC diagnostic push
39 #pragma GCC diagnostic ignored "-Wconversion"
68 const unsigned mu1 = 0x55555555;
69 const unsigned mu2 = 0x33333333;
70 const unsigned mu3 = 0x0F0F0F0F;
71 const unsigned mu4 = 0x0000003F;
115 }
while (block < block_end);
121 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
132 const unsigned mu1 = 0x55555555;
133 const unsigned mu2 = 0x33333333;
134 const unsigned mu3 = 0x0F0F0F0F;
135 const unsigned mu4 = 0x0000003F;
151 b = sse2_func(
b, tmp1);
181 }
while (block < block_end);
186 return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
207 if (m1 != 0xFFFF || m2 != 0xFFFF)
210 }
while (block < block_end);
232 if (m1 != 0xFFFF || m2 != 0xFFFF)
235 }
while (block < block_end);
258 if (m1 != 0xFFFF || m2 != 0xFFFF)
1002 unsigned simd_lane = 0;
1019 unsigned widx = bsf >> 2;
1020 unsigned w = simd_buf[widx];
1022 *pos = (off * 32) +(simd_lane * 128) + (widx * 32) + bsf;
1030 unsigned widx = bsf >> 2;
1031 unsigned w = simd_buf[widx];
1033 *pos = (off * 32) + ((++simd_lane) * 128) + (widx * 32) + bsf;
1038 }
while (block < block_end);
1058 unsigned simd_lane = 0;
1075 unsigned widx = bsf >> 2;
1076 unsigned w = simd_buf[widx];
1078 *pos = (simd_lane * 128) + (widx * 32) + bsf;
1086 unsigned widx = bsf >> 2;
1087 unsigned w = simd_buf[widx];
1089 *pos = ((++simd_lane) * 128) + (widx * 32) + bsf;
1093 block1+=2; block2+=2;
1094 }
while (block1 < block1_end);
1122 for (;block < block_end; block += 2)
1174 for (--block_end; block_end >= block; block_end -= 2)
1225 const unsigned mu1 = 0x55555555;
1226 const unsigned mu2 = 0x33333333;
1227 const unsigned mu3 = 0x0F0F0F0F;
1228 const unsigned mu4 = 0x0000003F;
1241 int count = (
int)(block_end - block)*4;
1244 const int w_shift =
sizeof(w) * 8 - 1;
1245 bool first_word =
true;
1253 count -= (w_prev = (w0 >> w_shift));
1321 count -= !(w_prev ^ (w0 & 1));
1322 count -= w_prev = (w0 >> w_shift);
1326 count -= !w_prev; w_prev ^= w_prev;
1332 count -= !(w_prev ^ (w0 & 1));
1333 count -= w_prev = (w0 >> w_shift);
1337 count -= !w_prev; w_prev ^= w_prev;
1342 count -= !(w_prev ^ (w0 & 1));
1343 count -= w_prev = (w0 >> w_shift);
1347 count -= !w_prev; w_prev ^= w_prev;
1352 count -= !(w_prev ^ (w0 & 1));
1353 count -= w_prev = (w0 >> w_shift);
1357 count -= !w_prev; w_prev ^= w_prev;
1360 }
while (++block < block_end);
1363 *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
1365 return unsigned(count);
1370 #pragma GCC diagnostic push
1371 #pragma GCC diagnostic ignored "-Warray-bounds"
1385 const unsigned unroll_factor = 8;
1388 if (pbuf[0] >= pos) {
size = 0; }
1389 else if (pbuf[1] >= pos) {
size = 1; }
1394 __m128i m1, mz, maskF, maskFL;
1401 int shiftL = (64 - (unroll_factor -
size) * 16);
1429 return size - (unroll_factor - bsr_i);
1449 unsigned end = 1 + ((*buf) >> 3);
1451 const unsigned arr_end = end;
1453 unsigned size = end - start;
1455 for (;
size >= 64;
size = end - start)
1457 unsigned mid = (start + end) >> 1;
1462 if (
buf[mid = (start + end) >> 1] < pos)
1466 if (
buf[mid = (start + end) >> 1] < pos)
1470 if (
buf[mid = (start + end) >> 1] < pos)
1476 for (;
size >= 16;
size = end - start)
1478 if (
unsigned mid = (start + end) >> 1;
buf[mid] < pos)
1482 if (
unsigned mid = (start + end) >> 1;
buf[mid] < pos)
1488 size += (end != arr_end);
1493 *is_set = ((*buf) & 1) ^ ((start-1) & 1);
1513 #pragma GCC diagnostic pop
// ---------------------------------------------------------------------------
// Dispatch macros: bind the portable VECT_* names used by the core library
// to the SSE2 kernels in this header.  The casts reinterpret word-block
// pointers as __m128i* for the 128-bit SIMD implementations.
// ---------------------------------------------------------------------------

// XOR array [src, src_end) with a scalar mask into dst
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

// AND-NOT array [src, src_end) with a scalar mask into dst
#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (bm::word_t)mask)

// population count over [first, last)
#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

// fused popcount of a logical op: these all route through sse2_bit_count_op
// with the operation passed as a functor (sse2_and / sse2_or / ...)
#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)
// Invert (bitwise NOT) one bit-block in place via sse2_invert_block().
// FIX: dropped the trailing ';' from the expansion -- a function-like macro
// must not supply its own statement terminator (every sibling VECT_* macro
// here omits it).  The old form left a stray empty statement at call sites
// and made `if (cond) VECT_INVERT_BLOCK(p); else ...` fail to compile.
// The argument is also parenthesized for macro hygiene.
#define VECT_INVERT_BLOCK(first) \
    sse2_invert_block((__m128i*)(first))
// ---- AND family -----------------------------------------------------------

// full-block AND: dst &= src
#define VECT_AND_BLOCK(dst, src) \
    sse2_and_block((__m128i*) dst, (__m128i*) (src))

// digest-stride AND: dst &= *src (bool result -- see sse2_and_digest)
#define VECT_AND_DIGEST(dst, src) \
    sse2_and_digest((__m128i*) dst, (const __m128i*) (src))

// digest-stride AND-OR: dst |= *src1 & *src2
#define VECT_AND_OR_DIGEST_2WAY(dst, src1, src2) \
    sse2_and_or_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

// digest-stride AND of four source strides
#define VECT_AND_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse2_and_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

// digest-stride AND of two source strides (3-operand form)
#define VECT_AND_DIGEST_3WAY(dst, src1, src2) \
    sse2_and_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

// digest-stride AND: dst = *src1 & *src2
#define VECT_AND_DIGEST_2WAY(dst, src1, src2) \
    sse2_and_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

// ---- OR family ------------------------------------------------------------

#define VECT_OR_BLOCK(dst, src) \
    sse2_or_block((__m128i*) dst, (__m128i*) (src))

#define VECT_OR_BLOCK_2WAY(dst, src1, src2) \
    sse2_or_block_2way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_3WAY(dst, src1, src2) \
    sse2_or_block_3way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2))

#define VECT_OR_BLOCK_5WAY(dst, src1, src2, src3, src4) \
    sse2_or_block_5way((__m128i*) (dst), (__m128i*) (src1), (__m128i*) (src2), (__m128i*) (src3), (__m128i*) (src4))

// ---- SUB (AND-NOT) family -------------------------------------------------

#define VECT_SUB_BLOCK(dst, src) \
    sse2_sub_block((__m128i*) dst, (__m128i*) (src))

// digest-stride SUB: dst &= ~*src
#define VECT_SUB_DIGEST(dst, src) \
    sse2_sub_digest((__m128i*) dst, (const __m128i*) (src))

// digest-stride SUB: dst = *src1 & ~*src2
#define VECT_SUB_DIGEST_2WAY(dst, src1, src2) \
    sse2_sub_digest_2way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

#define VECT_SUB_DIGEST_5WAY(dst, src1, src2, src3, src4) \
    sse2_sub_digest_5way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2), (const __m128i*) (src3), (const __m128i*) (src4))

#define VECT_SUB_DIGEST_3WAY(dst, src1, src2) \
    sse2_sub_digest_3way((__m128i*) dst, (const __m128i*) (src1), (const __m128i*) (src2))

// ---- XOR family -----------------------------------------------------------

#define VECT_XOR_BLOCK(dst, src) \
    sse2_xor_block((__m128i*) dst, (__m128i*) (src))

#define VECT_XOR_BLOCK_2WAY(dst, src1, src2) \
    sse2_xor_block_2way((__m128i*) (dst), (const __m128i*) (src1), (const __m128i*) (src2))

// ---- block copy / store ---------------------------------------------------

#define VECT_COPY_BLOCK(dst, src) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src))

// unaligned-source variant
#define VECT_COPY_BLOCK_UNALIGN(dst, src) \
    sse2_copy_block_unalign((__m128i*) dst, (__m128i*) (src))

// non-temporal (cache-bypassing) store variants
#define VECT_STREAM_BLOCK(dst, src) \
    sse2_stream_block((__m128i*) dst, (__m128i*) (src))

#define VECT_STREAM_BLOCK_UNALIGN(dst, src) \
    sse2_stream_block_unalign((__m128i*) dst, (__m128i*) (src))

// fill a block with a scalar value
#define VECT_SET_BLOCK(dst, value) \
    sse2_set_block((__m128i*) dst, value)

// ---- predicates / digest helpers ------------------------------------------

// true if block is all zero bits
#define VECT_IS_ZERO_BLOCK(dst) \
    sse2_is_all_zero((__m128i*) dst)

// true if block is all ONE bits
#define VECT_IS_ONE_BLOCK(dst) \
    sse2_is_all_one((__m128i*) dst)

// true if digest stride is all zero bits
#define VECT_IS_DIGEST_ZERO(start) \
    sse2_is_digest_zero((__m128i*)start)

#define VECT_BLOCK_SET_DIGEST(dst, val) \
    sse2_block_set_digest((__m128i*)dst, val)

// ---- search / shift / GAP helpers -----------------------------------------

#define VECT_LOWER_BOUND_SCAN_U32(arr, target, from, to) \
    sse2_lower_bound_scan_u32(arr, target, from, to)

// block shift right by 1 with carry-over co
#define VECT_SHIFT_R1(b, acc, co) \
    sse2_shift_r1((__m128i*)b, acc, co)

// find first set bit at block offset off (result via *pos)
#define VECT_BIT_FIND_FIRST(src, off, pos) \
    sse2_bit_find_first((__m128i*) src, off, pos)

// find first differing bit between two blocks
#define VECT_BIT_FIND_DIFF(src1, src2, pos) \
    sse2_bit_find_first_diff((__m128i*) src1, (__m128i*) (src2), pos)

// partial XOR products driven by a digest mask d
#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d) \
    sse2_bit_block_xor(t, src, src_xor, d)

#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d) \
    sse2_bit_block_xor_2way(t, src_xor, d)

// hybrid binary/linear search in a GAP-compressed block
#define VECT_GAP_BFIND(buf, pos, is_set) \
    sse2_gap_bfind(buf, pos, is_set)

#define VECT_GAP_TEST(buf, pos) \
    sse2_gap_test(buf, pos)
1648 #pragma GCC diagnostic pop
Compute functions for SSE SIMD instruction set (internal)
Bit manipulation primitives (internal)
bool sse2_bit_find_first(const __m128i *block, unsigned off, unsigned *pos) noexcept
Find first non-zero bit.
bool sse2_sub_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
2-operand SUB (AND NOT) block digest stride: dst = *src1 & ~*src2
bm::id_t sse2_bit_count(const __m128i *block, const __m128i *block_end)
bool sse2_and_or_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND-OR block digest stride: dst |= *src1 & *src2.
void sse2_bit_block_xor_2way(bm::word_t *target_block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
unsigned sse2_gap_test(const unsigned short *buf, unsigned pos)
Hybrid binary search, starts as binary, then switches to linear scan.
bool sse2_and_digest_5way(__m128i *dst, const __m128i *src1, const __m128i *src2, const __m128i *src3, const __m128i *src4) noexcept
AND block digest stride.
bool sse2_and_digest_3way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND block digest stride.
bool sse2_and_digest_2way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
AND block digest stride dst = *src1 & src2.
bool sse2_is_all_one(const __m128i *block) noexcept
check if block is all ONE bits
bool sse2_shift_l1(__m128i *block, unsigned *empty_acc, unsigned co1) noexcept
block shift left by 1
bool sse2_and_digest(__m128i *dst, const __m128i *src) noexcept
AND block digest stride dst &= *src.
bool sse2_is_digest_zero(const __m128i *block) noexcept
check if digest stride is all zero bits
void sse2_bit_block_xor(bm::word_t *target_block, const bm::word_t *block, const bm::word_t *xor_block, bm::id64_t digest) noexcept
Build partial XOR product of 2 bit-blocks using digest mask.
bool sse2_is_all_zero(const __m128i *block) noexcept
check if block is all zero bits
unsigned sse2_gap_bfind(const unsigned short *buf, unsigned pos, unsigned *is_set)
Hybrid binary search, starts as binary, then switches to linear scan.
bool sse2_bit_find_first_diff(const __m128i *block1, const __m128i *block2, unsigned *pos) noexcept
Find first bit which is different between two bit-blocks.
void sse2_block_set_digest(__m128i *dst, unsigned value) noexcept
set digest stride to all 0xFF.. or all 0x0 depending on value
bool sse2_shift_r1(__m128i *block, unsigned *empty_acc, unsigned co1) noexcept
block shift right by 1
bool sse2_sub_digest(__m128i *dst, const __m128i *src) noexcept
SUB (AND NOT) block digest stride dst &= ~*src.
bool sse2_sub_digest_5way(__m128i *dst, const __m128i *src1, const __m128i *src2, const __m128i *src3, const __m128i *src4) noexcept
SUB block digest stride.
bool sse2_sub_digest_3way(__m128i *dst, const __m128i *src1, const __m128i *src2) noexcept
SUB block digest stride.
unsigned word_bitcount64(bm::id64_t x) noexcept
bm::id_t word_bitcount(bm::id_t w) noexcept
unsigned int
A callback function used to compare two keys in a database.
const unsigned set_block_digest_wave_size
bm::id_t sse2_bit_block_calc_count_change(const __m128i *block, const __m128i *block_end, unsigned *bit_count)
bm::id_t sse2_bit_count_op(const __m128i *block, const __m128i *block_end, const __m128i *mask_block, Func sse2_func)
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
unsigned sse2_gap_find(const bm::gap_word_t *pbuf, const bm::gap_word_t pos, unsigned size)
const unsigned set_block_size
unsigned long long int id64_t
const unsigned block_waves
unsigned bit_scan_forward32(unsigned w) noexcept
T bit_scan_fwd(T v) noexcept
unsigned short gap_word_t
unsigned long long bmi_blsi_u64(unsigned long long w)
const struct ncbi::grid::netcache::search::fields::SIZE size
static __m128i _mm_subs_epu16(__m128i a, __m128i b)
static __m128i _mm_setzero_si128()
static __m128i _mm_xor_si128(__m128i a, __m128i b)
static int _mm_cvtsi128_si32(__m128i a)
#define _mm_srli_epi32(a, imm)
static __m128i _mm_srli_si128(__m128i a, int imm)
static __m128i _mm_slli_epi64(__m128i a, int imm)
static int _mm_movemask_epi8(__m128i a)
static __m128i _mm_slli_si128(__m128i a, int imm)
static __m128i _mm_set1_epi16(short w)
static __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
static __m128i _mm_slli_epi32(__m128i a, int imm)
static __m128i _mm_loadu_si128(const __m128i *p)
static void _mm_store_si128(__m128i *p, __m128i a)
static __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
static __m128i _mm_load_si128(const __m128i *p)
static __m128i _mm_or_si128(__m128i, __m128i)
static __m128i _mm_add_epi32(__m128i a, __m128i b)
static __m128i _mm_set_epi32(int, int, int, int)
static __m128i _mm_set1_epi32(int)
static __m128i _mm_andnot_si128(__m128i a, __m128i b)
#define _mm_shuffle_epi32(a, imm)
static __m128i _mm_and_si128(__m128i, __m128i)
static __m128i _mm_cmpeq_epi32(__m128i, __m128i)