1 #ifndef BMSSE_UTIL__H__INCLUDED__
2 #define BMSSE_UTIL__H__INCLUDED__
35 #pragma GCC diagnostic push
36 #pragma GCC diagnostic ignored "-Wconversion"
91 }
while (src < src_end);
115 }
while (src < src_end);
130 __m128i accA, accB, accC, accD;
171 }
while (src < src_end);
180 return macc[0] | macc[1] | macc[2] | macc[3];
263 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
264 __m128i accA, accB, accC, accD;
298 }
while (src < src_end);
307 return macc[0] | macc[1] | macc[2] | macc[3];
316 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
317 __m128i accA, accB, accC, accD;
351 }
while (src < src_end);
360 return macc[0] | macc[1] | macc[2] | macc[3];
375 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
409 }
while (src < src_end);
416 return (maskA == 0xFFFFu);
430 __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
461 }
while (src < src_end);
467 return (maskA == 0xFFFFu);
505 src1 += 4; src2 += 4; dst += 4;
507 }
while (src1 < src_end1);
513 return (maskA == 0xFFFFu);
556 src1 += 4; src2 += 4; dst += 4;
558 }
while (src1 < src_end1);
564 return (maskA == 0xFFFFu);
620 src1 += 4; src2 += 4;
621 src3 += 4; src4 += 4;
628 }
while (src1 < src_end1);
634 return (maskA == 0xFFFFu);
650 __m128i accA, accB, accC, accD;
675 }
while (src < src_end);
683 return macc[0] | macc[1] | macc[2] | macc[3];
698 __m128i accA, accB, accC, accD;
722 src1 += 4; src2 += 4; dst += 4;
723 }
while (src1 < src1_end);
731 return macc[0] | macc[1] | macc[2] | macc[3];
748 __m128i accA, accB, accC, accD;
788 }
while (src < src_end);
797 return macc[0] | macc[1] | macc[2] | macc[3];
828 }
while (dst < dst_end);
841 __m128i xmm0, xmm1, xmm2, xmm3;
869 }
while (src < src_end);
882 __m128i xmm0, xmm1, xmm2, xmm3;
910 }
while (src < src_end);
924 __m128i xmm0, xmm1, xmm2, xmm3;
952 }
while (src < src_end);
965 __m128i xmm0, xmm1, xmm2, xmm3;
993 }
while (src < src_end);
1031 }
while (dst < (
__m128i*)dst_end);
1073 unsigned sse_vect_waves,
1078 for (
unsigned i = 0;
i < sse_vect_waves; ++
i)
1088 unsigned short* cnt8 = (
unsigned short*)&xcnt;
1089 *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
1110 unsigned unroll_factor = 8;
1111 unsigned len = to - from + 1;
1112 unsigned len_unr =
len - (
len % unroll_factor);
1119 __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;
1122 for (; k < len_unr; k+=unroll_factor)
1135 return from + k + (bsf / 4);
1148 return 4 + from + k + (bsf / 4);
1152 for (; k <
len; ++k)
1154 if (arr_base[k] >= target)
1162 #pragma GCC diagnostic pop
SSE2 reinitialization guard class.
BMFORCEINLINE sse_empty_guard() BMNOEXCEPT
BMFORCEINLINE ~sse_empty_guard() BMNOEXCEPT
static vector< string > arr
void sse2_copy_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SSE2 block copy dst = *src.
unsigned sse2_xor_block_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
3 operand XOR dst = *src1 ^ src2
bool sse2_or_block_5way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2, const __m128i *BMRESTRICT src3, const __m128i *BMRESTRICT src4) BMNOEXCEPT
OR array elements against another 4 arrays dst |= *src1 | src2 | src3 | src4.
void sse2_stream_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SSE2 block copy dst = *src.
void sse2_stream_block_unalign(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SSE2 block copy (unaligned src) dst = *src.
unsigned sse2_sub_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND-NOT (SUB) array elements against another array dst &= ~*src.
void sse2_xor_arr_2_mask(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end, bm::word_t mask) BMNOEXCEPT
XOR array elements to specified mask dst = *src ^ mask.
void sse2_copy_block_unalign(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
SSE2 block copy (unaligned SRC) dst = *src.
unsigned sse2_lower_bound_scan_u32(const unsigned *BMRESTRICT arr, unsigned target, unsigned from, unsigned to) BMNOEXCEPT
lower bound (greater or equal) linear scan in an array sorted in ascending order
unsigned sse2_xor_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
XOR block against another dst ^= *src.
bool sse2_or_arr_unal(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end) BMNOEXCEPT
OR array elements against another array (unaligned) dst |= *src.
unsigned sse2_and_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
AND two blocks dst &= *src.
unsigned sse2_and_arr_unal(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end) BMNOEXCEPT
AND array elements against another array (unaligned) dst &= *src.
void sse2_andnot_arr_2_mask(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src, const __m128i *BMRESTRICT src_end, bm::word_t mask) BMNOEXCEPT
Inverts array elements and ANDs them with the specified mask dst = ~*src & mask.
bool sse2_or_block_3way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
OR array elements against another 2 arrays dst |= *src1 | src2.
void sse2_set_block(__m128i *BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
SSE2 block memset dst = value.
void sse2_invert_block(__m128i *BMRESTRICT dst) BMNOEXCEPT
Invert bit block dst = ~*dst (equivalently, dst ^= all-ones mask).
bool sse2_or_block(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src) BMNOEXCEPT
OR array elements against another array dst |= *src.
bool sse2_or_block_2way(__m128i *BMRESTRICT dst, const __m128i *BMRESTRICT src1, const __m128i *BMRESTRICT src2) BMNOEXCEPT
OR 2 blocks and copy result to the destination dst = *src1 | src2.
BMFORCEINLINE __m128i sse2_or(__m128i a, __m128i b) BMNOEXCEPT
BMFORCEINLINE __m128i sse2_and(__m128i a, __m128i b) BMNOEXCEPT
const unsigned set_block_size
BMFORCEINLINE __m128i sse2_sub(__m128i a, __m128i b) BMNOEXCEPT
unsigned bit_scan_forward32(unsigned w) noexcept
unsigned short gap_word_t
BMFORCEINLINE __m128i sse2_xor(__m128i a, __m128i b) BMNOEXCEPT
const bm::gap_word_t * sse2_gap_sum_arr(const bm::gap_word_t *BMRESTRICT pbuf, unsigned sse_vect_waves, unsigned *sum) BMNOEXCEPT
Gap block population count (array sum) utility.
static __m128i _mm_setzero_si128()
static __m128i _mm_xor_si128(__m128i a, __m128i b)
static __m128i _mm_sub_epi16(__m128i a, __m128i b)
static void _mm_stream_si128(__m128i *p, __m128i a)
static __m128i _mm_add_epi16(__m128i a, __m128i b)
#define _mm_srli_epi32(a, imm)
static int _mm_movemask_epi8(__m128i a)
static __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
static __m128i _mm_loadu_si128(const __m128i *p)
static void _mm_store_si128(__m128i *p, __m128i a)
static __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
static void _mm_prefetch(const void *p, int i)
static __m128i _mm_load_si128(const __m128i *p)
static __m128i _mm_or_si128(__m128i, __m128i)
static __m128i _mm_sub_epi32(__m128i a, __m128i b)
static __m128i _mm_set1_epi32(int)
static __m128i _mm_andnot_si128(__m128i a, __m128i b)
static __m128i _mm_and_si128(__m128i, __m128i)
static __m128i _mm_cmpeq_epi32(__m128i, __m128i)