NCBI C++ Toolkit
bmsse_util.h

#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__
/*
Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

For more information please visit: http://bitmagic.io
*/

/*! \file bmsse_util.h
    \brief Compute functions for SSE SIMD instruction set (internal)
*/
namespace bm
{

/** @defgroup SSE2 SSE2 functions
    Processor specific optimizations for SSE2 instructions (internals)
    @internal
    @ingroup bvector
*/

#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#endif

/*!
    @brief SSE2 reinitialization guard class

    SSE2 requires a call to _mm_empty() when intermixing MMX integer
    instructions with floating-point arithmetic.
    This class guards critical code fragments where SSE2 integer
    instructions are used.

    As of 2015 _mm_empty() is considered deprecated, and is not even
    recognized by some compilers (like MSVC) in 64-bit mode.
    As MMX instructions age out, the use of _mm_empty() is deprecated
    here and commented out.

    @ingroup SSE2
*/
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard() BMNOEXCEPT
    {
        //_mm_empty();
    }

    BMFORCEINLINE ~sse_empty_guard() BMNOEXCEPT
    {
        //_mm_empty();
    }
};

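/*
    Usage sketch (illustrative, not part of the library): place the guard
    on the stack around a fragment that mixes SSE2 integer intrinsics with
    floating-point code; with _mm_empty() commented out it compiles to a
    no-op and merely documents the boundary.

        {
            bm::sse_empty_guard guard;
            // ... SSE2 integer intrinsics ...
        } // guard destructor runs here
*/
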
/*!
    @brief XOR array elements to specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
inline
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask) BMNOEXCEPT
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_xor_si128(_mm_load_si128(src+0), xM));
        _mm_store_si128(dst+1, _mm_xor_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_xor_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_xor_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}

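/*
    Usage sketch (illustrative; the buffer names are hypothetical). With
    mask == ~0u the call produces a bit-inverted copy, dst = ~src. Blocks
    are bm::set_block_size words, 16-byte aligned:

        BM_ALIGN16 bm::word_t src_blk[bm::set_block_size] BM_ALIGN16ATTR;
        BM_ALIGN16 bm::word_t dst_blk[bm::set_block_size] BM_ALIGN16ATTR;

        bm::sse2_xor_arr_2_mask(
            (__m128i*)dst_blk,
            (const __m128i*)src_blk,
            (const __m128i*)(src_blk + bm::set_block_size),
            ~0u);
*/
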
/*!
    @brief Invert array elements and AND them with specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
inline
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask) BMNOEXCEPT
{
    __m128i xM = _mm_set1_epi32((int)mask);
    do
    {
        _mm_store_si128(dst+0, _mm_andnot_si128(_mm_load_si128(src+0), xM)); // xmm1 = (~xmm1) & xM
        _mm_store_si128(dst+1, _mm_andnot_si128(_mm_load_si128(src+1), xM));
        _mm_store_si128(dst+2, _mm_andnot_si128(_mm_load_si128(src+2), xM));
        _mm_store_si128(dst+3, _mm_andnot_si128(_mm_load_si128(src+3), xM));
        dst += 4; src += 4;
    } while (src < src_end);
}

/*!
    @brief AND blocks
    *dst &= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);
        src += 4; dst += 4;


        m1A = _mm_and_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);
        src += 4; dst += 4;

    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

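/*
    Usage sketch (illustrative; blk_a/blk_b are hypothetical, 16-byte
    aligned blocks of bm::set_block_size words). The OR-accumulated return
    value lets the caller detect an empty intersection without a second
    pass over the block:

        unsigned any = bm::sse2_and_block((__m128i*)blk_a,
                                          (const __m128i*)blk_b);
        if (!any)
        {
            // blk_a is now all zero: it can be dropped/deallocated
        }
*/
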
/*
inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    const __m128i* BMRESTRICT src2 =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size/2);

    __m128i* BMRESTRICT dst2 =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size/2);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_and_si128(_mm_load_si128(src), _mm_load_si128(dst+0));
        m1B = _mm_and_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_and_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_and_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;

        m1A = _mm_and_si128(_mm_load_si128(src2), _mm_load_si128(dst2));
        m1B = _mm_and_si128(_mm_load_si128(src2+1), _mm_load_si128(dst2+1));
        m1C = _mm_and_si128(_mm_load_si128(src2+2), _mm_load_si128(dst2+2));
        m1D = _mm_and_si128(_mm_load_si128(src2+3), _mm_load_si128(dst2+3));

        _mm_store_si128(dst2, m1A);
        _mm_store_si128(dst2+1, m1B);
        _mm_store_si128(dst2+2, m1C);
        _mm_store_si128(dst2+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);


        src2 += 4; dst2 += 4;
    } while (src2 < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C


    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}
*/

/*!
    @brief AND array elements against another array (unaligned)
    *dst &= *src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_and_arr_unal(__m128i* BMRESTRICT dst,
                           const __m128i* BMRESTRICT src,
                           const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_loadu_si128(src+0);
        m2A = _mm_load_si128(dst+0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst+0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_loadu_si128(src+1);
        m2B = _mm_load_si128(dst+1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst+1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_loadu_si128(src+2);
        m2C = _mm_load_si128(dst+2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst+2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_loadu_si128(src+3);
        m2D = _mm_load_si128(dst+3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst+3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


inline
unsigned sse2_and_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src,
                        const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i accA, accB, accC, accD;

    accA = _mm_setzero_si128();
    accB = _mm_setzero_si128();
    accC = _mm_setzero_si128();
    accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_and_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);
        accA = _mm_or_si128(accA, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_and_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);
        accB = _mm_or_si128(accB, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_and_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);
        accC = _mm_or_si128(accC, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_and_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief OR array elements against another array
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_block(__m128i* BMRESTRICT dst,
                   const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_load_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_load_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_load_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_load_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));

    return (maskA == 0xFFFFu);
}

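/*
    Usage sketch (illustrative; hypothetical block names). A true return
    means every destination word is 0xFFFFFFFF, which a caller can use to
    collapse the block into a "full block" representation:

        bool all_one = bm::sse2_or_block((__m128i*)blk_a,
                                         (const __m128i*)blk_b);
        if (all_one)
        {
            // blk_a is completely set: replace with the FULL block marker
        }
*/
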
/*!
    @brief OR array elements against another array (unaligned)
    *dst |= *src
    @return true if all bits are 1
    @ingroup SSE2
*/
inline
bool sse2_or_arr_unal(__m128i* BMRESTRICT dst,
                      const __m128i* BMRESTRICT src,
                      const __m128i* BMRESTRICT src_end) BMNOEXCEPT
{
    __m128i m1A, m2A, m1B, m2B, m1C, m2C, m1D, m2D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    do
    {
        m1A = _mm_loadu_si128(src + 0);
        m2A = _mm_load_si128(dst + 0);
        m1A = _mm_or_si128(m1A, m2A);
        _mm_store_si128(dst + 0, m1A);

        m1B = _mm_loadu_si128(src + 1);
        m2B = _mm_load_si128(dst + 1);
        m1B = _mm_or_si128(m1B, m2B);
        _mm_store_si128(dst + 1, m1B);

        m1C = _mm_loadu_si128(src + 2);
        m2C = _mm_load_si128(dst + 2);
        m1C = _mm_or_si128(m1C, m2C);
        _mm_store_si128(dst + 2, m1C);

        m1D = _mm_loadu_si128(src + 3);
        m2D = _mm_load_si128(dst + 3);
        m1D = _mm_or_si128(m1D, m2D);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src += 4; dst += 4;
    } while (src < src_end);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR 2 blocks and copy result to the destination
    *dst = *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_2way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR array elements against another 2 arrays
    *dst |= *src1 | *src2
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_3way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4; dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

/*!
    @brief OR array elements against another 4 arrays
    *dst |= *src1 | *src2 | *src3 | *src4
    @return true if all bits are 1

    @ingroup SSE2
*/
inline
bool sse2_or_block_5way(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src1,
                        const __m128i* BMRESTRICT src2,
                        const __m128i* BMRESTRICT src3,
                        const __m128i* BMRESTRICT src4) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i mAccF0 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF
    __m128i mAccF1 = _mm_set1_epi32(~0u); // broadcast 0xFFFFFFFF

    const __m128i* BMRESTRICT src_end1 =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    do
    {
        m1A = _mm_or_si128(_mm_load_si128(src1 + 0), _mm_load_si128(dst + 0));
        m1B = _mm_or_si128(_mm_load_si128(src1 + 1), _mm_load_si128(dst + 1));
        m1C = _mm_or_si128(_mm_load_si128(src1 + 2), _mm_load_si128(dst + 2));
        m1D = _mm_or_si128(_mm_load_si128(src1 + 3), _mm_load_si128(dst + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src2 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src2 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src2 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src2 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src3 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src3 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src3 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src3 + 3));

        m1A = _mm_or_si128(m1A, _mm_load_si128(src4 + 0));
        m1B = _mm_or_si128(m1B, _mm_load_si128(src4 + 1));
        m1C = _mm_or_si128(m1C, _mm_load_si128(src4 + 2));
        m1D = _mm_or_si128(m1D, _mm_load_si128(src4 + 3));

        _mm_stream_si128(dst + 0, m1A);
        _mm_stream_si128(dst + 1, m1B);
        _mm_stream_si128(dst + 2, m1C);
        _mm_stream_si128(dst + 3, m1D);

        mAccF1 = _mm_and_si128(mAccF1, m1C);
        mAccF1 = _mm_and_si128(mAccF1, m1D);
        mAccF0 = _mm_and_si128(mAccF0, m1A);
        mAccF0 = _mm_and_si128(mAccF0, m1B);

        src1 += 4; src2 += 4;
        src3 += 4; src4 += 4;

        _mm_prefetch ((const char*)src3, _MM_HINT_T0);
        _mm_prefetch ((const char*)src4, _MM_HINT_T0);

        dst += 4;

    } while (src1 < src_end1);

    __m128i maskF = _mm_set1_epi32(~0u);
    mAccF0 = _mm_and_si128(mAccF0, mAccF1);
    __m128i wcmpA = _mm_cmpeq_epi8(mAccF0, maskF);
    unsigned maskA = unsigned(_mm_movemask_epi8(wcmpA));
    return (maskA == 0xFFFFu);
}

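/*
    Note: unlike the 2- and 3-way variants above, this function writes the
    result with _mm_stream_si128 (non-temporal stores that bypass the
    cache) and prefetches src3/src4, which favors large fan-in unions whose
    destination is not re-read immediately. Callers that need the streamed
    data to be visible to other threads right away would typically follow
    the call with an _mm_sfence(); whether that is required depends on the
    calling context.
*/
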
/*!
    @brief XOR block against another
    *dst ^= *src
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_xor_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        m1C = _mm_xor_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_xor_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));

        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

/*!
    @brief 3 operand XOR
    *dst = *src1 ^ *src2
    @return 0 if no bits were set
    @ingroup SSE2
*/
inline
unsigned sse2_xor_block_2way(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src1,
                             const __m128i* BMRESTRICT src2) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    const __m128i* BMRESTRICT src1_end =
        (const __m128i*)((bm::word_t*)(src1) + bm::set_block_size);

    accA = accB = accC = accD = _mm_setzero_si128();

    do
    {
        m1A = _mm_xor_si128(_mm_load_si128(src1 + 0), _mm_load_si128(src2 + 0));
        m1B = _mm_xor_si128(_mm_load_si128(src1 + 1), _mm_load_si128(src2 + 1));
        m1C = _mm_xor_si128(_mm_load_si128(src1 + 2), _mm_load_si128(src2 + 2));
        m1D = _mm_xor_si128(_mm_load_si128(src1 + 3), _mm_load_si128(src2 + 3));

        _mm_store_si128(dst + 0, m1A);
        _mm_store_si128(dst + 1, m1B);
        _mm_store_si128(dst + 2, m1C);
        _mm_store_si128(dst + 3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src1 += 4; src2 += 4; dst += 4;
    } while (src1 < src1_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}

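/*
    Usage sketch (illustrative; hypothetical buffers): XOR is a cheap way
    to compute a difference mask of two blocks, and the zero return doubles
    as an equality test:

        unsigned diff = bm::sse2_xor_block_2way((__m128i*)diff_blk,
                                                (const __m128i*)blk_a,
                                                (const __m128i*)blk_b);
        if (!diff)
        {
            // blk_a and blk_b are identical bit-for-bit
        }
*/
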
/*!
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @return 0 if no bits were set

    @ingroup SSE2
*/
inline
unsigned sse2_sub_block(__m128i* BMRESTRICT dst,
                        const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i m1A, m1B, m1C, m1D;
    __m128i accA, accB, accC, accD;

    accA = accB = accC = accD = _mm_setzero_si128();

    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;

        m1A = _mm_andnot_si128(_mm_load_si128(src+0), _mm_load_si128(dst+0));
        m1B = _mm_andnot_si128(_mm_load_si128(src+1), _mm_load_si128(dst+1));
        _mm_store_si128(dst+0, m1A);
        _mm_store_si128(dst+1, m1B);
        m1C = _mm_andnot_si128(_mm_load_si128(src+2), _mm_load_si128(dst+2));
        m1D = _mm_andnot_si128(_mm_load_si128(src+3), _mm_load_si128(dst+3));
        _mm_store_si128(dst+2, m1C);
        _mm_store_si128(dst+3, m1D);

        accA = _mm_or_si128(accA, m1A);
        accB = _mm_or_si128(accB, m1B);
        accC = _mm_or_si128(accC, m1C);
        accD = _mm_or_si128(accD, m1D);

        src += 4; dst += 4;
    } while (src < src_end);

    accA = _mm_or_si128(accA, accB); // A = A | B
    accC = _mm_or_si128(accC, accD); // C = C | D
    accA = _mm_or_si128(accA, accC); // A = A | C

    bm::id_t BM_ALIGN16 macc[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)macc, accA);
    return macc[0] | macc[1] | macc[2] | macc[3];
}


/*!
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
inline
void sse2_set_block(__m128i* BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
{
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i xmm0 = _mm_set1_epi32(int(value));
    do
    {
        _mm_store_si128(dst, xmm0);
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
    } while (dst < dst_end);
}

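/*
    Usage sketch (illustrative; dst_blk is a hypothetical aligned block):

        bm::sse2_set_block((__m128i*)dst_blk, 0u);   // clear all bits
        bm::sse2_set_block((__m128i*)dst_blk, ~0u);  // set all bits
*/
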
/*!
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
inline
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}

/*!
    @brief SSE2 block copy (unaligned SRC)
    *dst = *src

    @ingroup SSE2
*/
inline
void sse2_copy_block_unalign(__m128i* BMRESTRICT dst,
                             const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_loadu_si128(src+0);
        xmm1 = _mm_loadu_si128(src+1);
        xmm2 = _mm_loadu_si128(src+2);
        xmm3 = _mm_loadu_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_loadu_si128(src+4);
        xmm1 = _mm_loadu_si128(src+5);
        xmm2 = _mm_loadu_si128(src+6);
        xmm3 = _mm_loadu_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}


/*!
    @brief SSE2 block copy (non-temporal stream store)
    *dst = *src

    @ingroup SSE2
*/
inline
void sse2_stream_block(__m128i* BMRESTRICT dst,
                       const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}

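/*
    Note: this copy is functionally identical to sse2_copy_block but uses
    non-temporal _mm_stream_si128 stores, so the destination block is not
    pulled into the cache. Prefer it for large write-once copies; use
    sse2_copy_block when the destination will be read again soon.
*/
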
/*!
    @brief SSE2 block copy (non-temporal stream store, unaligned src)
    *dst = *src

    @ingroup SSE2
*/
inline
void sse2_stream_block_unalign(__m128i* BMRESTRICT dst,
                               const __m128i* BMRESTRICT src) BMNOEXCEPT
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    const __m128i* BMRESTRICT src_end =
        (const __m128i*)((bm::word_t*)(src) + bm::set_block_size);

    do
    {
        xmm0 = _mm_loadu_si128(src+0);
        xmm1 = _mm_loadu_si128(src+1);
        xmm2 = _mm_loadu_si128(src+2);
        xmm3 = _mm_loadu_si128(src+3);

        _mm_stream_si128(dst+0, xmm0);
        _mm_stream_si128(dst+1, xmm1);
        _mm_stream_si128(dst+2, xmm2);
        _mm_stream_si128(dst+3, xmm3);

        xmm0 = _mm_loadu_si128(src+4);
        xmm1 = _mm_loadu_si128(src+5);
        xmm2 = _mm_loadu_si128(src+6);
        xmm3 = _mm_loadu_si128(src+7);

        _mm_stream_si128(dst+4, xmm0);
        _mm_stream_si128(dst+5, xmm1);
        _mm_stream_si128(dst+6, xmm2);
        _mm_stream_si128(dst+7, xmm3);

        src += 8; dst += 8;

    } while (src < src_end);
}


/*!
    @brief Invert bit block
    *dst = ~*dst
    (implemented as XOR against an all-ones mask)

    @ingroup SSE2
*/
inline
void sse2_invert_block(__m128i* BMRESTRICT dst) BMNOEXCEPT
{
    __m128i maskF = _mm_set1_epi32(~0u);
    __m128i* BMRESTRICT dst_end =
        (__m128i*)((bm::word_t*)(dst) + bm::set_block_size);

    __m128i mA, mB, mC, mD;
    do
    {
        mA = _mm_load_si128(dst + 0);
        mB = _mm_load_si128(dst + 1);
        mA = _mm_xor_si128(mA, maskF);
        mB = _mm_xor_si128(mB, maskF);
        _mm_store_si128(dst + 0, mA);
        _mm_store_si128(dst + 1, mB);

        mC = _mm_load_si128(dst + 2);
        mD = _mm_load_si128(dst + 3);
        mC = _mm_xor_si128(mC, maskF);
        mD = _mm_xor_si128(mD, maskF);
        _mm_store_si128(dst + 2, mC);
        _mm_store_si128(dst + 3, mD);

        dst += 4;

    } while (dst < dst_end);
}

BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b) BMNOEXCEPT
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b) BMNOEXCEPT
{
    return _mm_or_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b) BMNOEXCEPT
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b) BMNOEXCEPT
{
    return _mm_andnot_si128(b, a); // a & ~b
}

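/*
    Identity sketch (illustrative): given two __m128i values a and b
    (hypothetical), the wrappers compose like ordinary bitwise algebra;
    for example (a | b) & ~(a & b) == a ^ b:

        __m128i x = bm::sse2_sub(bm::sse2_or(a, b), bm::sse2_and(a, b));
        // x equals bm::sse2_xor(a, b)
*/
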
/*!
    @brief Gap block population count (array sum) utility
    @param pbuf - unrolled, aligned to 1-start GAP buffer
    @param sse_vect_waves - number of SSE vector lines to process
    @param sum - result accumulator
    @return tail pointer

    @internal
    @ingroup SSE2
*/
inline
const bm::gap_word_t* sse2_gap_sum_arr(
    const bm::gap_word_t* BMRESTRICT pbuf,
    unsigned sse_vect_waves,
    unsigned* sum) BMNOEXCEPT
{
    __m128i xcnt = _mm_setzero_si128();

    for (unsigned i = 0; i < sse_vect_waves; ++i)
    {
        __m128i mm0 = _mm_loadu_si128((__m128i*)(pbuf - 1));
        __m128i mm1 = _mm_loadu_si128((__m128i*)(pbuf + 8 - 1));
        __m128i mm_s2 = _mm_add_epi16(mm1, mm0);
        xcnt = _mm_add_epi16(xcnt, mm_s2);
        pbuf += 16;
    }
    xcnt = _mm_sub_epi16(_mm_srli_epi32(xcnt, 16), xcnt);

    unsigned short* cnt8 = (unsigned short*)&xcnt;
    *sum += (cnt8[0]) + (cnt8[2]) + (cnt8[4]) + (cnt8[6]);
    return pbuf;
}

/**
    lower bound (greater or equal) linear scan in ascending order sorted array
    @ingroup SSE2
    \internal
*/
inline
unsigned sse2_lower_bound_scan_u32(const unsigned* BMRESTRICT arr,
                                   unsigned target,
                                   unsigned from,
                                   unsigned to) BMNOEXCEPT
{
    // a > b (unsigned, 32-bit) is the same as
    // (a - 0x80000000) > (b - 0x80000000) (signed, 32-bit)
    // see more at:
    // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/

    const unsigned* BMRESTRICT arr_base = &arr[from]; // unrolled search base

    unsigned unroll_factor = 8;
    unsigned len = to - from + 1;
    unsigned len_unr = len - (len % unroll_factor);

    __m128i mask0x8 = _mm_set1_epi32(0x80000000);
    __m128i vect_target = _mm_set1_epi32(target);
    __m128i norm_target = _mm_sub_epi32(vect_target, mask0x8); // (signed) target - 0x80000000

    int mask;
    __m128i vect40, vect41, norm_vect40, norm_vect41, cmp_mask_ge;

    unsigned k = 0;
    for (; k < len_unr; k += unroll_factor)
    {
        vect40 = _mm_loadu_si128((__m128i*)(&arr_base[k])); // 4 u32s
        norm_vect40 = _mm_sub_epi32(vect40, mask0x8); // (signed) vect4 - 0x80000000

        cmp_mask_ge = _mm_or_si128( // GT | EQ
            _mm_cmpgt_epi32(norm_vect40, norm_target),
            _mm_cmpeq_epi32(vect40, vect_target)
        );
        mask = _mm_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            int bsf = bm::bit_scan_forward32(mask); //_bit_scan_forward(mask);
            return from + k + (bsf / 4);
        }
        vect41 = _mm_loadu_si128((__m128i*)(&arr_base[k+4]));
        norm_vect41 = _mm_sub_epi32(vect41, mask0x8);

        cmp_mask_ge = _mm_or_si128(
            _mm_cmpgt_epi32(norm_vect41, norm_target),
            _mm_cmpeq_epi32(vect41, vect_target)
        );
        mask = _mm_movemask_epi8(cmp_mask_ge);
        if (mask)
        {
            int bsf = bm::bit_scan_forward32(mask); //_bit_scan_forward(mask);
            return 4 + from + k + (bsf / 4);
        }
    } // for

    for (; k < len; ++k)
    {
        if (arr_base[k] >= target)
            return from + k;
    }
    return to + 1;
}

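/*
    Usage sketch (illustrative; the array content is hypothetical): find
    the first element greater than or equal to the target in a sorted
    unsigned array.

        unsigned arr[] = { 1, 5, 9, 12, 50, 70, 70, 80, 100, 200 };
        unsigned pos = bm::sse2_lower_bound_scan_u32(arr, 70u, 0u, 9u);
        // pos == 5 (first 70); if all elements were smaller than the
        // target the function would return to + 1 == 10
*/
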
#ifdef __GNUG__
#pragma GCC diagnostic pop
#endif


} // namespace bm



#endif