NCBI C++ ToolKit
bmxor.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef BMXORFUNC__H__INCLUDED__
2 #define BMXORFUNC__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmxor.h
22  \brief Functions and utilities for XOR filters (internal)
23 */
24 
25 #include "bmdef.h"
26 #include "bmutil.h"
27 
28 
29 namespace bm
30 {
31 
32 /**
33  Parameters for XOR similarity search.
34  Tuneup params allows to reduce the search space to better
35  balance compression rate vs speed.
36 
37  min_lookup_depth - default: 0, minimal search depth to still try to improve
38  max_lookup_depth - default: (2billion) maximum scan search
39  Set a smaller number to improve search time
40 
41  Example:
42  min_lookup_depth = 64; max_lookup_depth = 1024;
43 
44  stop_gain - default: 65533 - cutoff to stop search when serch found gain
45  better than "stop_gain" absolute value
46  Example:
47  stop_gain = 10000;
48 
49  target_gain_ratio - default: 0.89 - cut off ratio between original block cost
50  metric and XOR basedd metric to stop searching.
51  (90% better than the original block is "good enough")
52  Example:
53  target_gain_ratio = 0.50; // 50% improvement is good enough
54 
55  min_gaps - default: 3 - minimal size of GAP block to be considered for
56  XOR search candidate
57  */
59 {
60  unsigned min_lookup_depth;
61  unsigned max_lookup_depth;
62  unsigned stop_gain;
64  unsigned min_gaps;
65 
67  : min_lookup_depth(0),
68  max_lookup_depth(~0u/2),
70  target_gain_ratio(0.89f),
71  min_gaps(3)
72  {}
73 };
74 
75 
76 /**
77  XOR complementarity type between 2 blocks
78  @internal
79  */
81 {
87 };
88 
89 /*!
90  Function (32-bit) calculates basic complexity statistics on XOR product of
91  two blocks (b1 XOR b2)
92  @ingroup bitfunc
93  @internal
94 */
95 inline
97  const bm::word_t* BMRESTRICT xor_block,
98  unsigned size,
99  unsigned* BMRESTRICT gc,
100  unsigned* BMRESTRICT bc) BMNOEXCEPT
101 {
102  BM_ASSERT(gc && bc);
103 
104  unsigned gap_count = 1;
105  unsigned bit_count = 0;
106 
107  bm::word_t w, w0, w_prev, w_l;
108  w = w0 = *block ^ *xor_block;
109  bit_count += word_bitcount(w);
110 
111  const int w_shift = int(sizeof(w) * 8 - 1);
112  w ^= (w >> 1);
113  gap_count += bm::word_bitcount(w);
114  gap_count -= (w_prev = (w0 >> w_shift)); // negative value correction
115 
116  const bm::word_t* block_end = block + size;
117  for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
118  {
119  w = w0 = *block ^ *xor_block;
120  bit_count += bm::word_bitcount(w);
121  ++gap_count;
122  if (!w)
123  {
124  gap_count -= !w_prev;
125  w_prev = 0;
126  }
127  else
128  {
129  w ^= (w >> 1);
130  gap_count += bm::word_bitcount(w);
131 
132  w_l = w0 & 1;
133  gap_count -= (w0 >> w_shift); // negative value correction
134  gap_count -= !(w_prev ^ w_l); // word border correction
135 
136  w_prev = (w0 >> w_shift);
137  }
138  } // for
139 
140  *gc = gap_count;
141  *bc = bit_count;
142 }
143 
144 /*!
145  Function (64-bit) calculates basic complexity statistics on XOR product of
146  two blocks (b1 XOR b2)
147  @ingroup bitfunc
148  @internal
149 */
150 inline
152  const bm::word_t* BMRESTRICT ref_block,
153  unsigned size,
154  unsigned* BMRESTRICT gc,
155  unsigned* BMRESTRICT bc) BMNOEXCEPT
156 {
157  BM_ASSERT(gc && bc);
158 
159  unsigned gap_count = 1;
160  unsigned bit_count = 0;
161 
162  const bm::id64_t* BMRESTRICT block = (const bm::id64_t*) s_block;
163  const bm::id64_t* BMRESTRICT xor_block = (const bm::id64_t*) ref_block;
164 
165  bm::id64_t w, w0, w_prev, w_l;
166  w = w0 = *block ^ *xor_block;
167  bit_count += word_bitcount64(w);
168 
169  const int w_shift = int(sizeof(w) * 8 - 1);
170  w ^= (w >> 1);
171  gap_count += bm::word_bitcount64(w);
172  gap_count -= unsigned(w_prev = (w0 >> w_shift)); // negative value correction
173 
174  const bm::id64_t* block_end = block + (size/2);
175  for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
176  {
177  w = w0 = *block ^ *xor_block;
178  bit_count += bm::word_bitcount64(w);
179  ++gap_count;
180  if (!w)
181  {
182  gap_count -= !w_prev;
183  w_prev = 0;
184  }
185  else
186  {
187  w ^= (w >> 1);
188  gap_count += bm::word_bitcount64(w);
189  w_l = w0 & 1;
190  gap_count -= unsigned(w0 >> w_shift); // negative value correction
191  gap_count -= !(w_prev ^ w_l); // word border correction
192  w_prev = (w0 >> w_shift);
193  }
194  } // for
195 
196  *gc = gap_count;
197  *bc = bit_count;
198 }
199 
200 
201 
202 /*!
203  Function calculates number of times when bit value changed
204  @internal
205 */
206 inline
208  const bm::word_t* BMRESTRICT xor_block,
209  unsigned size,
210  unsigned* BMRESTRICT gc,
211  unsigned* BMRESTRICT bc) BMNOEXCEPT
212 {
213 #ifdef VECT_BLOCK_XOR_CHANGE
214  VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc);
215 #else
216  #ifdef BM64OPT
217  bm::bit_block_xor_change64(block, xor_block, size, gc, bc);
218  #else
219  bm::bit_block_xor_change32(block, xor_block, size, gc, bc);
220  #endif
221 #endif
222 }
223 
224 /**
225  Structure to compute XOR gap-count profile by sub-block waves
226  @ingroup bitfunc
227  @internal
228 */
230 {
231  // stats for s-block
232  unsigned short sb_gc[bm::block_waves]; ///< GAP counts
233  unsigned short sb_bc[bm::block_waves]; ///< BIT counts
234 
235  // stats ref.block XOR mask
236  unsigned short sb_xor_gc[bm::block_waves]; ///< XOR-mask GAP count
237  unsigned short sb_xor_bc[bm::block_waves]; ///< XOR-mask GAP count
238 };
239 
240 /**
241  Capture the XOR filter results (xor block against ref.block)
242  @internal
243  */
245 {
247 
249  unsigned block_gain; ///< XOR filter improvement (best)
250  size_type ref_idx; ///< reference vector index
251  bm::id64_t xor_d64; ///< recorded digest
252 
253  // sub-block waves masks for metrics
257 
258  // recorded gains for metrics
259  unsigned gc_gain;
260  unsigned bc_gain;
261  unsigned ibc_gain;
262 
263  unsigned xor_gc;
264  unsigned xor_bc;
265 
267 };
268 
269 /**
270  XOR match pair
271  @internal
272  */
274 {
275  bvector_size_type ref_idx; ///< reference vector index
276  bm::id64_t xor_d64; ///< recorded digest
277 
280  : ref_idx(idx), xor_d64(d64)
281  {}
282 
283 };
284 
285 /**
286  XOR match chain
287  @internal
288  */
289 template<typename BLOCK_IDX> struct block_match_chain
290 {
291  BLOCK_IDX nb;
292  unsigned chain_size;
293  unsigned ref_idx[64];
296 
297  bool operator==(const block_match_chain& bmc) const BMNOEXCEPT
298  {
299  if (nb != bmc.nb || chain_size != bmc.chain_size || match != bmc.match)
300  {
301  return false;
302  }
303  for (unsigned i = 0; i < chain_size; ++i)
304  {
305  if (ref_idx[i] != bmc.ref_idx[i])
306  return false;
307  if (match == e_xor_match_EQ)
308  continue;
309  if (xor_d64[i] != bmc.xor_d64[i])
310  return false;
311  }
312  return true;
313  }
314 };
315 
316 /**
317  Greedy algorithm to find additional matches
318  improving the inital best match block on its match type
319 
320  @param match_pairs_vect - [out] target vector of best match pairs
321  @param match_vect - [in/out] vector of all found match descriptors
322 
323  @return number of new finds (if any)
324  */
325 template<typename PVT, typename VT>
326 typename VT::size_type
327 greedy_refine_match_vector(PVT& match_pairs_vect,
328  VT& match_vect,
329  typename VT::size_type best_ref_idx,
330  bm::id64_t d64,
331  bm::xor_complement_match match_type)
332 {
333  BM_ASSERT(match_type && d64);
334  match_pairs_vect.resize(0);
335 
336  bm::id64_t d64_acc(d64);
337 
338  // pass 1 (exact match)
339  //
340  typename VT::size_type sz = match_vect.size();
341  for (typename VT::size_type i = 0; (i < sz) && (d64_acc != ~0ull); ++i)
342  {
343  block_xor_match_descr& xmd = match_vect[i];
344  if (xmd.ref_idx == best_ref_idx) // self hit
345  continue;
346  if (xmd.match_type == match_type) // best compatible match types
347  {
348  bm::id64_t d64_new = ~d64_acc & xmd.xor_d64;
349  if (d64_new)
350  {
351  d64_acc |= d64_new; // add mask to accum
352  match_pairs_vect.push_back(bm::match_pair(xmd.ref_idx, d64_new));
353  }
354  xmd.match_type = e_no_xor_match; // mark it to exclude from pass 2
355  }
356  } // for i
357 
358  // pass 2 (extended match)
359  //
360  const unsigned min_gain_cut_off = 50;
361  for (typename VT::size_type i = 0; (i < sz) && (d64_acc != ~0ull); ++i)
362  {
363  block_xor_match_descr& xmd = match_vect[i];
364  if (xmd.ref_idx == best_ref_idx || !xmd.match_type) // self hit or none
365  continue;
366  BM_ASSERT(xmd.match_type != match_type);
367 
368  bm::id64_t d64_new = 0;
369  switch (match_type)
370  {
371  case e_xor_match_GC:
372  if (xmd.gc_gain > min_gain_cut_off)
373  d64_new = ~d64_acc & xmd.gc_d64;
374  break;
375  case e_xor_match_BC:
376  if (xmd.bc_gain > min_gain_cut_off)
377  d64_new = ~d64_acc & xmd.bc_d64;
378  break;
379  case e_xor_match_iBC:
380  if (xmd.ibc_gain > min_gain_cut_off)
381  d64_new = ~d64_acc & xmd.ibc_d64;
382  break;
383  default:
384  break;
385  } // switch
386 
387  if (d64_new) // some improvement found
388  {
389  d64_acc |= d64_new; // add mask to accum
390  match_pairs_vect.push_back(bm::match_pair(xmd.ref_idx, d64_new));
392  }
393  } // for
394 
395  return match_pairs_vect.size();
396 }
397 
398 /**
399  Check effective bit-rate for the XOR encode vector
400  @return 1 - < 256 (8bit), 2 - < 65536 (16-bit) or 0 - 32-bit
401  @internal
402  */
403 template<typename BMChain, typename RVect>
404 unsigned char check_pair_vect_vbr(const BMChain& mchain, const RVect& ref_vect)
405 {
406  size_t max_idx = 0;
407  for (size_t i = 0; i < mchain.chain_size; ++i)
408  {
409  bvector_size_type ridx = mchain.ref_idx[i];
410  ridx = ref_vect.get_row_idx(ridx);
411  if (ridx > max_idx)
412  max_idx = ridx;
413  } // for i
414  if (max_idx < 256)
415  return 1;
416  if (max_idx < 65536)
417  return 2;
418  return 0;
419 }
420 
421 
422 /**
423  Compute reference (non-XOR) 64-dim complexity descriptor for the
424  s-block.
425  Phase 1 of the XOR filtering process is to establish the base metric
426 
427  @internal
428 */
429 inline
432  unsigned* BMRESTRICT s_gc,
433  unsigned* BMRESTRICT s_bc) BMNOEXCEPT
434 {
435  *s_gc = *s_bc = 0;
436  // TODO: SIMD (for loop can go inside VECT to minimize LUT re-inits)
437  for (unsigned i = 0; i < bm::block_waves; ++i)
438  {
439  unsigned off = (i * bm::set_block_digest_wave_size);
440  const bm::word_t* sub_block = block + off;
441  unsigned gc, bc;
442  // TODO: optimize to compute GC and BC in a single pass
443  #if defined(VECT_BLOCK_CHANGE)
445  #else
446  #ifdef BM64OPT
449  #else
451  #endif
452  #endif
454  sub_block, sub_block + bm::set_block_digest_wave_size);
455  x_descr.sb_bc[i] = (unsigned short) bc;
456  *s_bc += bc;
457  if (i) // wave border correction
458  {
459  bm::word_t w_l = sub_block[-1];
460  bm::word_t w_r = sub_block[0] & 1;
461  w_l >>= 31;
462  gc -= (w_l == w_r);
463  }
464  x_descr.sb_gc[i] = (unsigned short) gc;
465  *s_gc += gc;
466 
467  } // for i
468  // TODO: compute and return d64 - use it later
469 }
470 
471 
472 /**
473  Build partial XOR product of 2 bit-blocks using digest mask
474 
475  @param target_block - target := block ^ xor_block
476  @param block - arg1
477  @param xor_block - arg2
478  @param digest - mask for each block wave to XOR (1) or just copy (0)
479 
480  @internal
481 */
482 inline
483 void bit_block_xor(bm::word_t* target_block,
484  const bm::word_t* block,
485  const bm::word_t* xor_block,
486  bm::id64_t digest) BMNOEXCEPT
487 {
488  BM_ASSERT(target_block);
489  BM_ASSERT(block);
490  BM_ASSERT(xor_block);
491  BM_ASSERT(digest);
492 
493 #ifdef VECT_BIT_BLOCK_XOR
494  VECT_BIT_BLOCK_XOR(target_block, block, xor_block, digest);
495 #else
496  for (unsigned i = 0; i < bm::block_waves; ++i)
497  {
498  const bm::id64_t mask = (1ull << i);
499  unsigned off = (i * bm::set_block_digest_wave_size);
500 
501  #ifdef BM64OPT
502  const bm::id64_t* sub_block = (const bm::id64_t*)(block + off);
503  bm::id64_t* t_sub_block = (bm::id64_t*)(target_block + off);
504  const bm::id64_t* sub_block_end = sub_block + bm::set_block_digest_wave_size/2;
505  if (digest & mask) // XOR filtered sub-block
506  {
507  const bm::id64_t* xor_sub_block = (const bm::id64_t*)(xor_block + off);
508  for ( ;sub_block < sub_block_end; )
509  {
510  t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
511  t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
512  t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
513  t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
514  t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
515  } // for
516  }
517  else // just copy the source
518  {
519  for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
520  {
521  t_sub_block[0] = sub_block[0];
522  t_sub_block[1] = sub_block[1];
523  t_sub_block[2] = sub_block[2];
524  t_sub_block[3] = sub_block[3];
525  } // for
526  }
527  #else
528  const bm::word_t* sub_block = block + off;
529  bm::word_t* t_sub_block = target_block + off;
530  const bm::word_t* sub_block_end = sub_block + bm::set_block_digest_wave_size;
531  if (digest & mask) // XOR filtered sub-block
532  {
533  const bm::word_t* xor_sub_block = xor_block + off;
534  for (; sub_block < sub_block_end; )
535  {
536  t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
537  t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
538  t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
539  t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
540  t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
541  } // for
542  }
543  else // just copy the source
544  {
545  for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
546  {
547  t_sub_block[0] = sub_block[0];
548  t_sub_block[1] = sub_block[1];
549  t_sub_block[2] = sub_block[2];
550  t_sub_block[3] = sub_block[3];
551  } // for
552  }
553  #endif
554  } // for i
555 #endif
556 }
557 
558 
559 /**
560  Build partial XOR product of 2 bit-blocks using digest mask
561 
562  @param target_block - target := target ^ xor_block
563  @param xor_block - arg
564  @param digest - mask for each block wave to XOR (1)
565 
566  @internal
567 */
568 inline
569 void bit_block_xor(bm::word_t* target_block, const bm::word_t* xor_block,
570  bm::id64_t digest) BMNOEXCEPT
571 {
572  BM_ASSERT(target_block);
573  BM_ASSERT(xor_block);
574  BM_ASSERT(digest);
575 
576  #ifdef VECT_BIT_BLOCK_XOR_2WAY
577  VECT_BIT_BLOCK_XOR_2WAY(target_block, xor_block, digest);
578  #else
579  while (digest)
580  {
581  bm::id64_t t = bm::bmi_blsi_u64(digest); // d & -d;
582  unsigned wave = bm::word_bitcount64(t - 1);
583  unsigned off = wave * bm::set_block_digest_wave_size;
584 
585  #ifdef BM64OPT
586  const bm::id64_t* xor_sub_block = (const bm::id64_t*)(xor_block + off);
587  bm::id64_t* t_sub_block = (bm::id64_t*)(target_block + off);
588  const bm::id64_t* t_sub_block_end = (t_sub_block + bm::set_block_digest_wave_size/2);
589  for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
590  {
591  t_sub_block[0] ^= xor_sub_block[0];
592  t_sub_block[1] ^= xor_sub_block[1];
593  t_sub_block[2] ^= xor_sub_block[2];
594  t_sub_block[3] ^= xor_sub_block[3];
595  } // for
596  #else
597  const bm::word_t* xor_sub_block = xor_block + off;
598  bm::word_t* t_sub_block = target_block + off;
599  const bm::word_t* t_sub_block_end = t_sub_block + bm::set_block_digest_wave_size;
600  for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
601  {
602  t_sub_block[0] ^= xor_sub_block[0];
603  t_sub_block[1] ^= xor_sub_block[1];
604  t_sub_block[2] ^= xor_sub_block[2];
605  t_sub_block[3] ^= xor_sub_block[3];
606  } // for
607  #endif
608  digest = bm::bmi_bslr_u64(digest); // d &= d - 1;
609  } // while
610  #endif
611 }
612 
613 /**
614  List of reference bit-vectors with their true index associations
615 
616  Each referece vector would have two alternative indexes:
617  - index(position) in the reference list
618  - index(row) in the external bit-matrix (plane index)
619 
620  @internal
621 */
622 template<typename BV>
624 {
625 public:
626  typedef BV bvector_type;
627  typedef typename bvector_type::size_type size_type;
630  typedef typename bvector_type::allocator_type bv_allocator_type;
631 
632 
634  typedef
637 
638 public:
639 
640  /// reset the collection (resize(0))
641  void reset()
642  {
643  rows_acc_ = 0;
645  }
646 
647  /**
648  Add reference vector
649  @param bv - bvector pointer
650  @param ref_idx - reference (row) index
651  */
652  void add(const bvector_type* bv, size_type ref_idx)
653  {
654  BM_ASSERT(bv);
656  ref_bvects_rows_.push_back(ref_idx);
657  }
658 
659  /// Get reference list size
661 
662  /// Get reference vector by the index in this ref-vector
664  { return ref_bvects_[idx]; }
665 
666  /// Get reference row index by the index in this ref-vector
668  { return (size_type)ref_bvects_rows_[idx]; }
669 
670  /// not-found value for find methods
671  static
673 
674  /// Find vector index by the reference index
675  /// @return ~0 if not found
676  size_type find(std::size_t ref_idx) const BMNOEXCEPT
677  {
678  size_type sz = size();
679  for (size_type i = 0; i < sz; ++i) // TODO: optimization
680  if (ref_idx == ref_bvects_rows_[i])
681  return i;
682  return not_found();
683  }
684 
685  /// Find vector index by the pointer
686  /// @return ~0 if not found
688  {
689  size_type sz = size();
690  for (size_type i = 0; i < sz; ++i)
691  if (bv == ref_bvects_[i])
692  return i;
693  return not_found();
694  }
695 
696  /// Fill block allocation digest for all vectors in the reference collection
697  /// @param bv_blocks - [out] bvector of blocks statistics
698  ///
699  void fill_alloc_digest(bvector_type& bv_blocks) const
700  {
701  size_type sz = size();
702  if (sz)
703  {
704  for (size_type i = 0; i < sz; ++i)
705  ref_bvects_[i]->fill_alloc_digest(bv_blocks);
707  bv_blocks.optimize(tb);
708  }
709  }
710 
711  /// Reset and build vector of references from a basic bit-matrix
712  /// all NULL rows are skipped, not added to the ref.vector
713  /// @sa add_vectors
714  ///
715  template<class BMATR>
716  void build(const BMATR& bmatr)
717  {
718  reset();
719  add_vectors(bmatr);
720  }
721 
722  /// Append basic bit-matrix to the list of reference vectors
723  /// @sa build
724  /// @sa add_sparse_vector
725  template<typename BMATR>
726  void add_vectors(const BMATR& bmatr)
727  {
728  size_type rows = bmatr.rows();
729  for (size_type r = 0; r < rows; ++r)
730  if (bvector_type_const_ptr bv = bmatr.get_row(r))
731  add(bv, rows_acc_ + r);
732  rows_acc_ += unsigned(rows);
733  }
734 
735  /// Add bit-transposed sparse vector as a bit-matrix
736  /// @sa add_vectors
737  ///
738  template<class SV>
739  void add_sparse_vector(const SV& sv)
740  {
741  add_vectors(sv.get_bmatrix());
742  }
743 
744  /** Utility function to resize matrix based on number of vectors and blocks
745  */
747  size_type total_blocks) const
748  {
749  if (total_blocks)
750  matr.resize(ref_bvects_.size(), total_blocks, false /*no-copy*/);
751  else
752  matr.resize(0, 0);
753  }
754 
755  /** Calculate blocks digest and resize XOR distance matrix
756  based on total number of available blocks
757  @return true if created ok (false if no blocks found)
758  */
760  bvector_type& bv_blocks) const
761  {
762  fill_alloc_digest(bv_blocks);
763  size_type cnt = bv_blocks.count();
764  if (!cnt)
765  return false;
766  resize_xor_matrix(matr, cnt);
767  return true;
768  }
769 
770 protected:
773 
774 protected:
775  unsigned rows_acc_ = 0; ///< total rows accumulator
776  bvptr_vector_type ref_bvects_; ///< reference vector pointers
777  bv_plane_vector_type ref_bvects_rows_; ///< reference vector row idxs
778 };
779 
780 // --------------------------------------------------------------------------
781 //
782 // --------------------------------------------------------------------------
783 
784 /**
785  XOR similarity model
786 
787  @internal
788 */
789 template<typename BV>
791 {
792 public:
793  typedef BV bvector_type;
794  typedef typename bvector_type::size_type size_type;
795  typedef typename bvector_type::allocator_type bv_allocator_type;
796 
797 
799  typedef
802 
803 public:
804  matrix_chain_type matr; ///< model matrix
805  bvector_type bv_blocks; ///< blocks digest
806 };
807 
808 // --------------------------------------------------------------------------
809 //
810 // --------------------------------------------------------------------------
811 
812 /**
813  XOR scanner to search for complement-similarities in
814  collections of bit-vectors
815 
816  @internal
817 */
818 template<typename BV>
820 {
821 public:
823  typedef BV bvector_type;
824  typedef typename bvector_type::allocator_type bv_allocator_type;
825  typedef typename bvector_type::size_type size_type;
826 
837 
840 
841 public:
842 
845  {
846  free_blocks();
847  }
848 
849 
851  { ref_vect_ = ref_vect; }
852 
854  { return *ref_vect_; }
855 
856  /** Get statistics for the r-(or s-) block
857  @param ri - nb cache index
858  */
860 
861  /** Compute statistics for the r-(or s-) block
862  @param block - bit-block target
863  */
865 
866  /** Scan for all candidate bit-blocks to find mask or match
867  @return XOR referenece match type
868  */
871  size_type ri,
872  size_type ridx_from,
873  size_type ridx_to,
874  unsigned i, unsigned j,
875  bm::word_t* tx_block,
876  const bm::xor_sim_params& params);
877  /**
878  Run a search to add possible XOR match chain additions
879  */
881 
882 
883  /**
884  XOR all match blocks to target using their digest masks
885  */
887  bm::word_t* target_xor_block,
888  const bm::word_t* s_block,
889  size_type s_ri,
890  const match_pairs_vector_type& pm_vect,
891  unsigned i, unsigned j) const BMNOEXCEPT;
892 
893  /**
894  Calculate matrix of best XOR match metrics per block
895  for the attached collection of bit-vectors
896  @return true if computed successfully
897  */
900  const bm::xor_sim_params& params);
901 
902  /**
903  Calculate matrix of best XOR match metrics per block
904  for the attached collection of bit-vectors
905  @return true if computed successfully
906  */
908  const bm::xor_sim_params& params);
909 
910  /**
911  Compute similarity model for block
912  */
914  typename xor_sim_model<BV>::matrix_chain_type &sim_model_matr,
915  size_type nb,
916  size_type nb_rank,
917  const bm::xor_sim_params& params);
918  /**
919  Compute reference complexity descriptor based on XOR vector.
920  Returns the digest of sub-blocks where XOR filtering improved the metric
921  (function needs reference to estimate the improvement).
922 
923  part of Phase 2 of the XOR filtering process
924 
925  @sa compute_sub_block_complexity_descr
926 
927  @internal
928  */
930  const bm::word_t* BMRESTRICT block,
931  bm::id64_t block_d64,
932  const bm::word_t* BMRESTRICT xor_block,
935 
936  /**
937  Check if XOR transform simplified block enough for
938  compressibility objective
939  */
940  bool validate_xor(const bm::word_t* xor_block) const BMNOEXCEPT;
941 
943 
944  /// Return best match type of a found block
945  ///
947  { return x_block_mtype_; }
948 
950  { return found_block_xor_; }
953 
954  unsigned get_s_bc() const BMNOEXCEPT { return s_bc_; }
955  unsigned get_s_gc() const BMNOEXCEPT { return s_gc_; }
957  { return s_block_best_metric_; }
958 
959 
961 
962  static
963  bm::xor_complement_match best_metric(unsigned bc, unsigned gc,
964  unsigned* best_metric) BMNOEXCEPT;
965 
967  { return match_vect_; }
968 
970  { return chain_match_vect_; }
971 
972  /// Return block from the reference vector [vect_idx, block_i, block_j]
973  ///
975  unsigned i, unsigned j) const BMNOEXCEPT
976  { return ref_vect_->get_bv(ri)->get_blocks_manager().get_block_ptr(i, j); }
977 
978  /// Sync TEMP vector size
979  /// @internal
980  void sync_nb_vect();
981 
982 protected:
983 
984  /// Deoptimize vertical slice of GAP blocks
985  /// @param nb - block number
986  ///
988  const xor_sim_params& params);
989 
990  /// Free the collection of temp blocks
992 
993 
994 private:
996  xor_scanner& operator=(const xor_scanner&) = delete;
997 
998 private:
1000 
1001  const bv_ref_vector_type* ref_vect_ = 0; ///< ref.vect for XOR filter
1002  bv_blocks_vector_type nb_blocks_vect_; ///< pointers to temp blocks
1006 
1007  bv_allocator_type alloc_; ///< allocator to produce blocks
1008 
1009  bm::block_waves_xor_descr x_descr_; ///< XOR desriptor
1010 
1011  // S-block statistics
1012  //
1013  unsigned s_bc_; ///< bitcount
1014  unsigned s_gc_; ///< gap count
1015  unsigned s_block_best_metric_; ///< s-block orig.metric
1016 
1017  unsigned x_best_metric_; ///< min(gc, bc, ibc)
1019 
1020  // scan related metrics
1021  bm::id64_t x_d64_; ///< search digest
1022  size_type found_ridx_; ///< match vector (in references)
1024 
1025  // match chain members:
1026  //
1027  xor_matches_vector_type match_vect_; ///< vector of match descr
1028  match_pairs_vector_type chain_match_vect_; ///< refined match pairs
1029 };
1030 
1031 // --------------------------------------------------------------------------
1032 //
1033 // --------------------------------------------------------------------------
1034 //
1035 // Naming conventions and glossary:
1036 //
1037 // s_block - source serialization block to be XOR filtered (anchor block)
1038 // best_ref_block - best reference block picked for XOR transform
1039 // xt_block - s_block XOR best_ref_block
1040 //
1041 
1042 
1043 
1044 template<typename BV>
1046 {
1047  x_descr_ = nb_xdescr_vect_[ri];
1048  s_gc_ = nb_gc_vect_[ri];
1049  s_bc_ = nb_bc_vect_[ri];
1050 
1053 }
1054 
1055 
1056 template<typename BV>
1058 {
1059  BM_ASSERT(IS_VALID_ADDR(block));
1060  BM_ASSERT(!BM_IS_GAP(block));
1061 
1063 
1066 }
1067 // --------------------------------------------------------------------------
1068 
1069 template<typename BV>
1071  const bm::word_t* BMRESTRICT block,
1072  bm::id64_t block_d64,
1073  const bm::word_t* BMRESTRICT xor_block,
1076 {
1077  bm::id64_t d0 = ~block_d64;
1078 
1079  xmd.xor_gc = xmd.xor_bc = 0;
1080 
1081  // Pass 1: compute XOR descriptors
1082  //
1083  for (unsigned i = 0; i < bm::block_waves; ++i)
1084  {
1085  unsigned off = (i * bm::set_block_digest_wave_size);
1086  const bm::word_t* sub_block = block + off;
1087  const bm::word_t* xor_sub_block = xor_block + off;
1088 
1089  unsigned xor_gc, xor_bc;
1090  bm::bit_block_xor_change(sub_block, xor_sub_block,
1092  &xor_gc, &xor_bc);
1093  x_descr.sb_xor_bc[i] = (unsigned short)xor_bc;
1094  BM_ASSERT(xor_gc);
1095  if (i) // wave border correction
1096  {
1097  bm::word_t w_l = (sub_block[-1] ^ xor_sub_block[-1]);
1098  bm::word_t w_r = (sub_block[0] ^ xor_sub_block[0]) & 1;
1099  w_l >>= 31;
1100  xor_gc -= (w_l == w_r);
1101  }
1102  x_descr.sb_xor_gc[i] = (unsigned short)xor_gc;
1103 
1104  xmd.xor_bc += xor_bc;
1105  xmd.xor_gc += xor_gc;
1106 
1107  } // for i
1108 
1109  // Pass 2: find the best match
1110  //
1111  unsigned block_gc_gain(0), block_bc_gain(0), block_ibc_gain(0);
1112  bm::id64_t gc_digest(0), bc_digest(0), ibc_digest(0);
1113  const unsigned wave_max_bits = bm::set_block_digest_wave_size * 32;
1114 
1115  for (unsigned i = 0; i < bm::block_waves; ++i)
1116  {
1117  bm::id64_t dmask = (1ull << i);
1118  if (d0 & dmask)
1119  continue;
1120 
1121  unsigned xor_gc = x_descr.sb_xor_gc[i];
1122  if (xor_gc <= 1)
1123  {
1124  gc_digest |= dmask;
1125  block_gc_gain += x_descr.sb_gc[i]; // all previous GAPs are gone
1126  }
1127  else if (xor_gc < x_descr.sb_gc[i]) // some improvement in GAPs
1128  {
1129  gc_digest |= dmask;
1130  block_gc_gain += (x_descr.sb_gc[i] - xor_gc);
1131  }
1132  unsigned xor_bc = x_descr.sb_xor_bc[i];
1133  if (xor_bc < x_descr.sb_bc[i]) // improvement in BITS
1134  {
1135  bc_digest |= dmask;
1136  block_bc_gain += (x_descr.sb_bc[i] - xor_bc);
1137  }
1138  unsigned xor_ibc = wave_max_bits - xor_bc;
1139  unsigned wave_ibc = wave_max_bits - x_descr.sb_bc[i];
1140  if (xor_ibc < wave_ibc) // improvement in 0 BITS
1141  {
1142  ibc_digest |= dmask;
1143  block_ibc_gain += (wave_ibc - xor_ibc);
1144  }
1145 
1146  } // for i
1147 
1148  // Save metric results into XOR match descriptor
1149  //
1150 
1151  xmd.gc_d64 = gc_digest;
1152  xmd.bc_d64 = bc_digest;
1153  xmd.ibc_d64 = ibc_digest;
1154 
1155  xmd.gc_gain = block_gc_gain;
1156  xmd.bc_gain = block_bc_gain;
1157  xmd.ibc_gain = block_ibc_gain;
1158 
1159 
1160  // Find the winning metric and its digest mask
1161  //
1162  if (!(block_gc_gain | block_bc_gain | block_ibc_gain)) // match not found
1163  {
1164  // this is to check if xor filter has
1165  // canceled out a whole sub-block wave
1166  //
1167  bm::id64_t d0_x = ~bm::calc_block_digest0(xor_block);
1168  if (d0 == d0_x)
1169  {
1170  xmd.match_type = bm::e_xor_match_GC;
1171  xmd.block_gain = bm::block_waves;
1172  xmd.xor_d64 = d0;
1173  return;
1174  }
1175  xmd.match_type = bm::e_no_xor_match;
1176  xmd.block_gain = 0; xmd.xor_d64 = 0;
1177  return;
1178  }
1179 
1180  int new_gc = int(s_gc_) - int(block_gc_gain);
1181  if (new_gc < 0)
1182  new_gc = 0;
1183  int new_bc = int(s_bc_) - int(block_bc_gain);
1184  if (new_bc < 0)
1185  new_bc = 0;
1186  int new_ibc = int(bm::gap_max_bits) - int(s_bc_) - int(block_ibc_gain);
1187  if (new_ibc < 0)
1188  new_ibc = 0;
1189 
1190 
1191  unsigned best_m;
1192  xmd.match_type = best_metric(unsigned(new_bc), unsigned(new_gc), &best_m);
1193  switch (xmd.match_type)
1194  {
1195  case e_xor_match_GC:
1196  if (new_ibc < new_gc)
1197  {
1198  xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1199  }
1200  else
1201  {
1202  xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1203  }
1204  break;
1205  case e_xor_match_BC:
1206  if (new_ibc < new_bc)
1207  {
1208  xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1209  }
1210  else
1211  {
1212  xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1213  }
1214  break;
1215  case e_xor_match_iBC:
1216  xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1217  break;
1218  default:
1219  break;
1220  } // switch
1221 #if 0
1222  // Disabled as cases compression ratio degradation in some tests
1223  if (!xmd.xor_d64) // best metric choice did not work try best gain
1224  {
1225  if (block_gc_gain >= block_bc_gain && block_gc_gain >= block_ibc_gain)
1226  {
1227  xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1228  }
1229  else
1230  if (block_bc_gain > block_gc_gain && block_bc_gain > block_ibc_gain)
1231  {
1232  xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1233  }
1234  else
1235  {
1236  xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1237  }
1238  }
1239 #endif
1240 }
1241 
1242 // --------------------------------------------------------------------------
1243 
1244 template<typename BV>
1247  size_type s_ri,
1248  size_type ridx_from,
1249  size_type ridx_to,
1250  unsigned i, unsigned j,
1251  bm::word_t* tx_block,
1252  const bm::xor_sim_params& params)
1253 {
1254  BM_ASSERT(ridx_from <= ridx_to);
1255  BM_ASSERT(IS_VALID_ADDR(s_block));
1256  BM_ASSERT(tx_block);
1257 
1258  if (ridx_to > ref_vect_->size())
1259  ridx_to = ref_vect_->size();
1260 
1262  bm::id64_t d64 = 0;
1263  found_block_xor_ = 0;
1264 
1265  unsigned best_block_gain = 0;
1266  int best_ri = -1;
1267 
1268  match_vect_.resize(0);
1269 
1270  unsigned s_gc(0);
1271  bool s_gap = BM_IS_GAP(s_block);
1272  if (s_gap)
1273  {
1274  const bm::gap_word_t* gap_s_block = BMGAP_PTR(s_block);
1275  s_gc = bm::gap_length(gap_s_block);
1276  if (s_gc <= 3)
1277  return e_no_xor_match;
1278  s_block = nb_blocks_vect_.at(s_ri);
1279  BM_ASSERT(s_block);
1280  }
1281  bm::id64_t s_block_d64 = bm::calc_block_digest0(s_block);
1282 
1283  // scan pass: over reference vectors
1284  //
1285  if (ridx_to > ridx_from + params.max_lookup_depth)
1286  ridx_to = ridx_from + params.max_lookup_depth;
1287 
1288  size_type depth = 0;
1289  for (size_type ri = ridx_from; ri < ridx_to; ++ri, ++depth)
1290  {
1291  const bm::word_t* ref_block = get_ref_block(ri, i, j);
1292  if (BM_IS_GAP(ref_block))
1293  {
1294  const bm::gap_word_t* gap_ref_block = BMGAP_PTR(ref_block);
1295  unsigned r_gc = bm::gap_length(gap_ref_block);
1296  if (r_gc <= 3)
1297  continue;
1298 
1299  if (s_gap) // both blocks GAPs - check if XOR does not make sense
1300  {
1301  if (s_gc < r_gc) // S-GAP block is shorter than Ref BLOCK
1302  {
1303  unsigned gc_diff = r_gc - s_gc;
1304  if (gc_diff >= s_gc) // cannot improve anything
1305  continue;
1306  }
1307  }
1308 
1309  if (nb_blocks_vect_.size() > ri)
1310  ref_block = nb_blocks_vect_[ri];
1311  }
1312  if (!IS_VALID_ADDR(ref_block))
1313  continue;
1314 
1315  BM_ASSERT(s_block != ref_block);
1316 
1318  compute_xor_complexity_descr(s_block, s_block_d64, ref_block, x_descr_, xmd);
1319  if (xmd.xor_d64) // candidate XOR block found
1320  {
1321  xmd.ref_idx = ri;
1322  BM_ASSERT(xmd.match_type);
1323  if (xmd.block_gain > best_block_gain)
1324  {
1325  // check if "it is good enough"
1327 
1328  best_block_gain = xmd.block_gain;
1329  best_ri = int(ri);
1330  d64 = xmd.xor_d64;
1331  if (xmd.block_gain >= (bm::gap_max_bits-3))
1332  break;
1333  unsigned curr_best;
1334  //bm::xor_complement_match match =
1335  best_metric(xmd.xor_bc, xmd.xor_gc, &curr_best);
1336  float gain_ratio =
1337  float(xmd.block_gain) / float(s_block_best_metric_);
1338 
1339  if (depth >= params.min_lookup_depth &&
1340  x_block_mtype_ == xmd.match_type)
1341  {
1342  if (xmd.block_gain >= params.stop_gain)
1343  break;
1344  if (gain_ratio > params.target_gain_ratio)
1345  break;
1346  }
1347  }
1348  match_vect_.push_back(xmd); // place into vector of matches
1349  }
1350  } // for ri
1351 
1352  found_ridx_ = size_type(best_ri);
1353  x_d64_ = d64;
1354 
1355  if (best_ri != -1) // found some gain, validate it now
1356  {
1357  // assumed that if XOR compression c_level is at the highest
1358  const float bie_bits_per_int = 3.0f; // c_level_ < 6 ? 3.75f : 3.0f;
1359  const unsigned bie_limit =
1360  unsigned(float(bm::gap_max_bits) / bie_bits_per_int);
1361 
1362  unsigned xor_bc, xor_gc;
1363  const bm::word_t* ref_block = get_ref_block(size_type(best_ri), i, j);
1364  bool r_gap = BM_IS_GAP(ref_block);
1365  if (r_gap)
1366  ref_block = nb_blocks_vect_[size_type(best_ri)];
1367  found_block_xor_ = ref_block;
1368 
1369  // TODO: one pass operation?
1370  bm::bit_block_xor(tx_block, s_block, ref_block, d64);
1371  bm::bit_block_change_bc(tx_block, &xor_gc, &xor_bc);
1372 
1373  if (!xor_bc) // check if completely identical block?
1374  {
1375  x_best_metric_ = xor_bc;
1376  found_ridx_ = size_type(best_ri);
1377  x_block_mtype_ = rb_found = e_xor_match_BC;
1378 
1379  unsigned block_pos;
1380  bool found = bm::block_find_first_diff(s_block, ref_block, &block_pos);
1381  if (!found)
1382  {
1383  x_block_mtype_ = rb_found = e_xor_match_EQ; x_d64_ = 0;
1384  }
1385  }
1386  else // find the best matching metric (GC, BC, iBC, ..)
1387  {
1388  rb_found = best_metric(xor_bc, xor_gc, &x_best_metric_);
1389 
1390  // double check if XOR improves compression
1391  // with accounted serialization overhead
1392  //
1393  if (rb_found)
1394  {
1395  if (x_best_metric_ > bie_limit ||
1397  return e_no_xor_match;
1398 
1399  unsigned gain = get_s_block_best() - x_best_metric_;
1400  gain *= 3; // use bit estimate (speculative)
1401  // gain should be greater than overhead for storing
1402  // reference data: xor token, digest-64, block idx
1403  unsigned gain_min = unsigned(sizeof(char) + sizeof(unsigned));
1404  if (d64 != ~0ull) // if mask is all 1s - it is not used
1405  gain_min += (unsigned)sizeof(bm::id64_t);
1406  else
1407  {
1408  if (x_best_metric_ <= 1)
1409  return rb_found;
1410  }
1411  gain_min *= 8; // in bits
1412  if (gain > gain_min)
1413  return rb_found;
1414 
1415  return e_no_xor_match;
1416  }
1417  }
1418  }
1419  return rb_found;
1420 }
1421 
1422 // --------------------------------------------------------------------------
1423 
1424 template<typename BV>
1426 {
1427  size_type match_size = 0;
1428  if (x_d64_ == ~0ull || !x_d64_)
1429  return match_size;
1431  match_size = (size_type)
1434  return match_size;
1435 }
1436 
1437 // --------------------------------------------------------------------------
1438 
1439 template<typename BV>
1442  const bm::xor_sim_params& params)
1443 {
1444  const bv_ref_vector_type* ref_vect_curr = this->ref_vect_; // save ref-vect
1445 
1446  ref_vect_ = &ref_vect;
1447  bool sim_ok = compute_sim_model(sim_model, params);
1448 
1449  ref_vect_ = ref_vect_curr; // restore state
1450  return sim_ok;
1451 }
1452 
1453 template<typename BV>
1455  const bm::xor_sim_params& params)
1456 {
1458 
1459  sim_model.bv_blocks.clear(false);
1460  bool ret = ref_vect_->build_nb_digest_and_xor_matrix(sim_model.matr,
1461  sim_model.bv_blocks);
1462  if (!ret)
1463  return ret;
1464 
1465  sync_nb_vect();
1466 
1467  typename bvector_type::enumerator en(sim_model.bv_blocks);
1468  for (size_type col = 0; en.valid(); ++en, ++col)
1469  {
1470  size_type nb = *en;
1471  compute_sim_model(sim_model.matr, nb, col, params);
1472  } // for en
1473  return true;
1474 }
1475 
1476 // --------------------------------------------------------------------------
1477 
1478 template<typename BV>
1480  typename xor_sim_model<BV>::matrix_chain_type &sim_model_matr,
1481  size_type nb,
1482  size_type nb_rank,
1483  const bm::xor_sim_params& params)
1484 {
1485  BM_ASSERT(nb_rank < sim_model_matr.cols());
1486 
1487  const float bie_bits_per_int = 3.0f; // c_level_ < 6 ? 3.75f : 3.0f;
1488  const unsigned bie_limit =
1489  unsigned(float(bm::gap_max_bits) / bie_bits_per_int);
1490 
1491  deoptimize_gap_blocks(nb, params);
1492 
1493  size_type rsize = ref_vect_->size();
1494 
1495  unsigned i0, j0;
1496  bm::get_block_coord(nb, i0, j0);
1497 
1498  for (size_type ri=0; true; ++ri)
1499  {
1500  bm::block_match_chain<size_type>* m_row = sim_model_matr.row(ri);
1501  bm::block_match_chain<size_type>& bmc = m_row[nb_rank];
1502  bmc.nb = nb;
1503  bmc.chain_size = 0;
1504  bmc.match = e_no_xor_match;
1505 
1506  if (ri == rsize-1)
1507  break;
1508 
1509  const bm::word_t* s_block = get_ref_block(ri, i0, j0);
1510  if (!IS_VALID_ADDR(s_block))
1511  continue;
1512 
1513  if (BM_IS_GAP(s_block))
1514  {
1515  const bm::gap_word_t* gap_s_block = BMGAP_PTR(s_block);
1516  unsigned gc = bm::gap_length(gap_s_block);
1517  if (gc <= params.min_gaps)
1518  continue;
1519  }
1520 
1521  // compute s-block best metrics
1522  get_s_block_stats(ri);
1523  if (s_block_best_metric_ < 3)
1524  continue;
1525 
1526  // scan ref-vector plains for best similarity
1527  //
1528  bmc.match = search_best_xor_mask(s_block, ri, ri+1, rsize,
1529  i0, j0, xor_tmp_block_, params);
1530 
1531  // take index in the ref-vector (not translated to a plain number)
1532  //
1533  size_type ridx = found_ridx();
1534  bm::id64_t d64 = get_xor_digest();
1535 
1536  size_type chain_size = 0;
1537  if (d64 && d64 != ~0ULL) // something found
1538  {
1539  chain_size = refine_match_chain();
1540  }
1541  switch (bmc.match)
1542  {
1543  case e_no_xor_match:
1544  if (chain_size) // try chain compression for improve
1545  {
1547  BM_ASSERT(chain_size == pm_vect.size());
1548 
1550  s_block, ri,
1551  pm_vect, i0, j0);
1552  unsigned xor_bc, xor_gc;
1553  bm::bit_block_change_bc(xor_tmp_block_, &xor_gc, &xor_bc);
1554 
1555  bmc.match = best_metric(xor_bc, xor_gc, &x_best_metric_);
1556  if (bmc.match)
1557  {
1558  if ((x_best_metric_ > bie_limit) ||
1560  {
1561  bmc.match = e_no_xor_match;
1562  continue;
1563  }
1564  for (size_type k = 0; k < chain_size; ++k)
1565  {
1566  const bm::match_pair& mp = pm_vect[k];
1567  bmc.ref_idx[k] = (unsigned)mp.ref_idx;
1568  bmc.xor_d64[k] = mp.xor_d64;
1569  bmc.chain_size++;
1570  } // for k
1571  }
1572  }
1573  break;
1574  case e_xor_match_EQ:
1575  bmc.chain_size++;
1576  bmc.ref_idx[0] = unsigned(ridx);
1577  break;
1578  default: // improving match found
1579  bmc.chain_size++;
1580  bmc.ref_idx[0] = unsigned(ridx);
1581  bmc.xor_d64[0] = d64;
1582 
1583  if (chain_size) // match chain needs no verification
1584  {
1586  BM_ASSERT(chain_size == pm_vect.size());
1587  auto sz = pm_vect.size();
1588  for (size_type k = 0; k < sz; ++k)
1589  {
1590  BM_ASSERT(k < 64);
1591  const bm::match_pair& mp = pm_vect[k];
1592  bmc.ref_idx[k+1] = (unsigned)mp.ref_idx;
1593  bmc.xor_d64[k+1] = mp.xor_d64;
1594  bmc.chain_size++;
1595  } // for k
1596  }
1597 
1598  } // switch
1599 
1600  } // for ri
1601 }
1602 
1603 
1604 // --------------------------------------------------------------------------
1605 
1606 template<typename BV>
1608  bm::word_t* target_xor_block,
1609  const bm::word_t* s_block,
1610  size_type s_ri,
1611  const match_pairs_vector_type& pm_vect,
1612  unsigned i, unsigned j) const BMNOEXCEPT
1613 {
1614  bool s_gap = BM_IS_GAP(s_block);
1615  if (s_gap)
1616  {
1617  s_block = nb_blocks_vect_.at(s_ri);
1618  BM_ASSERT(s_block);
1619  }
1620  auto sz = pm_vect.size();
1621  for (typename match_pairs_vector_type::size_type k = 0; k < sz; ++k)
1622  {
1623  const bm::match_pair& mp = pm_vect[k];
1624  const bm::word_t* ref_block = get_ref_block(mp.ref_idx, i, j);
1625  if (BM_IS_GAP(ref_block))
1626  {
1627  ref_block = nb_blocks_vect_[mp.ref_idx];
1628  BM_ASSERT(ref_block);
1629  }
1630  if (!k)
1631  bm::bit_block_xor(target_xor_block, s_block, ref_block, mp.xor_d64);
1632  else
1633  bm::bit_block_xor(target_xor_block, ref_block, mp.xor_d64);
1634  } // for k
1635 }
1636 
1637 // --------------------------------------------------------------------------
1638 
1639 template<typename BV>
1641 xor_scanner<BV>::best_metric(unsigned bc, unsigned gc,
1642  unsigned* best_metric) BMNOEXCEPT
1643 {
1645  unsigned ibc = bm::gap_max_bits - bc;
1646  if (!ibc)
1647  {
1648  *best_metric = gc;
1649  return e_xor_match_GC;
1650  }
1651  if (gc < bc) // GC < BC
1652  {
1653  if (gc <= ibc)
1654  {
1655  *best_metric = gc;
1656  return e_xor_match_GC;
1657  }
1658  }
1659  else // GC >= BC
1660  {
1661  if (bc <= ibc)
1662  {
1663  *best_metric = bc;
1664  return e_xor_match_BC;
1665  }
1666  }
1667  *best_metric = ibc;
1668  return e_xor_match_iBC;
1669 }
1670 
1671 // --------------------------------------------------------------------------
1672 
1673 
1674 template<typename BV>
1676 {
1677  size_t sz = nb_blocks_vect_.size();
1678  for (size_t i = 0; i < sz; ++i)
1679  {
1680  bm::word_t* blk = nb_blocks_vect_[i];
1681  if (blk)
1682  alloc_.free_bit_block(blk);
1683  }
1685 }
1686 
1687 // --------------------------------------------------------------------------
1688 
1689 template<typename BV>
1691  const xor_sim_params& params)
1692 {
1693  size_type rsize = ref_vect_->size();
1694  BM_ASSERT(nb_blocks_vect_.size() == rsize);
1695  unsigned i0, j0;
1696  bm::get_block_coord(nb, i0, j0);
1697 
1698  for (size_type ri=0; ri < rsize; ++ri)
1699  {
1700  const bm::word_t* block = get_ref_block(ri, i0, j0);
1701  if (BM_IS_GAP(block))
1702  {
1703  const bm::gap_word_t* gap_block = BMGAP_PTR(block);
1704  unsigned gc = bm::gap_length(gap_block);
1705  if (gc <= params.min_gaps)
1706  continue;
1707 
1708  bm::word_t* t_block = nb_blocks_vect_.at(ri);
1709  if (!t_block)
1710  {
1711  t_block = alloc_.alloc_bit_block();
1712  nb_blocks_vect_[ri] = t_block;
1713  }
1714  bm::gap_convert_to_bitset(t_block, BMGAP_PTR(block));
1715  block = t_block;
1716  }
1717  if (!IS_VALID_ADDR(block))
1718  continue;
1719 
1720  block_waves_xor_descr& x_descr = nb_xdescr_vect_[ri];
1721  unsigned gc, bc;
1722  bm::compute_s_block_descr(block, x_descr, &gc, &bc);
1723  nb_gc_vect_[ri] = gc;
1724  nb_bc_vect_[ri] = bc;
1725  } // for ri
1726 
1727 }
1728 
1729 // --------------------------------------------------------------------------
1730 
1731 template<typename BV>
1733 {
1734  size_type rsize = ref_vect_->size();
1735  if (nb_blocks_vect_.size() == rsize)
1736  return;
1737  free_blocks();
1738  nb_blocks_vect_.resize(rsize);
1739  bm::word_t** vect_data = nb_blocks_vect_.data();
1740  for (size_type i = 0; i < rsize; ++i)
1741  vect_data[i] = 0;
1742  nb_gc_vect_.resize(rsize);
1743  nb_bc_vect_.resize(rsize);
1744  nb_xdescr_vect_.resize(rsize);
1745 }
1746 
1747 // --------------------------------------------------------------------------
1748 
1749 } // namespace bm
1750 
1751 #endif
ncbi::TMaskedQueryRegions mask
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d)
Definition: bmavx2.h:3573
#define VECT_BLOCK_CHANGE(block, size)
Definition: bmavx2.h:3555
#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d)
Definition: bmavx2.h:3576
#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc)
Definition: bmavx2.h:3558
Definitions(internal)
#define IS_VALID_ADDR(addr)
Definition: bmdef.h:161
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BMGAP_PTR(ptr)
Definition: bmdef.h:189
#define BM_IS_GAP(ptr)
Definition: bmdef.h:191
#define BM_ASSERT
Definition: bmdef.h:139
Bit manipulation primitives (internal)
List of reference bit-vectors with their true index associations.
Definition: bmxor.h:624
const bvector_type * get_bv(size_type idx) const noexcept
Get reference vector by the index in this ref-vector.
Definition: bmxor.h:663
bvector_type::size_type size_type
Definition: bmxor.h:627
void add_vectors(const BMATR &bmatr)
Append basic bit-matrix to the list of reference vectors.
Definition: bmxor.h:726
bm::block_match_chain< size_type > block_match_chain_type
Definition: bmxor.h:633
void fill_alloc_digest(bvector_type &bv_blocks) const
Fill block allocation digest for all vectors in the reference collection.
Definition: bmxor.h:699
bm::heap_vector< std::size_t, bv_allocator_type, true > bv_plane_vector_type
Definition: bmxor.h:772
static size_type not_found() noexcept
not-found value for find methods
Definition: bmxor.h:672
bvector_type * bvector_type_ptr
Definition: bmxor.h:628
bool build_nb_digest_and_xor_matrix(matrix_chain_type &matr, bvector_type &bv_blocks) const
Calculate blocks digest and resize XOR distance matrix based on total number of available blocks.
Definition: bmxor.h:759
size_type find(std::size_t ref_idx) const noexcept
Find vector index by the reference index.
Definition: bmxor.h:676
size_type find_bv(const bvector_type *bv) const noexcept
Find vector index by the pointer.
Definition: bmxor.h:687
const bvector_type * bvector_type_const_ptr
Definition: bmxor.h:629
void add(const bvector_type *bv, size_type ref_idx)
Add reference vector.
Definition: bmxor.h:652
void reset()
reset the collection (resize(0))
Definition: bmxor.h:641
void build(const BMATR &bmatr)
Reset and build vector of references from a basic bit-matrix all NULL rows are skipped,...
Definition: bmxor.h:716
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
Definition: bmxor.h:636
bm::heap_vector< bvector_type_const_ptr, bv_allocator_type, true > bvptr_vector_type
Definition: bmxor.h:771
size_type size() const noexcept
Get reference list size.
Definition: bmxor.h:660
size_type get_row_idx(size_type idx) const noexcept
Get reference row index by the index in this ref-vector.
Definition: bmxor.h:667
void resize_xor_matrix(matrix_chain_type &matr, size_type total_blocks) const
Utility function to resize matrix based on number of vectors and blocks.
Definition: bmxor.h:746
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:630
void add_sparse_vector(const SV &sv)
Add bit-transposed sparse vector as a bit-matrix.
Definition: bmxor.h:739
bv_plane_vector_type ref_bvects_rows_
reference vector row idxs
Definition: bmxor.h:777
unsigned rows_acc_
total rows accumulator
Definition: bmxor.h:775
bvptr_vector_type ref_bvects_
reference vector pointers
Definition: bmxor.h:776
const value_type * row(size_type row_idx) const noexcept
Definition: bmbuffer.h:802
size_type cols() const noexcept
Definition: bmbuffer.h:757
void resize(size_type rows_in, size_type cols_in, bool copy_content=true)
Definition: bmbuffer.h:759
value_type * data() const noexcept
Definition: bmbuffer.h:381
void resize(size_type new_size)
vector resize
Definition: bmbuffer.h:462
value_type & at(size_type pos)
Definition: bmbuffer.h:400
size_type size() const noexcept
Definition: bmbuffer.h:421
void push_back(const value_type &v)
push new element to the back of the vector
Definition: bmbuffer.h:534
XOR scanner to search for complement-similarities in collections of bit-vectors.
Definition: bmxor.h:820
const bv_ref_vector_type * ref_vect_
ref.vect for XOR filter
Definition: bmxor.h:1001
bm::bv_ref_vector< BV > bv_ref_vector_type
Definition: bmxor.h:822
bm::heap_vector< unsigned, bv_allocator_type, true > bv_bcgc_vector_type
Definition: bmxor.h:836
bm::id64_t get_xor_digest() const noexcept
Definition: bmxor.h:952
void sync_nb_vect()
Sync TEMP vector size.
Definition: bmxor.h:1732
const bm::word_t * get_ref_block(size_type ri, unsigned i, unsigned j) const noexcept
Return block from the reference vector [vect_idx, block_i, block_j].
Definition: bmxor.h:974
bvector_type::size_type size_type
Definition: bmxor.h:825
bm::xor_complement_match search_best_xor_mask(const bm::word_t *s_block, size_type ri, size_type ridx_from, size_type ridx_to, unsigned i, unsigned j, bm::word_t *tx_block, const bm::xor_sim_params &params)
Scan for all candidate bit-blocks to find mask or match.
Definition: bmxor.h:1246
bm::block_waves_xor_descr & get_descr() noexcept
Definition: bmxor.h:960
bv_bcgc_vector_type nb_bc_vect_
Definition: bmxor.h:1004
bool validate_xor(const bm::word_t *xor_block) const noexcept
Check if XOR transform simplified block enough for compressibility objective.
bool compute_sim_model(xor_sim_model< BV > &sim_model, const bv_ref_vector_type &ref_vect, const bm::xor_sim_params &params)
Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors.
Definition: bmxor.h:1440
size_type found_ridx_
match vector (in references)
Definition: bmxor.h:1022
const bm::word_t * found_block_xor_
Definition: bmxor.h:1023
void apply_xor_match_vector(bm::word_t *target_xor_block, const bm::word_t *s_block, size_type s_ri, const match_pairs_vector_type &pm_vect, unsigned i, unsigned j) const noexcept
XOR all match blocks to target using their digest masks.
Definition: bmxor.h:1607
bm::bit_block_t xor_tmp_block_
Definition: bmxor.h:999
bv_xdescr_vector_type nb_xdescr_vect_
Definition: bmxor.h:1005
bm::id64_t x_d64_
search digest
Definition: bmxor.h:1021
unsigned s_bc_
bitcount
Definition: bmxor.h:1013
size_type found_ridx() const noexcept
Definition: bmxor.h:942
void compute_xor_complexity_descr(const bm::word_t *block, bm::id64_t block_d64, const bm::word_t *xor_block, bm::block_waves_xor_descr &x_descr, bm::block_xor_match_descr &xmd) const noexcept
Compute reference complexity descriptor based on XOR vector.
Definition: bmxor.h:1070
unsigned s_block_best_metric_
s-block orig.metric
Definition: bmxor.h:1015
match_pairs_vector_type & get_match_pairs() noexcept
Definition: bmxor.h:969
bv_bcgc_vector_type nb_gc_vect_
Definition: bmxor.h:1003
size_type refine_match_chain()
Run a search to add possible XOR match chain additions.
Definition: bmxor.h:1425
match_pairs_vector_type chain_match_vect_
refined match pairs
Definition: bmxor.h:1028
bv_blocks_vector_type nb_blocks_vect_
pointers to temp blocks
Definition: bmxor.h:1002
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:824
xor_matches_vector_type match_vect_
vector of match descr
Definition: bmxor.h:1027
bm::block_waves_xor_descr x_descr_
XOR desriptor.
Definition: bmxor.h:1009
unsigned get_s_bc() const noexcept
Definition: bmxor.h:954
bv_allocator_type alloc_
allocator to produce blocks
Definition: bmxor.h:1007
bv_ref_vector_type::matrix_chain_type matrix_chain_type
Definition: bmxor.h:832
void deoptimize_gap_blocks(size_type nb, const xor_sim_params &params)
Deoptimize vertical slice of GAP blocks.
Definition: bmxor.h:1690
const bm::word_t * get_found_block() const noexcept
Definition: bmxor.h:949
void free_blocks() noexcept
Free the collection of temp blocks.
Definition: bmxor.h:1675
bm::heap_vector< bm::block_xor_match_descr, bv_allocator_type, true > xor_matches_vector_type
Definition: bmxor.h:828
void compute_sim_model(typename xor_sim_model< BV >::matrix_chain_type &sim_model_matr, size_type nb, size_type nb_rank, const bm::xor_sim_params &params)
Compute similarity model for block.
Definition: bmxor.h:1479
xor_matches_vector_type & get_match_vector() noexcept
Definition: bmxor.h:966
bm::xor_complement_match x_block_mtype_
metric type
Definition: bmxor.h:1018
bm::heap_vector< bm::block_waves_xor_descr, bv_allocator_type, true > bv_xdescr_vector_type
Definition: bmxor.h:839
void compute_s_block_stats(const bm::word_t *block) noexcept
Compute statistics for the r-(or s-) block.
Definition: bmxor.h:1057
void get_s_block_stats(size_type ri) noexcept
Get statistics for the r-(or s-) block.
Definition: bmxor.h:1045
void set_ref_vector(const bv_ref_vector_type *ref_vect) noexcept
Definition: bmxor.h:850
BV bvector_type
Definition: bmxor.h:823
bm::xor_complement_match get_best_match_type() const noexcept
Return best match type of a found block.
Definition: bmxor.h:946
static bm::xor_complement_match best_metric(unsigned bc, unsigned gc, unsigned *best_metric) noexcept
Definition: bmxor.h:1641
const bv_ref_vector_type & get_ref_vector() const noexcept
Definition: bmxor.h:853
unsigned get_s_block_best() const noexcept
Definition: bmxor.h:956
unsigned get_x_best_metric() const noexcept
Definition: bmxor.h:951
bool compute_sim_model(xor_sim_model< BV > &sim_model, const bm::xor_sim_params &params)
Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors.
Definition: bmxor.h:1454
unsigned x_best_metric_
min(gc, bc, ibc)
Definition: bmxor.h:1017
bm::heap_vector< bm::match_pair, bv_allocator_type, true > match_pairs_vector_type
Definition: bmxor.h:830
bm::heap_vector< bm::word_t *, bv_allocator_type, true > bv_blocks_vector_type
Definition: bmxor.h:834
unsigned get_s_gc() const noexcept
Definition: bmxor.h:955
unsigned s_gc_
gap count
Definition: bmxor.h:1014
static unsigned char depth[2 *(256+1+29)+1]
unsigned word_bitcount64(bm::id64_t x) noexcept
Definition: bmutil.h:605
bm::id64_t bit_block_xor(bm::word_t *dst, const bm::word_t *src) noexcept
Plain bitblock XOR operation. Function does not analyse availability of source and destination blocks...
Definition: bmfunc.h:8434
bm::id_t word_bitcount(bm::id_t w) noexcept
Definition: bmutil.h:582
void bit_block_xor_change64(const bm::word_t *s_block, const bm::word_t *ref_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
Definition: bmxor.h:151
void bit_block_xor_change32(const bm::word_t *block, const bm::word_t *xor_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
Definition: bmxor.h:96
bm::id64_t calc_block_digest0(const bm::word_t *const block) noexcept
Compute digest for 64 non-zero areas.
Definition: bmfunc.h:1120
unsigned bit_count_min_unroll(const bm::word_t *block, const bm::word_t *block_end) noexcept
Bitcount for bit block without agressive unrolling.
Definition: bmfunc.h:4996
void gap_convert_to_bitset(unsigned *dest, const T *buf, unsigned len=0) noexcept
GAP block to bitblock conversion.
Definition: bmfunc.h:4475
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returs GAP block length.
Definition: bmfunc.h:1603
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
#include<zmmintrin.h>
Definition: bm.h:78
xor_complement_match
XOR complementarity type between 2 blocks.
Definition: bmxor.h:81
@ e_no_xor_match
Definition: bmxor.h:82
@ e_xor_match_GC
Definition: bmxor.h:83
@ e_xor_match_BC
Definition: bmxor.h:84
@ e_xor_match_EQ
Definition: bmxor.h:86
@ e_xor_match_iBC
Definition: bmxor.h:85
const unsigned set_block_digest_wave_size
Definition: bmconst.h:67
unsigned int word_t
Definition: bmconst.h:39
unsigned char check_pair_vect_vbr(const BMChain &mchain, const RVect &ref_vect)
Check effective bit-rate for the XOR encode vector.
Definition: bmxor.h:404
void compute_s_block_descr(const bm::word_t *block, block_waves_xor_descr &x_descr, unsigned *s_gc, unsigned *s_bc) noexcept
Compute reference (non-XOR) 64-dim complexity descriptor for the s-block.
Definition: bmxor.h:430
void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) noexcept
Recalc linear bvector block index into 2D matrix coordinates.
Definition: bmfunc.h:180
unsigned bit_block_change64(const bm::word_t *in_block, unsigned size) noexcept
Definition: bmfunc.h:5203
VT::size_type greedy_refine_match_vector(PVT &match_pairs_vect, VT &match_vect, typename VT::size_type best_ref_idx, bm::id64_t d64, bm::xor_complement_match match_type)
Greedy algorithm to find additional matches improving the inital best match block on its match type.
Definition: bmxor.h:327
unsigned bit_block_change32(const bm::word_t *block, unsigned size) noexcept
Definition: bmfunc.h:5161
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
Definition: bmutil.h:335
void bit_block_change_bc(const bm::word_t *block, unsigned *gc, unsigned *bc) noexcept
Definition: bmfunc.h:5252
unsigned long long int id64_t
Definition: bmconst.h:35
const unsigned block_waves
Definition: bmconst.h:66
bool block_find_first_diff(const bm::word_t *blk, const bm::word_t *arg_blk, unsigned *pos) noexcept
Find first bit which is different between two blocks (GAP or bit)
Definition: bmfunc.h:9268
unsigned short gap_word_t
Definition: bmconst.h:78
void bit_block_xor_change(const bm::word_t *block, const bm::word_t *xor_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
Definition: bmxor.h:207
bm::id_t bvector_size_type
Definition: bm.h:103
const unsigned gap_max_bits
Definition: bmconst.h:81
unsigned long long bmi_blsi_u64(unsigned long long w)
Definition: bmutil.h:345
const struct ncbi::grid::netcache::search::fields::SIZE size
EIPRangeType t
Definition: ncbi_localip.c:101
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static unsigned cnt[256]
#define VT(vt)
std::vector< bm::id64_t > ref_vect
Definition: stress64.cpp:365
XOR match chain.
Definition: bmxor.h:290
bool operator==(const block_match_chain &bmc) const noexcept
Definition: bmxor.h:297
BLOCK_IDX nb
Definition: bmxor.h:291
unsigned chain_size
Definition: bmxor.h:292
bm::id64_t xor_d64[64]
Definition: bmxor.h:294
unsigned ref_idx[64]
Definition: bmxor.h:293
bm::xor_complement_match match
Definition: bmxor.h:295
Structure to compute XOR gap-count profile by sub-block waves.
Definition: bmxor.h:230
unsigned short sb_bc[bm::block_waves]
BIT counts.
Definition: bmxor.h:233
unsigned short sb_gc[bm::block_waves]
GAP counts.
Definition: bmxor.h:232
unsigned short sb_xor_gc[bm::block_waves]
XOR-mask GAP count.
Definition: bmxor.h:236
unsigned short sb_xor_bc[bm::block_waves]
XOR-mask GAP count.
Definition: bmxor.h:237
Capture the XOR filter results (xor block against ref.block)
Definition: bmxor.h:245
size_type ref_idx
reference vector index
Definition: bmxor.h:250
bm::id64_t xor_d64
recorded digest
Definition: bmxor.h:251
bvector_size_type size_type
Definition: bmxor.h:246
unsigned block_gain
XOR filter improvement (best)
Definition: bmxor.h:249
bm::id64_t ibc_d64
Definition: bmxor.h:256
bm::xor_complement_match match_type
match type
Definition: bmxor.h:248
XOR match pair.
Definition: bmxor.h:274
bvector_size_type ref_idx
reference vector index
Definition: bmxor.h:275
bm::id64_t xor_d64
recorded digest
Definition: bmxor.h:276
match_pair(bvector_size_type idx, bm::id64_t d64)
Definition: bmxor.h:279
XOR similarity model.
Definition: bmxor.h:791
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
Definition: bmxor.h:801
bm::block_match_chain< size_type > block_match_chain_type
Definition: bmxor.h:798
bvector_type::size_type size_type
Definition: bmxor.h:794
matrix_chain_type matr
model matrix
Definition: bmxor.h:804
bvector_type bv_blocks
blocks digest
Definition: bmxor.h:805
bvector_type::allocator_type bv_allocator_type
Definition: bmxor.h:795
Parameters for XOR similarity search.
Definition: bmxor.h:59
unsigned stop_gain
Definition: bmxor.h:62
unsigned min_lookup_depth
Definition: bmxor.h:60
unsigned min_gaps
Definition: bmxor.h:64
unsigned max_lookup_depth
Definition: bmxor.h:61
float target_gain_ratio
Definition: bmxor.h:63
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:28 2024 by modify_doxy.py rev. 669887