1 #ifndef BMXORFUNC__H__INCLUDED__
2 #define BMXORFUNC__H__INCLUDED__
104 unsigned gap_count = 1;
105 unsigned bit_count = 0;
108 w = w0 = *block ^ *xor_block;
111 const int w_shift =
int(
sizeof(w) * 8 - 1);
114 gap_count -= (w_prev = (w0 >> w_shift));
117 for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
119 w = w0 = *block ^ *xor_block;
124 gap_count -= !w_prev;
133 gap_count -= (w0 >> w_shift);
134 gap_count -= !(w_prev ^ w_l);
136 w_prev = (w0 >> w_shift);
159 unsigned gap_count = 1;
160 unsigned bit_count = 0;
166 w = w0 = *block ^ *xor_block;
169 const int w_shift =
int(
sizeof(w) * 8 - 1);
172 gap_count -= unsigned(w_prev = (w0 >> w_shift));
175 for (++block, ++xor_block; block < block_end; ++block, ++xor_block)
177 w = w0 = *block ^ *xor_block;
182 gap_count -= !w_prev;
190 gap_count -= unsigned(w0 >> w_shift);
191 gap_count -= !(w_prev ^ w_l);
192 w_prev = (w0 >> w_shift);
213 #ifdef VECT_BLOCK_XOR_CHANGE
325 template<
typename PVT,
typename VT>
326 typename VT::size_type
329 typename VT::size_type best_ref_idx,
334 match_pairs_vect.resize(0);
340 typename VT::size_type sz = match_vect.size();
341 for (
typename VT::size_type
i = 0; (
i < sz) && (d64_acc != ~0ull); ++
i)
344 if (xmd.
ref_idx == best_ref_idx)
360 const unsigned min_gain_cut_off = 50;
361 for (
typename VT::size_type
i = 0; (
i < sz) && (d64_acc != ~0ull); ++
i)
372 if (xmd.
gc_gain > min_gain_cut_off)
373 d64_new = ~d64_acc & xmd.
gc_d64;
376 if (xmd.
bc_gain > min_gain_cut_off)
377 d64_new = ~d64_acc & xmd.
bc_d64;
380 if (xmd.
ibc_gain > min_gain_cut_off)
381 d64_new = ~d64_acc & xmd.
ibc_d64;
395 return match_pairs_vect.size();
403 template<
typename BMChain,
typename RVect>
407 for (
size_t i = 0;
i < mchain.chain_size; ++
i)
443 #if defined(VECT_BLOCK_CHANGE)
455 x_descr.sb_bc[
i] = (
unsigned short) bc;
464 x_descr.sb_gc[
i] = (
unsigned short) gc;
493 #ifdef VECT_BIT_BLOCK_XOR
508 for ( ;sub_block < sub_block_end; )
510 t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
511 t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
512 t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
513 t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
514 t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
519 for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
521 t_sub_block[0] = sub_block[0];
522 t_sub_block[1] = sub_block[1];
523 t_sub_block[2] = sub_block[2];
524 t_sub_block[3] = sub_block[3];
533 const bm::word_t* xor_sub_block = xor_block + off;
534 for (; sub_block < sub_block_end; )
536 t_sub_block[0] = sub_block[0] ^ xor_sub_block[0];
537 t_sub_block[1] = sub_block[1] ^ xor_sub_block[1];
538 t_sub_block[2] = sub_block[2] ^ xor_sub_block[2];
539 t_sub_block[3] = sub_block[3] ^ xor_sub_block[3];
540 t_sub_block+=4; sub_block+=4; xor_sub_block+=4;
545 for (; sub_block < sub_block_end; t_sub_block+=4, sub_block+=4)
547 t_sub_block[0] = sub_block[0];
548 t_sub_block[1] = sub_block[1];
549 t_sub_block[2] = sub_block[2];
550 t_sub_block[3] = sub_block[3];
576 #ifdef VECT_BIT_BLOCK_XOR_2WAY
589 for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
591 t_sub_block[0] ^= xor_sub_block[0];
592 t_sub_block[1] ^= xor_sub_block[1];
593 t_sub_block[2] ^= xor_sub_block[2];
594 t_sub_block[3] ^= xor_sub_block[3];
597 const bm::word_t* xor_sub_block = xor_block + off;
600 for (; t_sub_block < t_sub_block_end; t_sub_block+=4, xor_sub_block+=4)
602 t_sub_block[0] ^= xor_sub_block[0];
603 t_sub_block[1] ^= xor_sub_block[1];
604 t_sub_block[2] ^= xor_sub_block[2];
605 t_sub_block[3] ^= xor_sub_block[3];
622 template<
typename BV>
707 bv_blocks.optimize(tb);
715 template<
class BMATR>
725 template<
typename BMATR>
789 template<
typename BV>
818 template<
typename BV>
874 unsigned i,
unsigned j,
1044 template<typename BV>
1056 template<
typename BV>
1069 template<
typename BV>
1079 xmd.xor_gc = xmd.xor_bc = 0;
1087 const bm::word_t* xor_sub_block = xor_block + off;
1089 unsigned xor_gc, xor_bc;
1093 x_descr.sb_xor_bc[
i] = (
unsigned short)xor_bc;
1097 bm::word_t w_l = (sub_block[-1] ^ xor_sub_block[-1]);
1098 bm::word_t w_r = (sub_block[0] ^ xor_sub_block[0]) & 1;
1100 xor_gc -= (w_l == w_r);
1102 x_descr.sb_xor_gc[
i] = (
unsigned short)xor_gc;
1104 xmd.xor_bc += xor_bc;
1105 xmd.xor_gc += xor_gc;
1111 unsigned block_gc_gain(0), block_bc_gain(0), block_ibc_gain(0);
1112 bm::id64_t gc_digest(0), bc_digest(0), ibc_digest(0);
1121 unsigned xor_gc = x_descr.sb_xor_gc[
i];
1125 block_gc_gain += x_descr.sb_gc[
i];
1127 else if (xor_gc < x_descr.sb_gc[
i])
1130 block_gc_gain += (x_descr.sb_gc[
i] - xor_gc);
1132 unsigned xor_bc = x_descr.sb_xor_bc[
i];
1133 if (xor_bc < x_descr.sb_bc[
i])
1136 block_bc_gain += (x_descr.sb_bc[
i] - xor_bc);
1138 unsigned xor_ibc = wave_max_bits - xor_bc;
1139 unsigned wave_ibc = wave_max_bits - x_descr.sb_bc[
i];
1140 if (xor_ibc < wave_ibc)
1142 ibc_digest |= dmask;
1143 block_ibc_gain += (wave_ibc - xor_ibc);
1151 xmd.gc_d64 = gc_digest;
1152 xmd.bc_d64 = bc_digest;
1153 xmd.ibc_d64 = ibc_digest;
1155 xmd.gc_gain = block_gc_gain;
1156 xmd.bc_gain = block_bc_gain;
1157 xmd.ibc_gain = block_ibc_gain;
1162 if (!(block_gc_gain | block_bc_gain | block_ibc_gain))
1176 xmd.block_gain = 0; xmd.xor_d64 = 0;
1192 xmd.match_type =
best_metric(
unsigned(new_bc),
unsigned(new_gc), &best_m);
1193 switch (xmd.match_type)
1196 if (new_ibc < new_gc)
1198 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1202 xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1206 if (new_ibc < new_bc)
1208 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1212 xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1216 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1225 if (block_gc_gain >= block_bc_gain && block_gc_gain >= block_ibc_gain)
1227 xmd.block_gain = block_gc_gain; xmd.xor_d64 = gc_digest;
1230 if (block_bc_gain > block_gc_gain && block_bc_gain > block_ibc_gain)
1232 xmd.block_gain = block_bc_gain; xmd.xor_d64 = bc_digest;
1236 xmd.block_gain = block_ibc_gain; xmd.xor_d64 = ibc_digest;
1244 template<
typename BV>
1250 unsigned i,
unsigned j,
1265 unsigned best_block_gain = 0;
1303 unsigned gc_diff = r_gc - s_gc;
1304 if (gc_diff >= s_gc)
1358 const float bie_bits_per_int = 3.0f;
1359 const unsigned bie_limit =
1362 unsigned xor_bc, xor_gc;
1403 unsigned gain_min = unsigned(
sizeof(
char) +
sizeof(
unsigned));
1412 if (gain > gain_min)
1424 template<
typename BV>
1439 template<
typename BV>
1453 template<
typename BV>
1467 typename bvector_type::enumerator en(sim_model.
bv_blocks);
1468 for (
size_type col = 0; en.valid(); ++en, ++col)
1478 template<
typename BV>
1487 const float bie_bits_per_int = 3.0f;
1488 const unsigned bie_limit =
1537 if (d64 && d64 != ~0ULL)
1552 unsigned xor_bc, xor_gc;
1564 for (
size_type k = 0; k < chain_size; ++k)
1576 bmc.
ref_idx[0] = unsigned(ridx);
1580 bmc.
ref_idx[0] = unsigned(ridx);
1587 auto sz = pm_vect.
size();
1606 template<
typename BV>
1620 auto sz = pm_vect.size();
1639 template<
typename BV>
1674 template<
typename BV>
1678 for (
size_t i = 0;
i < sz; ++
i)
1682 alloc_.free_bit_block(blk);
1689 template<
typename BV>
1711 t_block =
alloc_.alloc_bit_block();
1731 template<
typename BV>
#define BM_DECLARE_TEMP_BLOCK(x)
#define VECT_BIT_BLOCK_XOR(t, src, src_xor, d)
#define VECT_BLOCK_CHANGE(block, size)
#define VECT_BIT_BLOCK_XOR_2WAY(t, src_xor, d)
#define VECT_BLOCK_XOR_CHANGE(block, xor_block, size, gc, bc)
#define IS_VALID_ADDR(addr)
Bit manipulation primitives (internal)
List of reference bit-vectors with their true index associations.
const bvector_type * get_bv(size_type idx) const noexcept
Get reference vector by the index in this ref-vector.
bvector_type::size_type size_type
void add_vectors(const BMATR &bmatr)
Append basic bit-matrix to the list of reference vectors.
bm::block_match_chain< size_type > block_match_chain_type
void fill_alloc_digest(bvector_type &bv_blocks) const
Fill block allocation digest for all vectors in the reference collection.
bm::heap_vector< std::size_t, bv_allocator_type, true > bv_plane_vector_type
static size_type not_found() noexcept
not-found value for find methods
bvector_type * bvector_type_ptr
bool build_nb_digest_and_xor_matrix(matrix_chain_type &matr, bvector_type &bv_blocks) const
Calculate blocks digest and resize XOR distance matrix based on total number of available blocks.
size_type find(std::size_t ref_idx) const noexcept
Find vector index by the reference index.
size_type find_bv(const bvector_type *bv) const noexcept
Find vector index by the pointer.
const bvector_type * bvector_type_const_ptr
void add(const bvector_type *bv, size_type ref_idx)
Add reference vector.
void reset()
reset the collection (resize(0))
void build(const BMATR &bmatr)
Reset and build vector of references from a basic bit-matrix all NULL rows are skipped,...
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
bm::heap_vector< bvector_type_const_ptr, bv_allocator_type, true > bvptr_vector_type
size_type size() const noexcept
Get reference list size.
size_type get_row_idx(size_type idx) const noexcept
Get reference row index by the index in this ref-vector.
void resize_xor_matrix(matrix_chain_type &matr, size_type total_blocks) const
Utility function to resize matrix based on number of vectors and blocks.
bvector_type::allocator_type bv_allocator_type
void add_sparse_vector(const SV &sv)
Add bit-transposed sparse vector as a bit-matrix.
bv_plane_vector_type ref_bvects_rows_
reference vector row idxs
unsigned rows_acc_
total rows accumulator
bvptr_vector_type ref_bvects_
reference vector pointers
const value_type * row(size_type row_idx) const noexcept
size_type cols() const noexcept
void resize(size_type rows_in, size_type cols_in, bool copy_content=true)
value_type * data() const noexcept
void resize(size_type new_size)
vector resize
value_type & at(size_type pos)
size_type size() const noexcept
void push_back(const value_type &v)
push new element to the back of the vector
buffer_type::size_type size_type
XOR scanner to search for complement-similarities in collections of bit-vectors.
const bv_ref_vector_type * ref_vect_
ref.vect for XOR filter
bm::bv_ref_vector< BV > bv_ref_vector_type
bm::heap_vector< unsigned, bv_allocator_type, true > bv_bcgc_vector_type
bm::id64_t get_xor_digest() const noexcept
void sync_nb_vect()
Sync TEMP vector size.
const bm::word_t * get_ref_block(size_type ri, unsigned i, unsigned j) const noexcept
Return block from the reference vector [vect_idx, block_i, block_j].
bvector_type::size_type size_type
bm::xor_complement_match search_best_xor_mask(const bm::word_t *s_block, size_type ri, size_type ridx_from, size_type ridx_to, unsigned i, unsigned j, bm::word_t *tx_block, const bm::xor_sim_params ¶ms)
Scan for all candidate bit-blocks to find mask or match.
bm::block_waves_xor_descr & get_descr() noexcept
bv_bcgc_vector_type nb_bc_vect_
bool validate_xor(const bm::word_t *xor_block) const noexcept
Check if XOR transform simplified block enough for compressibility objective.
bool compute_sim_model(xor_sim_model< BV > &sim_model, const bv_ref_vector_type &ref_vect, const bm::xor_sim_params ¶ms)
Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors.
size_type found_ridx_
match vector (in references)
const bm::word_t * found_block_xor_
void apply_xor_match_vector(bm::word_t *target_xor_block, const bm::word_t *s_block, size_type s_ri, const match_pairs_vector_type &pm_vect, unsigned i, unsigned j) const noexcept
XOR all match blocks to target using their digest masks.
bm::bit_block_t xor_tmp_block_
bv_xdescr_vector_type nb_xdescr_vect_
bm::id64_t x_d64_
search digest
size_type found_ridx() const noexcept
void compute_xor_complexity_descr(const bm::word_t *block, bm::id64_t block_d64, const bm::word_t *xor_block, bm::block_waves_xor_descr &x_descr, bm::block_xor_match_descr &xmd) const noexcept
Compute reference complexity descriptor based on XOR vector.
unsigned s_block_best_metric_
s-block orig.metric
match_pairs_vector_type & get_match_pairs() noexcept
bv_bcgc_vector_type nb_gc_vect_
size_type refine_match_chain()
Run a search to add possible XOR match chain additions.
match_pairs_vector_type chain_match_vect_
refined match pairs
bv_blocks_vector_type nb_blocks_vect_
pointers to temp blocks
bvector_type::allocator_type bv_allocator_type
xor_matches_vector_type match_vect_
vector of match descr
bm::block_waves_xor_descr x_descr_
XOR desriptor.
unsigned get_s_bc() const noexcept
bv_allocator_type alloc_
allocator to produce blocks
bv_ref_vector_type::matrix_chain_type matrix_chain_type
void deoptimize_gap_blocks(size_type nb, const xor_sim_params ¶ms)
Deoptimize vertical slice of GAP blocks.
const bm::word_t * get_found_block() const noexcept
void free_blocks() noexcept
Free the collection of temp blocks.
bm::heap_vector< bm::block_xor_match_descr, bv_allocator_type, true > xor_matches_vector_type
void compute_sim_model(typename xor_sim_model< BV >::matrix_chain_type &sim_model_matr, size_type nb, size_type nb_rank, const bm::xor_sim_params ¶ms)
Compute similarity model for block.
xor_matches_vector_type & get_match_vector() noexcept
bm::xor_complement_match x_block_mtype_
metric type
bm::heap_vector< bm::block_waves_xor_descr, bv_allocator_type, true > bv_xdescr_vector_type
void compute_s_block_stats(const bm::word_t *block) noexcept
Compute statistics for the r-(or s-) block.
void get_s_block_stats(size_type ri) noexcept
Get statistics for the r-(or s-) block.
void set_ref_vector(const bv_ref_vector_type *ref_vect) noexcept
bm::xor_complement_match get_best_match_type() const noexcept
Return best match type of a found block.
static bm::xor_complement_match best_metric(unsigned bc, unsigned gc, unsigned *best_metric) noexcept
const bv_ref_vector_type & get_ref_vector() const noexcept
unsigned get_s_block_best() const noexcept
unsigned get_x_best_metric() const noexcept
bool compute_sim_model(xor_sim_model< BV > &sim_model, const bm::xor_sim_params ¶ms)
Calculate matrix of best XOR match metrics per block for the attached collection of bit-vectors.
unsigned x_best_metric_
min(gc, bc, ibc)
bm::heap_vector< bm::match_pair, bv_allocator_type, true > match_pairs_vector_type
bm::heap_vector< bm::word_t *, bv_allocator_type, true > bv_blocks_vector_type
unsigned get_s_gc() const noexcept
static unsigned char depth[2 *(256+1+29)+1]
unsigned word_bitcount64(bm::id64_t x) noexcept
bm::id64_t bit_block_xor(bm::word_t *dst, const bm::word_t *src) noexcept
Plain bitblock XOR operation. Function does not analyse availability of source and destination blocks...
bm::id_t word_bitcount(bm::id_t w) noexcept
void bit_block_xor_change64(const bm::word_t *s_block, const bm::word_t *ref_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
void bit_block_xor_change32(const bm::word_t *block, const bm::word_t *xor_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
bm::id64_t calc_block_digest0(const bm::word_t *const block) noexcept
Compute digest for 64 non-zero areas.
unsigned bit_count_min_unroll(const bm::word_t *block, const bm::word_t *block_end) noexcept
Bitcount for bit block without agressive unrolling.
void gap_convert_to_bitset(unsigned *dest, const T *buf, unsigned len=0) noexcept
GAP block to bitblock conversion.
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returs GAP block length.
unsigned int
A callback function used to compare two keys in a database.
xor_complement_match
XOR complementarity type between 2 blocks.
const unsigned set_block_digest_wave_size
unsigned char check_pair_vect_vbr(const BMChain &mchain, const RVect &ref_vect)
Check effective bit-rate for the XOR encode vector.
void compute_s_block_descr(const bm::word_t *block, block_waves_xor_descr &x_descr, unsigned *s_gc, unsigned *s_bc) noexcept
Compute reference (non-XOR) 64-dim complexity descriptor for the s-block.
void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) noexcept
Recalc linear bvector block index into 2D matrix coordinates.
unsigned bit_block_change64(const bm::word_t *in_block, unsigned size) noexcept
VT::size_type greedy_refine_match_vector(PVT &match_pairs_vect, VT &match_vect, typename VT::size_type best_ref_idx, bm::id64_t d64, bm::xor_complement_match match_type)
Greedy algorithm to find additional matches improving the inital best match block on its match type.
unsigned bit_block_change32(const bm::word_t *block, unsigned size) noexcept
unsigned long long bmi_bslr_u64(unsigned long long w) noexcept
void bit_block_change_bc(const bm::word_t *block, unsigned *gc, unsigned *bc) noexcept
unsigned long long int id64_t
const unsigned block_waves
bool block_find_first_diff(const bm::word_t *blk, const bm::word_t *arg_blk, unsigned *pos) noexcept
Find first bit which is different between two blocks (GAP or bit)
unsigned short gap_word_t
void bit_block_xor_change(const bm::word_t *block, const bm::word_t *xor_block, unsigned size, unsigned *gc, unsigned *bc) noexcept
bm::id_t bvector_size_type
const unsigned gap_max_bits
unsigned long long bmi_blsi_u64(unsigned long long w)
const struct ncbi::grid::netcache::search::fields::SIZE size
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
std::vector< bm::id64_t > ref_vect
bool operator==(const block_match_chain &bmc) const noexcept
bm::xor_complement_match match
Structure to compute XOR gap-count profile by sub-block waves.
unsigned short sb_bc[bm::block_waves]
BIT counts.
unsigned short sb_gc[bm::block_waves]
GAP counts.
unsigned short sb_xor_gc[bm::block_waves]
XOR-mask GAP count.
unsigned short sb_xor_bc[bm::block_waves]
XOR-mask GAP count.
Capture the XOR filter results (xor block against ref.block)
size_type ref_idx
reference vector index
bm::id64_t xor_d64
recorded digest
bvector_size_type size_type
unsigned block_gain
XOR filter improvement (best)
bm::xor_complement_match match_type
match type
bvector_size_type ref_idx
reference vector index
bm::id64_t xor_d64
recorded digest
match_pair(bvector_size_type idx, bm::id64_t d64)
bm::dynamic_heap_matrix< block_match_chain_type, bv_allocator_type > matrix_chain_type
bm::block_match_chain< size_type > block_match_chain_type
bvector_type::size_type size_type
matrix_chain_type matr
model matrix
bvector_type bv_blocks
blocks digest
bvector_type::allocator_type bv_allocator_type
Parameters for XOR similarity search.
unsigned min_lookup_depth
unsigned max_lookup_depth