NCBI C++ ToolKit
bmrandom.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef BMRANDOM__H__INCLUDED__
2 #define BMRANDOM__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmrandom.h
22  \brief Generation of random subset
23 */
24 
25 #ifndef BM__H__INCLUDED__
26 // BitMagic utility headers do not include main "bm.h" declaration
27 // #include "bm.h" or "bm64.h" explicitly
28 # error missing include (bm.h or bm64.h)
29 #endif
30 
31 
32 #include "bmfunc.h"
33 #include "bmdef.h"
34 
35 #include <stdlib.h>
36 #include <random>
37 #include <algorithm>
38 
39 namespace bm
40 {
41 
42 
43 /*!
44  Class implements algorithm for random subset generation.
45 
46  Implemented method tries to be fair, but doesn't guarantee
47  true randomeness or absense of bias.
48 
49  Performace note:
50  Class holds temporary buffers and variables, so it is recommended to
51  re-use instances over multiple calls.
52 
53  \ingroup setalgo
54 */
55 template<class BV>
57 {
58 public:
59  typedef BV bvector_type;
60  typedef typename BV::size_type size_type;
61 public:
62  random_subset();
64 
65  /// Get random subset of input vector
66  ///
67  /// @param bv_out - destination vector
68  /// @param bv_in - input vector
69  /// @param sample_count - number of bits to pick
70  ///
71  void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72 
73 
74 private:
75  typedef
76  typename BV::blocks_manager_type blocks_manager_type;
77 
78 private:
79  /// simple picking algorithm for small number of items
80  /// in [first,last] range
81  ///
82  void simple_pick(BV& bv_out,
83  const BV& bv_in,
84  size_type sample_count,
86  size_type last);
87 
88  void get_subset(BV& bv_out,
89  const BV& bv_in,
90  size_type in_count,
91  size_type sample_count);
92 
93  void get_block_subset(bm::word_t* blk_out,
94  const bm::word_t* blk_src,
95  unsigned count);
96  static
97  unsigned process_word(bm::word_t* blk_out,
98  const bm::word_t* blk_src,
99  unsigned nword,
100  unsigned take_count) BMNOEXCEPT;
101 
102  static
103  void get_random_array(bm::word_t* blk_out,
105  unsigned bit_list_size,
106  unsigned count);
107  static
108  unsigned compute_take_count(unsigned bc,
109  size_type in_count, size_type sample_count) BMNOEXCEPT;
110 
111 
112 private:
115 private:
116  typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117  bvector_type bv_nb_; ///< blocks vector
120 };
121 
122 
123 ///////////////////////////////////////////////////////////////////
124 
125 
126 
127 template<class BV>
129 {
130  sub_block_ = new bm::word_t[bm::set_block_size];
131 }
132 
133 template<class BV>
135 {
136  delete [] sub_block_;
137 }
138 
139 template<class BV>
140 void random_subset<BV>::sample(BV& bv_out,
141  const BV& bv_in,
142  size_type sample_count)
143 {
144  if (sample_count == 0)
145  {
146  bv_out.clear(true);
147  return;
148  }
149 
150  rsi_.init();
151  bv_in.build_rs_index(&rsi_, &bv_nb_);
152 
153  size_type in_count = rsi_.count();
154  if (in_count <= sample_count)
155  {
156  bv_out = bv_in;
157  return;
158  }
159 
160  float pick_ratio = float(sample_count) / float(in_count);
161  if (pick_ratio < 0.054f)
162  {
164  bool b = bv_in.find_range(first, last);
165  if (!b)
166  return;
167  simple_pick(bv_out, bv_in, sample_count, first, last);
168  return;
169  }
170 
171  if (sample_count > in_count/2)
172  {
173  // build the complement vector and subtract it
174  BV tmp_bv;
175  size_type delta_count = in_count - sample_count;
176 
177  get_subset(tmp_bv, bv_in, in_count, delta_count);
178  bv_out.bit_sub(bv_in, tmp_bv);
179  return;
180  }
181  get_subset(bv_out, bv_in, in_count, sample_count);
182 }
183 
184 template<class BV>
186  const BV& bv_in,
187  size_type sample_count,
189  size_type last)
190 {
191  bv_out.clear(true);
192 
193  std::random_device rd;
194  #ifdef BM64ADDR
195  std::mt19937_64 mt_rand(rd());
196  #else
197  std::mt19937 mt_rand(rd());
198  #endif
199  std::uniform_int_distribution<size_type> dist(first, last);
200 
201  while (sample_count)
202  {
203  size_type fidx;
204  size_type ridx = dist(mt_rand); // generate random position
205 
206  BM_ASSERT(ridx >= first && ridx <= last);
207 
208  bool b = bv_in.find(ridx, fidx); // find next valid bit after random
209  BM_ASSERT(b);
210  if (b)
211  {
212  // set true if was false
213  bool is_set = bv_out.set_bit_conditional(fidx, true, false);
214  sample_count -= is_set;
215  while (!is_set) // find next valid (and not set) bit
216  {
217  ++fidx;
218  // searching always left to right may create a bias...
219  b = bv_in.find(fidx, fidx);
220  if (!b)
221  break;
222  if (fidx > last)
223  break;
224  is_set = bv_out.set_bit_conditional(fidx, true, false);
225  sample_count -= is_set;
226  } // while
227  }
228  } // while
229 }
230 
231 
232 template<class BV>
234  const BV& bv_in,
235  size_type in_count,
236  size_type sample_count)
237 {
238  bv_out.clear(true);
239  bv_out.resize(bv_in.size());
240 
241  const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
242  blocks_manager_type& bman_out = bv_out.get_blocks_manager();
243 
244  bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
245 
246  size_type first_nb, last_nb;
247  bool b = bv_nb_.find_range(first_nb, last_nb);
248  BM_ASSERT(b);
249  if (!b)
250  return;
251 
252  std::random_device rd;
253  #ifdef BM64ADDR
254  std::mt19937_64 mt_rand(rd());
255  #else
256  std::mt19937 mt_rand(rd());
257  #endif
258  std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
259 
260  size_type curr_sample_count = sample_count;
261  for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
262  {
263  // pick block at random
264  //
265  size_type nb;
266  size_type ridx = dist_nb(mt_rand); // generate random block idx
267  BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
268 
269  b = bv_nb_.find(ridx, nb); // find next valid nb
270  if (!b)
271  {
272  b = bv_nb_.find(first_nb, nb);
273  if (!b)
274  {
275  //b = bv_nb_.find(first_nb, nb);
276  BM_ASSERT(!bv_nb_.any());
277  //BM_ASSERT(b);
278  // TODO: seems to be some rounding error, needs to be addressed
279  return; // cannot find block
280  }
281  }
282  bv_nb_.clear_bit_no_check(nb); // remove from blocks list
283 
284  // calculate proportinal sample count
285  //
286  unsigned bc = rsi_.count(nb);
287  BM_ASSERT(bc && (bc <= 65536));
288  take_count = compute_take_count(bc, in_count, sample_count);
289  if (take_count > curr_sample_count)
290  take_count = unsigned(curr_sample_count);
291  BM_ASSERT(take_count);
292  if (!take_count)
293  continue;
294  {
295  unsigned i0, j0;
296  bm::get_block_coord(nb, i0, j0);
297  const bm::word_t* blk_src = bman_in.get_block(i0, j0);
298  BM_ASSERT(blk_src);
299 
300  // allocate target block
301  bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
302  BM_ASSERT(!blk_out);
303  if (blk_out)
304  {
305  blk_out = bman_out.deoptimize_block(nb);
306  }
307  else
308  {
309  blk_out = bman_out.get_allocator().alloc_bit_block();
310  bman_out.set_block(nb, blk_out);
311  }
312  if (take_count == bc) // whole block take (strange)
313  {
314  // copy the whole src block
315  if (BM_IS_GAP(blk_src))
316  bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
317  else
318  bm::bit_block_copy(blk_out, blk_src);
319  continue;
320  }
321  bm::bit_block_set(blk_out, 0);
322  if (bc < 4096) // use array shuffle
323  {
324  unsigned arr_len;
325  // convert source block to bit-block
326  if (BM_IS_GAP(blk_src))
327  {
328  arr_len = bm::gap_convert_to_arr(bit_list_,
329  BMGAP_PTR(blk_src),
331  }
332  else // bit-block
333  {
334  arr_len = bm::bit_block_convert_to_arr(bit_list_, blk_src, 0);
335  }
336  BM_ASSERT(arr_len);
337  get_random_array(blk_out, bit_list_, arr_len, take_count);
338  }
339  else // dense block
340  {
341  // convert source block to bit-block
342  if (BM_IS_GAP(blk_src))
343  {
344  bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
345  blk_src = tmp_block;
346  }
347  // pick random bits source block to target
348  get_block_subset(blk_out, blk_src, take_count);
349  }
350  }
351  } // for
352 }
353 
354 template<class BV>
356  unsigned bc,
357  size_type in_count,
358  size_type sample_count) BMNOEXCEPT
359 {
360  BM_ASSERT(sample_count);
361  float block_percent = float(bc) / float(in_count);
362  float bits_to_take = float(sample_count) * block_percent;
363  bits_to_take += 0.99f;
364  unsigned to_take = unsigned(bits_to_take);
365  if (to_take > bc)
366  to_take = bc;
367  if (!to_take)
368  to_take = unsigned(sample_count);
369 
370  return to_take;
371 }
372 
373 
374 template<class BV>
376  const bm::word_t* blk_src,
377  unsigned take_count)
378 {
379  for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
380  {
381  // pick random scan start and scan direction
382  //
383  unsigned i = unsigned(rand()) % bm::set_block_size;
384  unsigned new_count;
385 
386  for (; i < bm::set_block_size && take_count; ++i)
387  {
388  if (blk_src[i] && (blk_out[i] != blk_src[i]))
389  {
390  new_count = process_word(blk_out, blk_src, i, take_count);
391  take_count -= new_count;
392  }
393  } // for i
394 
395  } // for
396  // if masked scan did not produce enough results..
397  //
398  if (take_count)
399  {
400  // Find all vacant bits: do logical (src SUB out)
401  for (unsigned i = 0; i < bm::set_block_size; ++i)
402  {
403  sub_block_[i] = blk_src[i] & ~blk_out[i];
404  }
405  // now transform vacant bits to array, then pick random elements
406  //
407  unsigned arr_len =
408  bm::bit_block_convert_to_arr(bit_list_, sub_block_, 0);
409  // bm::gap_max_bits,
410  // bm::gap_max_bits,
411  // 0);
412  BM_ASSERT(arr_len);
413  get_random_array(blk_out, bit_list_, arr_len, take_count);
414  }
415 }
416 
417 template<class BV>
419  const bm::word_t* blk_src,
420  unsigned nword,
421  unsigned take_count) BMNOEXCEPT
422 {
423  unsigned new_bits, mask;
424  do
425  {
426  mask = unsigned(rand());
427  mask ^= mask << 16;
428  } while (mask == 0);
429 
430  std::random_device rd;
431  #ifdef BM64ADDR
432  std::mt19937_64 mt_rand(rd());
433  #else
434  std::mt19937 mt_rand(rd());
435  #endif
436  unsigned src_rand = blk_src[nword] & mask;
437  new_bits = src_rand & ~blk_out[nword];
438  if (new_bits)
439  {
440  unsigned new_count = bm::word_bitcount(new_bits);
441 
442  // check if we accidentally picked more bits than needed
443  if (new_count > take_count)
444  {
445  BM_ASSERT(take_count);
446 
447  unsigned char blist[64];
448  unsigned arr_size = bm::bitscan(new_bits, blist);
449  BM_ASSERT(arr_size == new_count);
450  std::shuffle(blist, blist + arr_size, mt_rand);
451  unsigned value = 0;
452  for (unsigned j = 0; j < take_count; ++j)
453  {
454  value |= (1u << blist[j]);
455  }
456  new_bits = value;
457  new_count = take_count;
458 
459  BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
460  BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
461  }
462 
463  blk_out[nword] |= new_bits;
464  return new_count;
465  }
466  return 0;
467 }
468 
469 
470 template<class BV>
473  unsigned bit_list_size,
474  unsigned count)
475 {
476  std::random_device rd;
477  #ifdef BM64ADDR
478  std::mt19937_64 mt_rand(rd());
479  #else
480  std::mt19937 mt_rand(rd());
481  #endif
482  std::shuffle(bit_list, bit_list + bit_list_size, mt_rand);
483  for (unsigned i = 0; i < count; ++i)
484  {
485  bm::set_bit(blk_out, bit_list[i]);
486  }
487 }
488 
489 } // namespace
490 
491 
492 #endif
ncbi::TMaskedQueryRegions mask
Definitions(internal)
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BMGAP_PTR(ptr)
Definition: bmdef.h:189
#define BM_IS_GAP(ptr)
Definition: bmdef.h:191
#define BM_ASSERT
Definition: bmdef.h:139
Bit manipulation primitives (internal)
void get_block_subset(bm::word_t *blk_out, const bm::word_t *blk_src, unsigned count)
Definition: bmrandom.h:375
static unsigned process_word(bm::word_t *blk_out, const bm::word_t *blk_src, unsigned nword, unsigned take_count) noexcept
Definition: bmrandom.h:418
random_subset & operator=(const random_subset &)
bvector_type bv_nb_
blocks vector
Definition: bmrandom.h:117
void get_subset(BV &bv_out, const BV &bv_in, size_type in_count, size_type sample_count)
Definition: bmrandom.h:233
bm::word_t * sub_block_
Definition: bmrandom.h:119
bm::gap_word_t bit_list_[bm::gap_max_bits]
Definition: bmrandom.h:118
BV::blocks_manager_type blocks_manager_type
Definition: bmrandom.h:76
static void get_random_array(bm::word_t *blk_out, bm::gap_word_t *bit_list, unsigned bit_list_size, unsigned count)
Definition: bmrandom.h:471
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition: bmrandom.h:140
static unsigned compute_take_count(unsigned bc, size_type in_count, size_type sample_count) noexcept
Definition: bmrandom.h:355
void simple_pick(BV &bv_out, const BV &bv_in, size_type sample_count, size_type first, size_type last)
simple picking algorithm for small number of items in [first,last] range
Definition: bmrandom.h:185
bvector_type::rs_index_type rsi_
RS index (block counts)
Definition: bmrandom.h:116
BV::size_type size_type
Definition: bmrandom.h:60
random_subset(const random_subset &)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
bm::id_t word_bitcount(bm::id_t w) noexcept
Definition: bmutil.h:582
unsigned bit_block_convert_to_arr(T *dest, const unsigned *src, bool inverted) noexcept
Convert bit block into an array of ints corresponding to 1 bits.
Definition: bmfunc.h:9064
unsigned short bitscan(V w, B *bits) noexcept
Templated Bitscan with dynamic dispatch for best type.
Definition: bmfunc.h:769
void bit_block_set(bm::word_t *dst, bm::word_t value) noexcept
Bitblock memset operation.
Definition: bmfunc.h:4456
void bit_block_copy(bm::word_t *dst, const bm::word_t *src) noexcept
Bitblock copy operation.
Definition: bmfunc.h:6777
void set_bit(unsigned *dest, unsigned bitpos) noexcept
Set 1 bit in a block.
Definition: bmfunc.h:3721
unsigned bit_list(T w, B *bits) noexcept
Unpacks word into list of ON bit indexes.
Definition: bmfunc.h:595
void gap_convert_to_bitset(unsigned *dest, const T *buf, unsigned len=0) noexcept
GAP block to bitblock conversion.
Definition: bmfunc.h:4475
D gap_convert_to_arr(D *dest, const T *buf, unsigned dest_len, bool invert=false) noexcept
Convert gap block into array of ints corresponding to 1 bits.
Definition: bmfunc.h:4942
int i
#include<zmmintrin.h>
Definition: bm.h:78
unsigned int word_t
Definition: bmconst.h:39
void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) noexcept
Recalc linear bvector block index into 2D matrix coordinates.
Definition: bmfunc.h:180
const unsigned set_block_size
Definition: bmconst.h:55
unsigned short gap_word_t
Definition: bmconst.h:78
const unsigned gap_max_bits
Definition: bmconst.h:81
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define count
Modified on Wed Sep 04 14:58:32 2024 by modify_doxy.py rev. 669887