NCBI C++ ToolKit
bmstrsparsevec.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef BMSTRSPARSEVEC__H__INCLUDED__
2 #define BMSTRSPARSEVEC__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit: http://bitmagic.io
19 */
20 
21 /*! \file bmstrsparsevec.h
22  \brief string sparse vector based on bit-transposed matrix
23 */
24 
25 #include <stddef.h>
26 #include "bmconst.h"
27 
28 #ifndef BM_NO_STL
29 #include <stdexcept>
30 #include <string_view>
31 #endif
32 
33 #ifndef BM__H__INCLUDED__
34 // BitMagic utility headers do not include main "bm.h" declaration
35 // #include "bm.h" or "bm64.h" explicitly
36 # error missing include (bm.h or bm64.h)
37 #endif
38 
39 #include "bmtrans.h"
40 #include "bmalgo.h"
41 #include "bmbuffer.h"
42 #include "bmbmatrix.h"
43 #include "bmdef.h"
44 
45 namespace bm
46 {
47 
48 enum class remap_setup
49 {
50  COPY_RTABLES, //!< copy remap tables only (without data)
51 };
52 
53 
54 /*!
55  \brief succinct sparse vector for strings with compression using bit-slicing ( transposition) method
56 
57  Initial string is bit-transposed into bit-slices so collection may use less
58  memory due to prefix sum (GAP) compression in bit-slices. In addition, the container
59  can use chracter re-mapping using char freaquencies to compute the minimal codes.
60  Re-mapping can reduce memory footprint, get better search performance and improve storage
61  compression.
62 
63  Template parameters:
64  CharType - type of character (char or unsigned char) (wchar not tested)
65  BV - bit-vector for bit-slicing
66  STR_SIZE - initial string size (can dynamically increase on usage)
67 
68  @ingroup sv
69 */
70 template<typename CharType, typename BV, unsigned STR_SIZE>
71 class str_sparse_vector : public base_sparse_vector<CharType, BV, STR_SIZE>
72 {
73 public:
74  typedef BV bvector_type;
77  typedef CharType value_type;
78  typedef CharType* value_type_prt;
79  typedef typename bvector_type::size_type size_type;
80  typedef typename BV::allocator_type allocator_type;
81  typedef typename bvector_type::allocation_policy allocation_policy_type;
82  typedef typename bvector_type::enumerator bvector_enumerator_type;
83  typedef typename allocator_type::allocator_pool_type allocator_pool_type;
87 
88  /*! Statistical information about memory allocation details. */
89  struct statistics : public bv_statistics
90  {};
91 
93  {
94  sv_octet_slices = STR_SIZE
95  };
96 
97  /** Matrix of character remappings
98  @internal
99  */
103 
104  /** Matrix of character frequencies (for optimal code remap)
105  @internal
106  */
107  typedef
109 
110  struct is_remap_support { enum trait { value = true }; };
111  struct is_rsc_support { enum trait { value = false }; };
112  struct is_dynamic_splices { enum trait { value = true }; };
113 
115  {
116  public:
117  typedef
120  protected:
122  };
123 
124  /**
125  Reference class to access elements via common [] operator
126  @ingroup sv
127  */
128  class const_reference : protected reference_base
129  {
130  public:
133  size_type idx)
134  : str_sv_(str_sv), idx_(idx)
135  {
136  this->buf_.resize(str_sv.effective_max_str());
137  }
138 
139  operator const value_type*() const BMNOEXCEPT
140  {
141  return get();
142  }
143 
145  {
146  str_sv_.get(idx_, this->buf_.data(), str_sv_.effective_max_str());
147  return this->buf_.data();
148  }
149 
150  bool operator==(const const_reference& ref) const BMNOEXCEPT
151  { return bool(*this) == bool(ref); }
152  bool is_null() const BMNOEXCEPT { return str_sv_.is_null(idx_); }
153  private:
156  };
157 
158  /**
159  Reference class to access elements via common [] operator
160  @ingroup sv
161  */
162  class reference : protected reference_base
163  {
164  public:
166  size_type idx)
167  : str_sv_(str_sv), idx_(idx)
168  {
169  this->buf_.resize(str_sv.effective_max_str());
170  }
171 
172  operator const value_type*() const BMNOEXCEPT
173  {
174  return get();
175  }
176 
178  {
179  str_sv_.get(idx_, this->buf_.data(), str_sv_.effective_max_str());
180  return this->buf_.data();
181  }
182 
184  {
185  // TO DO: implement element copy bit by bit
186  str_sv_.set(idx_, (const value_type*)ref);
187  return *this;
188  }
189 
191  {
192  str_sv_.set(idx_, str);
193  return *this;
194  }
195  bool operator==(const reference& ref) const BMNOEXCEPT
196  { return bool(*this) == bool(ref); }
197  bool is_null() const BMNOEXCEPT { return str_sv_.is_null(idx_); }
198  private:
201  };
202 
203  /**
204  Const iterator to do quick traverse of the sparse vector.
205 
206  Implementation uses buffer for decoding so, competing changes
207  to the original vector may not match the iterator returned values.
208 
209  This iterator keeps an operational buffer of decoded elements, so memory footprint is NOT negligable.
210 
211  @ingroup sv
212  */
214  {
215  public:
216  friend class str_sparse_vector;
217 #ifndef BM_NO_STL
218  typedef std::input_iterator_tag iterator_category;
219  typedef std::basic_string_view<CharType> string_view_type;
220 #endif
226  typedef typename bvector_type::allocator_type allocator_type;
227  typedef typename allocator_type::allocator_pool_type allocator_pool_type;
228 
229  typedef long long difference_type;
230  typedef CharType* pointer;
231  typedef CharType*& reference;
232  public:
233  /**
234  Construct iterator (not attached to any particular vector)
235  */
237  /**
238  Construct iterator (attached to sparse vector)
239  @param sv - pointer to sparse vector
240  */
242  /**
243  Construct iterator (attached to sparse vector) and positioned
244  @param sv - reference to sparse vector
245  @param pos - position in the vector to start
246  */
248 
250 
251  /**
252  setup iterator to retrieve a sub-string of a string
253  @param from - Position of the first character to be copied
254  @param len - length of a substring (defult: 0 read to the available end)
255  */
256  void set_substr(unsigned from, unsigned len = 0) BMNOEXCEPT;
257 
258 
259  bool operator==(const const_iterator& it) const BMNOEXCEPT
260  { return (pos_ == it.pos_) && (sv_ == it.sv_); }
261  bool operator!=(const const_iterator& it) const BMNOEXCEPT
262  { return ! operator==(it); }
263  bool operator < (const const_iterator& it) const BMNOEXCEPT
264  { return pos_ < it.pos_; }
265  bool operator <= (const const_iterator& it) const BMNOEXCEPT
266  { return pos_ <= it.pos_; }
267  bool operator > (const const_iterator& it) const BMNOEXCEPT
268  { return pos_ > it.pos_; }
269  bool operator >= (const const_iterator& it) const BMNOEXCEPT
270  { return pos_ >= it.pos_; }
271 
272  /// \brief Get current position (value)
273  const value_type* operator*() const
274  { return this->value(); }
275 
276  /// \brief Advance to the next available value
278  { this->advance(); return *this; }
279 
280  /// \brief Advance to the next available value
282  { const_iterator tmp(*this);this->advance(); return tmp; }
283 
284 
285  /// \brief Get zero terminated string value at the current position
286  const value_type* value() const;
287 
288  /// \brief Get current string as string_view
290 
291  /// \brief Get NULL status
292  bool is_null() const BMNOEXCEPT { return sv_->is_null(this->pos_); }
293 
294  /// Returns true if iterator is at a valid position
295  bool valid() const BMNOEXCEPT { return pos_ != bm::id_max; }
296 
297  /// Invalidate current iterator
299 
300  /// Current position (index) in the vector
301  size_type pos() const BMNOEXCEPT { return pos_; }
302 
303  /// re-position to a specified position
305 
306  /// advance iterator forward by one
307  void advance() BMNOEXCEPT;
308 
309  protected:
311  {
312  n_rows = 1024
313  };
315 
316  private:
317  const str_sparse_vector_type* sv_; ///!< ptr to parent
318  unsigned substr_from_; ///!< substring from
319  unsigned substr_to_; ///!< substring to
320  mutable size_type pos_; ///!< Position
321  mutable buffer_matrix_type buf_matrix_; ///!< decode value buffer
322  mutable size_type pos_in_buf_; ///!< buffer position
323  };
324 
325 
326  /**
327  Back insert iterator implements buffered insert, faster than generic
328  access assignment.
329 
330  Limitations for buffered inserter:
331  1. Do not use more than one inserter (into one vector) at the same time
332  2. Use method flush() at the end to send the rest of accumulated buffer
333  flush is happening automatically on destruction, but if flush produces an
334  exception (for whatever reason) it will be an exception in destructor.
335  As such, explicit flush() is safer way to finilize the sparse vector load.
336 
337  @ingroup sv
338  */
340  {
341  public:
342 #ifndef BM_NO_STL
343  typedef std::output_iterator_tag iterator_category;
344 #endif
350  typedef typename bvector_type::allocator_type allocator_type;
351  typedef typename allocator_type::allocator_pool_type allocator_pool_type;
352 
353  typedef void difference_type;
354  typedef void pointer;
355  typedef void reference;
356 
357  public:
361 
363  {
364  BM_ASSERT(bi.empty());
366  bi.buf_matrix_.rows(), bi.buf_matrix_.cols());
367  this->flush_impl(); sv_ = bi.sv_;
368  return *this;
369  }
370 
372 
373  /**
374  Set optimization on load option (deafult: false)
375  */
376  void set_optimize(typename bvector_type::optmode opt_mode) BMNOEXCEPT
377  { opt_mode_ = opt_mode; }
378 
379  /**
380  Method to configure back inserter to collect statistics on optimal character codes.
381  This methos makes back inserter slower, but can be used to accelerate later remap() of
382  the sparse vector. Use flush at the end to apply the remapping.
383  By default inserter does not collect additional statistics.
384 
385  Important! You should NOT use intermediate flush if you set remapping!
386 
387  @sa flush
388  */
389  void set_remap(bool flag) BMNOEXCEPT { remap_flags_ = flag; }
390 
391  /// Get curent remap state flags
392  unsigned get_remap() const BMNOEXCEPT { return remap_flags_; }
393 
394  /** push value to the vector */
396  { this->add(v); return *this; }
397 
398 
399  /** push value to the vector */
400  template<typename StrType>
401  back_insert_iterator& operator=(const StrType& v)
402  {
403  this->add(v.c_str()); return *this; // TODO: avoid c_str()
404  }
405 
406  /** noop */
407  back_insert_iterator& operator*() { return *this; }
408  /** noop */
409  back_insert_iterator& operator++() { return *this; }
410  /** noop */
411  back_insert_iterator& operator++( int ) { return *this; }
412 
413  /** add value to the container*/
414  void add(const value_type* v);
415 
416  /** add NULL (no-value) to the container */
417  void add_null();
418 
419  /** add a series of consequitve NULLs (no-value) to the container */
420  void add_null(size_type count);
421 
422  /** flush the accumulated buffer. It is important to call flush at the end, before destruction of the
423  inserter. Flush may throw exceptions, which will be possible to intercept.
424  Otherwise inserter is flushed in the destructor.
425  */
426  void flush();
427 
428  // access to internals
429  //
430 
431  /// Get octet frequence matrix
432  /// @internal
434  { return omatrix_; }
435 
436  protected:
437  /** return true if insertion buffer is empty */
438  bool empty() const BMNOEXCEPT;
439 
441 
442  /** add value to the buffer without changing the NULL vector
443  @param v - value to push back
444  @internal
445  */
446  void add_value(const value_type* v);
447 
448  /**
449  account new value as remap statistics
450  */
452 
453  void flush_impl();
454 
455  private:
457  {
459  };
461  friend class str_sparse_vector;
462 
463  protected:
464  str_sparse_vector_type* sv_; ///!< pointer on the parent vector
465  bvector_type* bv_null_; ///!< not NULL vector pointer
466  buffer_matrix_type buf_matrix_; ///!< value buffer
467  size_type pos_in_buf_; ///!< buffer position
468  block_idx_type prev_nb_ = 0;///!< previous block added
469  typename
470  bvector_type::optmode opt_mode_ = bvector_type::opt_compress;
471  ///
472  unsigned remap_flags_ = 0; ///< target remapping
473  octet_freq_matrix_type omatrix_; ///< octet frequency matrix
474  };
475 
476 
477 public:
478 
479  /*!
480  \brief Sparse vector constructor
481 
482  \param null_able - defines if vector supports NULL values flag
483  by default it is OFF, use bm::use_null to enable it
484  \param ap - allocation strategy for underlying bit-vectors
485  Default allocation policy uses BM_BIT setting (fastest access)
486  \param bv_max_size - maximum possible size of underlying bit-vectors
487  Please note, this is NOT size of svector itself, it is dynamic upper limit
488  which should be used very carefully if we surely know the ultimate size
489  \param alloc - allocator for bit-vectors
490 
491  \sa bvector<>
492  \sa bm::bvector<>::allocation_policy
493  \sa bm::startegy
494  */
497  size_type bv_max_size = bm::id_max,
498  const allocator_type& alloc = allocator_type());
499 
500  /*! copy-ctor */
501  str_sparse_vector(const str_sparse_vector& str_sv);
502 
503  /*! construct empty sparse vector, copying the remap tables from another vector
504  \param str_sv - source vector to take the remap tables from (assumed to be remaped)
505  \param remap_mode - remap table copy param
506  */
507  str_sparse_vector(const str_sparse_vector& str_sv, bm::remap_setup remap_mode);
508 
509 
510  /*! copy assignmment operator */
513  {
514  if (this != &str_sv)
515  parent_type::copy_from(str_sv);
516  remap_flags_ = str_sv.remap_flags_;
519  return *this;
520  }
521 #ifndef BM_NO_CXX11
522  /*! move-ctor */
524  {
525  parent_type::swap(str_sv);
526  remap_flags_ = str_sv.remap_flags_;
527  remap_matrix1_.swap(str_sv.remap_matrix1_);
528  remap_matrix2_.swap(str_sv.remap_matrix2_);
529  }
530 
531  /*! move assignmment operator */
534  {
535  if (this != &str_sv)
536  this->swap(str_sv);
537  return *this;
538  }
539 #endif
540 
541 public:
542 
543  // ------------------------------------------------------------
544  /*! @name String element access */
545  ///@{
546 
547  /** \brief Operator to get read access to an element */
549  { return const_reference(*this, idx); }
550 
551  /** \brief Operator to get write access to an element */
552  reference operator[](size_type idx) { return reference(*this, idx); }
553 
554  /*!
555  \brief set specified element with bounds checking and automatic resize
556  \param idx - element index (vector auto-resized if needs to)
557  \param str - string to set (zero terminated)
558  */
559  void set(size_type idx, const value_type* str);
560 
561  /*!
562  \brief set NULL status for the specified element
563  Vector is resized automatically
564  \param idx - element index (vector auto-resized if needs to)
565  */
566  void set_null(size_type idx);
567 
568  /**
569  Set NULL all elements set as 1 in the argument vector
570  \param bv_idx - index bit-vector for elements which needs to be turned to NULL
571  */
572  void set_null(const bvector_type& bv_idx)
573  { this->bit_sub_rows(bv_idx, true); }
574 
575  /**
576  Set vector elements spcified by argument bit-vector to empty
577  Note that set to empty elements are NOT going to tuned to NULL (NULL qualifier is preserved)
578  \param bv_idx - index bit-vector for elements which to be set to 0
579  */
580  void clear(const bvector_type& bv_idx)
581  { this->bit_sub_rows(bv_idx, false); }
582 
583 
584  /**
585  Set NULL all elements NOT set as 1 in the argument vector
586  \param bv_idx - index bit-vector for elements which needs to be kept
587  */
588  void keep(const bvector_type& bv_idx) { this->bit_and_rows(bv_idx); }
589 
590 
591  /*!
592  \brief insert the specified element
593  \param idx - element index (vector auto-resized if needs to)
594  \param str - string to set (zero terminated)
595  */
596  void insert(size_type idx, const value_type* str);
597 
598  /**
599  \brief swap two vector elements between each other
600  \param idx1 - element index 1
601  \param idx1 - element index 2
602  */
603  void swap(size_type idx1, size_type idx2);
604 
605 
606 
607  /*!
608  \brief insert STL string
609  \param idx - element index (vector auto-resized if needs to)
610  \param str - STL string to set
611  */
612  template<typename StrType>
613  void insert(size_type idx, const StrType& str)
614  {
615  this->insert(idx, str.c_str()); // TODO: avoid c_str()
616  }
617 
618  /*!
619  \brief erase the specified element
620  \param idx - element index
621  */
622  void erase(size_type idx);
623 
624  /*!
625  \brief get specified element
626 
627  \param idx - element index
628  \param str - string buffer
629  \param buf_size - string buffer size
630 
631  @return string length
632  */
633  size_type get(size_type idx,
634  value_type* str, size_type buf_size) const BMNOEXCEPT;
635 
636  /*!
637  \brief set specified element with bounds checking and automatic resize
638 
639  This is an equivalent of set() method, but templetized to be
640  more compatible with the STL std::string and the likes
641 
642  \param idx - element index (vector auto-resized if needs to)
643  \param str - input string
644  expected an STL class with size() support,
645  like basic_string<> or vector<char>
646  */
647  template<typename StrType>
648  void assign(size_type idx, const StrType& str)
649  {
650  if (idx >= this->size())
651  this->size_ = idx+1;
652 
653  size_type str_size = size_type(str.size());
654  if (!str_size)
655  {
656  this->clear_value_planes_from(0, idx);
657  return;
658  }
659  for (size_type i=0; i < str_size; ++i)
660  {
661  CharType ch = str[i];
662  if (remap_flags_) // compressional re-mapping is in effect
663  {
664  unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
665  BM_ASSERT(remap_value);
666  ch = CharType(remap_value);
667  }
668  this->bmatr_.set_octet(idx, i, (unsigned char)ch);
669  if (!ch)
670  break;
671  } // for i
672  this->clear_value_planes_from(unsigned(str_size*8), idx);
673  if (bvector_type* bv_null = this->get_null_bvect())
674  bv_null->set_bit_no_check(idx);
675  }
676 
677  /*!
678  \brief push back a string
679  \param str - string to set
680  (STL class with size() support, like basic_string)
681  */
682  template<typename StrType>
683  void push_back(const StrType& str) { assign(this->size_, str); }
684 
685  /*!
686  \brief push back a string (zero terminated)
687  \param str - string to set
688  */
689  void push_back(const value_type* str) { set(this->size_, str); }
690 
691  /*!
692  \brief push back specified amount of NULL values
693  \param count - number of NULLs to push back
694  */
695  void push_back_null(size_type count);
696 
697  /*!
698  \brief push back NULL value
699  */
701 
702  /*!
703  \brief get specified string element if NOT NULL
704  Template method expects an STL-compatible type basic_string<>
705  \param idx - element index (vector auto-resized if needs to)
706  \param str - string to get [out]
707  \return true if element is not null and try-get successfull
708  */
709  template<typename StrType>
710  bool try_get(size_type idx, StrType& str) const
711  {
712  if (this->is_null(idx))
713  return false;
714  get(idx, str);
715  return true;
716  }
717 
718 
719  /*!
720  \brief get specified string element
721  Template method expects an STL-compatible type basic_string<>
722  \param idx - element index (vector auto-resized if needs to)
723  \param str - string to get [out]
724  */
725  template<typename StrType>
726  void get(size_type idx, StrType& str) const
727  {
728  str.clear();
729  for (unsigned i = 0; true; ++i)
730  {
731  CharType ch = CharType(this->bmatr_.get_octet(idx, i));
732  if (!ch)
733  break;
734  if (remap_flags_)
735  {
736  const unsigned char* remap_row = remap_matrix1_.row(i);
737  unsigned char remap_value = remap_row[unsigned(ch)];
738  BM_ASSERT(remap_value);
739  if (!remap_value) // unknown dictionary element
740  {
741  throw_bad_value(0);
742  break;
743  }
744  ch = CharType(remap_value);
745  }
746  str.push_back(ch);
747  } // for i
748  }
749 
750  /*! Swap content */
751  void swap(str_sparse_vector& str_sv) BMNOEXCEPT;
752 
753  ///@}
754 
755  // ------------------------------------------------------------
756  /*! @name Element comparison functions */
757  ///@{
758 
759  /**
760  \brief Compare vector element with argument lexicographically
761 
762  The function does not account for NULL values, NULL element is treated as an empty string
763 
764  NOTE: for a re-mapped container, input string may have no correct
765  remapping, in this case we have an ambiguity
766  (we know it is not equal (0) but LT or GT?).
767  Behavior is undefined.
768 
769  \param idx - vactor element index
770  \param str - argument to compare with
771 
772  \return 0 - equal, < 0 - vect[idx] < str, >0 otherwise
773  */
774  int compare(size_type idx, const value_type* str) const BMNOEXCEPT;
775 
776  static
777  int compare_str(const value_type* str1, const value_type* str2) BMNOEXCEPT;
778 
779  static
780  int compare_str(const value_type* str1, const value_type* str2, size_t min_len) BMNOEXCEPT;
781 
782  /**
783  \brief Compare two vector elements
784 
785  The function does not account for NULL values, NULL element is treated as an empty string
786  \param idx1 - vactor element index 1
787  \param idx2 - vactor element index 2
788 
789  \return 0 - equal, < 0 - vect[idx1] < vect[idx2], >0 otherwise
790  */
791  int compare(size_type idx1, size_type idx2) const BMNOEXCEPT;
792 
793 
794  /**
795  \brief Find size of common prefix between two vector elements in octets
796  @param prefix_buf - optional param for keeping the common prefix string (without remap decode)
797  \return size of common prefix
798  */
799  template<bool USE_PREFIX_BUF = false>
800  unsigned common_prefix_length(size_type idx1, size_type idx2,
801  value_type* prefix_buf=0) const BMNOEXCEPT;
802 
803  /**
804  Variant of compare for remapped vectors. Caller MUST guarantee vector is remapped.
805  */
807 
808  /**
809  Variant of compare for non-mapped vectors. Caller MUST guarantee vector is not remapped.
810  */
812 
813  ///@}
814 
815 
816  // ------------------------------------------------------------
817  /*! @name Clear */
818  ///@{
819 
820  /*! \brief resize to zero, free memory
821  @param free_mem - true - free all bit-vectors memory,
822  false - set bit-vecor to zero (memory remains reserved)
823  @param remap - 0 - set to no-remap (default), 1 - keep remap substitution matrix for possible re-use
824  (if remap() was ever called on this vector with the datawith same frequency profiles)
825  Note that feeding the data with disimilar frequency profile would cause undefined behavior.
826  @sa remap
827  */
828  void clear_all(bool free_mem, unsigned remap=0) BMNOEXCEPT;
829 
830  /*! \brief resize to zero, free memory, reset remapping */
831  void clear() BMNOEXCEPT { clear_all(true, 0); }
832 
833  /*!
834  \brief clear range (assign bit 0 for all planes)
835  \param left - interval start
836  \param right - interval end (closed interval)
837  \param set_null - set cleared values to unassigned (NULL)
838  */
840  clear_range(size_type left, size_type right, bool set_null = false)
841  {
842  parent_type::clear_range(left, right, set_null);
843  return *this;
844  }
845 
846 
847  ///@}
848 
849 
850  // ------------------------------------------------------------
851  /*! @name Size, etc */
852  ///@{
853 
854  /*! \brief return size of the vector
855  \return size of sparse vector
856  */
857  size_type size() const { return this->size_; }
858 
859  /*! \brief return true if vector is empty
860  \return true if empty
861  */
862  bool empty() const { return (size() == 0); }
863 
864  /*! \brief resize vector
865  \param sz - new size
866  */
867  void resize(size_type sz) { parent_type::resize(sz, true); }
868 
869  /*! \brief get maximum string length capacity
870  \return maximum string length sparse vector can take
871  */
872  static size_type max_str() { return sv_octet_slices; }
873 
874  /*! \brief get effective string length used in vector
875  Calculate and returns efficiency, how close are we
876  to the reserved maximum.
877  \return current string length maximum
878  */
880 
881  /*! \brief get effective string length used in vector
882  \return current string length maximum
883  */
885 
886  /**
887  \brief recalculate size to exclude tail NULL elements
888  After this call size() will return the true size of the vector
889  */
890  void sync_size() BMNOEXCEPT;
891 
892  ///@}
893 
894 
895  // ------------------------------------------------------------
896  /*! @name Memory optimization/compression */
897  ///@{
898 
899  /*!
900  \brief run memory optimization for all vector planes
901  \param temp_block - pre-allocated memory block to avoid unnecessary re-allocs
902  \param opt_mode - requested compression depth
903  \param stat - memory allocation statistics after optimization
904  */
905  void optimize(
906  bm::word_t* temp_block = 0,
907  typename bvector_type::optmode opt_mode = bvector_type::opt_compress,
908  typename str_sparse_vector<CharType, BV, STR_SIZE>::statistics* stat = 0);
909 
910  /*!
911  @brief Calculates memory statistics.
912 
913  Function fills statistics structure containing information about how
914  this vector uses memory and estimation of max. amount of memory
915  bvector needs to serialize itself.
916 
917  @param st - pointer on statistics structure to be filled in.
918 
919  @sa statistics
920  */
921  void calc_stat(
922  struct str_sparse_vector<CharType, BV, STR_SIZE>::statistics* st
923  ) const BMNOEXCEPT;
924 
925  /**
926  @brief Turn sparse vector into immutable mode
927  Read-only (immutable) vector uses less memory and allows faster searches.
928  Before freezing it is recommenede to call optimize() to get full memory saving effect
929  @sa optimize, remap
930  */
931  void freeze() { this->freeze_matr(); }
932 
933  /** Returns true if vector is read-only */
934  bool is_ro() const BMNOEXCEPT { return this->is_ro_; }
935 
936  ///@}
937 
938  // ------------------------------------------------------------
939  /*! @name Iterator access */
940  ///@{
941 
942  /** Provide const iterator access to container content */
943  const_iterator begin() const BMNOEXCEPT;
944 
945  /** Provide const iterator access to the end */
947 
948  /** Get const_itertor re-positioned to specific element
949  @param idx - position in the sparse vector
950  */
952  { return const_iterator(this, idx); }
953 
954  /** Provide back insert iterator
955  Back insert iterator implements buffered insertion, which is faster, than random access
956  or push_back
957  */
959  { return back_insert_iterator(this); }
960 
961  ///@}
962 
963 
964 
965  // ------------------------------------------------------------
966  /*! @name Various traits */
967  ///@{
968 
969  /** \brief various type traits
970  */
971  static constexpr
972  bool is_compressed() BMNOEXCEPT { return false; }
973 
974  static constexpr
975  bool is_str() BMNOEXCEPT { return true; }
976 
977  ///@}
978 
979  // ------------------------------------------------------------
980  /*! @name Char remapping, succinct utilities
981 
982  Remapping runs character usage analysis (frequency analysis)
983  based on that implements reduction of dit-depth thus improves
984  search performance and memory usage (both RAM and serialized).
985 
986  Remapping limits farther modifications of sparse vector.
987  (Use remapped vector as read-only).
988  */
989 
990  ///@{
991 
992  /**
993  Get character remapping status (true | false)
994  */
995  bool is_remap() const BMNOEXCEPT { return remap_flags_ != 0; }
996 
997  /**
998  Build remapping profile and load content from another sparse vector
999  Remapped vector likely saves memory (both RAM and disk) but
1000  should not be modified (should be read-only).
1001 
1002  \param str_sv - source sparse vector (assumed it is not remapped)
1003  \param omatrix - pointer to externall computed char freaquency matrix (optional)
1004  \so remap, freeze
1005  */
1006  void remap_from(const str_sparse_vector& str_sv,
1007  octet_freq_matrix_type* omatrix = 0);
1008 
1009  /**
1010  Build remapping profile and re-load content to save memory
1011  */
1012  void remap();
1013 
1014  /*!
1015  Calculate flags which octets are present on each byte-plane.
1016  @internal
1017  */
1018  void calc_octet_stat(octet_freq_matrix_type& octet_matrix) const;
1019 
1020  /*!
1021  Compute optimal remap codes
1022  @internal
1023  */
1024  void build_octet_remap(
1025  slice_octet_matrix_type& octet_remap_matrix1,
1026  slice_octet_matrix_type& octet_remap_matrix2,
1027  octet_freq_matrix_type& octet_occupancy_matrix) const;
1028  /*!
1029  remap string from external (ASCII) system to matrix internal code
1030  @return true if remapping was ok, false if found incorrect value
1031  for the plane
1032  @internal
1033  */
1034  static
1035  bool remap_tosv(value_type* BMRESTRICT sv_str,
1036  size_type buf_size,
1037  const value_type* BMRESTRICT str,
1038  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2
1039  ) BMNOEXCEPT;
1040  /*!
1041  remap string from external (ASCII) system to matrix internal code
1042  also creates a zero terminated copy string
1043  @return true if remapping was ok, false if found incorrect value
1044  for the plane
1045  @internal
1046  */
1047  static
1048  bool remap_n_tosv_2way(
1049  value_type* BMRESTRICT sv_str,
1050  value_type* BMRESTRICT str_cp,
1051  size_type buf_size,
1052  const value_type* BMRESTRICT str,
1053  size_t in_len,
1054  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT;
1055 
1056  /*!
1057  remap string from external (ASCII) system to matrix internal code
1058  @internal
1059  */
1060  bool remap_tosv(value_type* sv_str,
1061  size_type buf_size,
1062  const value_type* str) const BMNOEXCEPT
1063  {
1064  return remap_tosv(sv_str, buf_size, str, remap_matrix2_);
1065  }
1066 
1067  /*!
1068  remap string from external (ASCII) system to matrix internal code
1069  @internal
1070  */
1072  value_type* BMRESTRICT sv_str,
1073  value_type* BMRESTRICT str_cp,
1074  size_type buf_size,
1075  const value_type* BMRESTRICT str,
1076  size_t in_len) const BMNOEXCEPT
1077  {
1078  return remap_n_tosv_2way(
1079  sv_str, str_cp, buf_size, str, in_len, remap_matrix2_);
1080  }
1081 
1082  /*!
1083  remap string from internal code to external (ASCII) system
1084  @return true if remapping was ok, false if found incorrect value
1085  for the plane
1086  @internal
1087  */
1088  static
1089  bool remap_fromsv(
1091  size_type buf_size,
1092  const value_type* BMRESTRICT sv_str,
1093  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix1
1094  ) BMNOEXCEPT;
1095  /*!
1096  re-calculate remap matrix2 based on matrix1
1097  @internal
1098  */
1099  void recalc_remap_matrix2();
1100 
1101  ///@}
1102 
1103  // ------------------------------------------------------------
1104  /*! @name Export content to C-style */
1105  ///@{
1106 
1107  /**
1108  \brief Bulk export strings to a C-style matrix of chars
1109 
1110  \param cmatr - dest matrix (bm::heap_matrix)
1111  \param idx_from - index in the sparse vector to export from
1112  \param dec_size - decoding size (matrix column allocation should match)
1113  \param zero_mem - set to false if target array is pre-initialized
1114  with 0s to avoid performance penalty
1115 
1116  \return number of actually exported elements (can be less than requested)
1117  */
1118  template<typename CharMatrix>
1119  size_type decode(CharMatrix& cmatr,
1120  size_type idx_from,
1121  size_type dec_size,
1122  bool zero_mem = true) const
1123  {
1124  size_type str_len = effective_max_str();
1125  return decode_substr(cmatr, idx_from, dec_size,
1126  0, unsigned(str_len-1), zero_mem);
1127  }
1128 
1129  /**
1130  \brief Bulk export strings to a C-style matrix of chars
1131 
1132  \param cmatr - dest matrix (bm::heap_matrix)
1133  \param idx_from - index in the sparse vector to export from
1134  \param dec_size - decoding size (matrix column allocation should match)
1135  \param substr_from - sub-string position from
1136  \param substr_to - sub-string position to
1137  \param zero_mem - set to false if target array is pre-initialized
1138  with 0s to avoid performance penalty
1139 
1140  \return number of actually exported elements (can be less than requested)
1141  */
1142  template<typename CharMatrix>
1143  size_type decode_substr(CharMatrix& cmatr,
1144  size_type idx_from,
1145  size_type dec_size,
1146  unsigned substr_from,
1147  unsigned substr_to,
1148  bool zero_mem = true) const
1149  {
1150 
1151  /// Decoder functor
1152  /// @internal
1153  ///
1154  struct sv_decode_visitor_func
1155  {
1156  sv_decode_visitor_func(CharMatrix& cmatr) BMNOEXCEPT2
1157  : cmatr_(cmatr)
1158  {}
1159 
1160  int add_bits(size_type bv_offset,
1161  const unsigned char* BMRESTRICT bits,
1162  unsigned bits_size) BMNOEXCEPT
1163  {
1164  BM_ASSERT(bits_size);
1165 
1166  // can be negative (-1) when bv base offset = 0 and sv = 1,2..
1167  size_type base = bv_offset - sv_off_;
1168  const unsigned_value_type m = mask_;
1169  const unsigned i = substr_i_;
1170  unsigned j = 0;
1171  do
1172  {
1173  size_type idx = bits[j] + base;
1174  value_type* BMRESTRICT str = cmatr_.row(idx);
1175  str[i] |= m;
1176  } while (++j < bits_size);
1177  return 0;
1178  }
1179 
1180  int add_range(size_type bv_offset, size_type sz) BMNOEXCEPT
1181  {
1182  BM_ASSERT(sz);
1183 
1184  auto base = bv_offset - sv_off_;
1185  const unsigned_value_type m = mask_;
1186  const unsigned i = substr_i_;
1187  size_type j = 0;
1188  do
1189  {
1190  size_type idx = j + base;
1191  value_type* BMRESTRICT str = cmatr_.row(idx);
1192  str[i] |= m;
1193  } while(++j < sz);
1194  return 0;
1195  }
1196 
1197  CharMatrix& cmatr_; ///< target array for reverse transpose
1198  unsigned_value_type mask_ = 0; ///< bit-plane mask
1199  unsigned substr_i_= 0; ///< i
1200  size_type sv_off_ = 0; ///< SV read offset
1201  };
1202 
1203 
1204  BM_ASSERT(substr_from <= substr_to);
1205  BM_ASSERT(cmatr.is_init());
1206 
1207  if (zero_mem)
1208  cmatr.set_zero(); // TODO: set zero based on requested capacity
1209 
1210  size_type rows = size_type(cmatr.rows());
1211  size_type max_sz = this->size() - idx_from;
1212  if (max_sz < dec_size)
1213  dec_size = max_sz;
1214  if (rows < dec_size)
1215  dec_size = rows;
1216  if (!dec_size)
1217  return dec_size;
1218 
1219  sv_decode_visitor_func func(cmatr);
1220 
1221  for (unsigned i = substr_from; i <= substr_to; ++i)
1222  {
1223  unsigned bi = 0;
1224  func.substr_i_ = i - substr_from; // to put substr at the str[0]
1225 
1226  auto rsize = this->bmatr_.rows_not_null();
1227  for (unsigned k = i * 8; k < (i * 8) + 8; ++k, ++bi)
1228  {
1229  if (k >= rsize)
1230  goto break2;
1231  const bvector_type* bv = this->bmatr_.get_row(k);
1232  if (!bv)
1233  continue;
1234  BM_ASSERT (bv != this->get_null_bvector());
1235 
1236  func.mask_ = unsigned_value_type(1u << bi);
1237  func.sv_off_ = idx_from;
1238 
1239  size_type end = idx_from + dec_size;
1240  bm::for_each_bit_range_no_check(*bv, idx_from, end-1, func);
1241 
1242  } // for k
1243  } // for i
1244  break2:
1245 
1246  if (remap_flags_)
1247  {
1248  for (unsigned i = 0; i < dec_size; ++i)
1249  {
1250  typename CharMatrix::value_type* str = cmatr.row(i);
1252  } // for i
1253  }
1254  return dec_size;
1255  }
1256 
1257  /**
1258  \brief Bulk import of strings from a C-style matrix of chars
1259 
1260  \param cmatr - source matrix (bm::heap_matrix)
1261  [in/out] parameter gets modified(corrupted)
1262  in the process
1263  \param idx_from - destination index in the sparse vector
1264  \param imp_size - import size (number or rows to import)
1265  */
1266  template<typename CharMatrix>
1267  void import(CharMatrix& cmatr, size_type idx_from, size_type imp_size)
1268  {
1269  if (!imp_size)
1270  return;
1271  if (idx_from < this->size_) // in case it touches existing elements
1272  {
1273  // clear all planes in the range to provide corrrect import of 0 values
1274  this->clear_range(idx_from, idx_from + imp_size - 1);
1275  }
1276  import_no_check(cmatr, idx_from, imp_size);
1277  }
1278 
1279  /**
1280  \brief Bulk push-back import of strings from a C-style matrix of chars
1281 
1282  \param cmatr - source matrix (bm::heap_matrix)
1283  [in/out] parameter gets modified(corrupted)
1284  in the process
1285  \param imp_size - import size (number or rows to import)
1286  */
1287  template<typename CharMatrix>
1288  void import_back(CharMatrix& cmatr, size_type imp_size)
1289  {
1290  if (!imp_size)
1291  return;
1292  import_no_check(cmatr, this->size(), imp_size);
1293  }
1294 
1295 
1296  ///@}
1297 
1298  // ------------------------------------------------------------
1299  /*! @name Merge, split, partition data */
1300  ///@{
1301 
1302  /**
1303  @brief copy range of values from another sparse vector
1304 
1305  Copy [left..right] values from the source vector,
1306  clear everything outside the range.
1307 
1308  \param sv - source vector
1309  \param left - index from in losed diapason of [left..right]
1310  \param right - index to in losed diapason of [left..right]
1311  \param slice_null - "use_null" copy range for NULL vector or
1312  do not copy it
1313  */
1315  size_type left, size_type right,
1316  bm::null_support slice_null = bm::use_null);
1317 
1318  /**
1319  \brief merge with another sparse vector using OR operation
1320  Merge is different from join(), because it borrows data from the source
1321  vector, so it gets modified (destructive join)
1322 
1323  \param tr_sv - [in, out]argument vector to join with (vector mutates)
1324 
1325  \return self reference
1326  */
1329 
1330  /**
1331  Keep only specified interval in the sparse vector, clear all other
1332  elements.
1333 
1334  \param left - interval start
1335  \param right - interval end (closed interval)
1336  \param slice_null - "use_null" copy range for NULL vector or not
1337  */
1338  void keep_range(size_type left, size_type right,
1339  bm::null_support slice_null = bm::use_null);
1340 
1341  ///@}
1342 
1343  // ------------------------------------------------------------
1344 
1345  /*! \brief syncronize internal structures */
1346  void sync(bool force);
1347 
1348  /*!
1349  \brief check if another sparse vector has the same content and size
1350 
1351  \param sv - sparse vector for comparison
1352  \param null_able - flag to consider NULL vector in comparison (default)
1353  or compare only value content planes
1354 
1355  \return true, if it is the same
1356  */
1358  bm::null_support null_able = bm::use_null) const BMNOEXCEPT;
1359 
1360  /**
1361  \brief find position of compressed element by its rank
1362  */
1363  static
1364  bool find_rank(size_type rank, size_type& pos) BMNOEXCEPT;
1365 
1366  /**
1367  \brief size of sparse vector (may be different for RSC)
1368  */
1370 
1371 protected:
1373  {
1374  ins_buf_size = bm::gap_max_bits // 1024 * 8
1375  };
1376 
1377  /// @internal
1378  template<typename CharMatrix, size_t BufSize = ins_buf_size>
1379  void import_no_check(CharMatrix& cmatr,
1380  size_type idx_from, size_type imp_size,
1381  bool set_not_null = true)
1382  {
1383  BM_ASSERT (cmatr.is_init());
1384 
1385  unsigned max_str_size = 0;
1386  {
1387  for (unsigned j = 0; j < imp_size; ++j)
1388  {
1389  typename CharMatrix::value_type* str = cmatr.row(j);
1390  typename CharMatrix::size_type i;
1391  typename CharMatrix::size_type cols = cmatr.cols();
1392  for (i = 0; i < cols; ++i)
1393  {
1394  value_type ch = str[i];
1395  if (!ch)
1396  {
1397  max_str_size =
1398  (unsigned)((i > max_str_size) ? i : max_str_size);
1399  break;
1400  }
1401  if (remap_flags_) // re-mapping is in effect
1402  {
1403  unsigned char remap_value =
1404  remap_matrix2_.get(i, (unsigned char)(ch));
1405  BM_ASSERT(remap_value); // unknown ?!
1406  /*
1407  if (!remap_value) // unknown dictionary element
1408  throw_bad_value(0); */
1409  str[i] = CharType(remap_value);
1410  }
1411  } // for i
1412  } // for j
1413  }
1414 
1415  this->bmatr_.allocate_rows((1+max_str_size) * 8 + this->is_nullable());
1416 
1417  unsigned_value_type ch_slice[BufSize];
1418  for (unsigned i = 0; i < max_str_size; ++i)
1419  {
1420  unsigned ch_acc = 0;
1421 #if defined(BMVECTOPT) || defined(BM_USE_GCC_BUILD)
1422  if (imp_size == ins_buf_size) /// full buffer import can use loop unrolling
1423  {
1424  for (size_type j = 0; j < imp_size; j+=4)
1425  {
1426  unsigned_value_type ch0 = (unsigned_value_type)cmatr.row(j)[i];
1427  unsigned_value_type ch1 = (unsigned_value_type)cmatr.row(j+1)[i];
1428  unsigned_value_type ch2 = (unsigned_value_type)cmatr.row(j+2)[i];
1429  unsigned_value_type ch3 = (unsigned_value_type)cmatr.row(j+3)[i];
1430 
1431  ch_acc |= ch0 | ch1 | ch2 | ch3;
1432  ch_slice[j] = ch0; ch_slice[j+1] = ch1;
1433  ch_slice[j+2] = ch2; ch_slice[j+3] = ch3;
1434  }
1435  }
1436  else
1437 #endif
1438  {
1439  for (size_type j = 0; j < imp_size; ++j)
1440  {
1441  unsigned_value_type ch = (unsigned_value_type)cmatr.row(j)[i];
1442  ch_acc |= ch;
1443  ch_slice[j] = ch;
1444  }
1445  }
1446  import_char_slice(ch_slice, ch_acc, i, idx_from, imp_size);
1447  }
1448 
1449  size_type idx_to = idx_from + imp_size - 1;
1450  if (set_not_null)
1451  {
1452  if (bvector_type* bv_null = this->get_null_bvect())
1453  bv_null->set_range(idx_from, idx_to);
1454  }
1455  if (idx_to >= this->size())
1456  this->size_ = idx_to+1;
1457 
1458  }
1459 #ifdef _MSC_VER
1460 #pragma warning( push )
1461 #pragma warning( disable : 4146 )
1462 #endif
1463  /// @internal
1464  template<size_t BufSize = ins_buf_size>
1466  unsigned ch_acc,
1467  size_type char_slice_idx,
1468  size_type idx_from, size_type imp_size)
1469  {
1470  size_type bit_list[BufSize];
1471  for ( ;ch_acc; ch_acc &= ch_acc - 1) // bit-scan
1472  {
1473  unsigned n_bits = 0;
1474  const unsigned bi = (bm::word_bitcount((ch_acc & -ch_acc) - 1));
1475  unsigned mask = 1u << bi;
1476 #if defined(BMVECTOPT) || defined(BM_USE_GCC_BUILD)
1477  if (imp_size == ins_buf_size) /// full buffer import can use loop unrolling
1478  {
1479  mask |= (mask << 8) | (mask << 16) | (mask << 24);
1480  for (size_type j = 0; j < imp_size; j+=4)
1481  {
1482  unsigned ch0 = ((unsigned)ch_slice[j+0]) |
1483  ((unsigned)ch_slice[j+1] << 8) |
1484  ((unsigned)ch_slice[j+2] << 16) |
1485  ((unsigned)ch_slice[j+3] << 24);
1486  ch0 &= mask;
1487  ch0 = (ch0 >> bi) | (ch0 >> (bi+7)) |
1488  (ch0 >> (bi+14)) | (ch0 >> (bi+21));
1489  ch0 &= 15u;
1491  for (size_type base_idx = idx_from + j ;ch0; ch0 &= ch0 - 1) // bit-scan
1492  {
1493  const unsigned bit_idx =
1494  (bm::word_bitcount((ch0 & -ch0) - 1));
1495  bit_list[n_bits++] = base_idx + bit_idx;
1496  } // for ch0
1497  } // for j
1498  }
1499  else
1500 #endif
1501  {
1502  for (size_type j = 0; j < imp_size; ++j)
1503  {
1504  unsigned ch = unsigned(ch_slice[j]);
1505  if (ch & mask)
1506  bit_list[n_bits++] = idx_from + j;
1507  } // for j
1508  }
1509 
1510  if (n_bits) // set transposed bits to the target plane
1511  {
1512  bvector_type* bv =
1513  this->get_create_slice((unsigned)(char_slice_idx * 8) + bi);
1514  bv->import_sorted(&bit_list[0], n_bits, false);
1515  }
1516  } // for ch_acc
1517  }
1518 #ifdef _MSC_VER
1519 #pragma warning( pop )
1520 #endif
1521  // ------------------------------------------------------------
1522  /*! @name Errors and exceptions */
1523  ///@{
1524 
1525  /**
1526  \brief throw range error
1527  \internal
1528  */
1529  static
1530  void throw_range_error(const char* err_msg);
1531 
1532  /**
1533  \brief throw domain error
1534  \internal
1535  */
1536  static
1537  void throw_bad_value(const char* err_msg);
1538 
1539  ///@}
1540 
1541  /*! \brief set value without checking boundaries */
1542  void set_value(size_type idx, const value_type* str);
1543 
1544  /*! \brief set value without checking boundaries or support of NULL */
1545  void set_value_no_null(size_type idx, const value_type* str);
1546 
1547  /*! \brief insert value without checking boundaries */
1548  void insert_value(size_type idx, const value_type* str);
1549 
1550  /*! \brief insert value without checking boundaries or support of NULL */
1551  void insert_value_no_null(size_type idx, const value_type* str);
1552 
1553 
1554  size_type size_internal() const { return size(); }
1556 
1557  size_t remap_size() const { return remap_matrix1_.get_buffer().size(); }
1558  const unsigned char* get_remap_buffer() const
1559  { return remap_matrix1_.get_buffer().buf(); }
1560  unsigned char* init_remap_buffer()
1561  {
1562  remap_matrix1_.init(true);
1563  return remap_matrix1_.get_buffer().data();
1564  }
1565  void set_remap() { remap_flags_ = 1; }
1566 
1567 protected:
1568 
1570  size_type* idx_from, size_type* idx_to) const
1571  {
1572  *idx_from = from; *idx_to = to; return true;
1573  }
1574 
1576  { return &remap_matrix1_; }
1578  { return &remap_matrix1_; }
1579 
1580  /**
1581  reamp using statistics table from inserter
1582  */
1583  void remap(back_insert_iterator& iit);
1584 
1585  /**
1586  Remap from implementation, please note that move_data flag can violate cosnt-ness
1587  */
1588  void remap_from_impl(const str_sparse_vector& str_sv,
1589  octet_freq_matrix_type* omatrix,
1590  bool move_data);
1591 
1592 protected:
1593  template<class SVect> friend class sparse_vector_serializer;
1594  template<class SVect> friend class sparse_vector_deserializer;
1595 
1596 protected:
1597  unsigned remap_flags_; ///< remapping status
1598  slice_octet_matrix_type remap_matrix1_; ///< octet remap table 1
1599  slice_octet_matrix_type remap_matrix2_; ///< octet remap table 2
1600 };
1601 
1602 //---------------------------------------------------------------------
1603 //---------------------------------------------------------------------
1604 
1605 
1606 template<class CharType, class BV, unsigned STR_SIZE>
1608  bm::null_support null_able,
1610  size_type bv_max_size,
1611  const allocator_type& alloc)
1612 : parent_type(null_able, true, ap, bv_max_size, alloc),
1613  remap_flags_(0)
1614 {
1615  static_assert(STR_SIZE > 1,
1616  "BM:: String vector size must be > 1 (to accomodate 0 terminator)");
1617 }
1618 
1619 
1620 //---------------------------------------------------------------------
1621 
1622 template<class CharType, class BV, unsigned STR_SIZE>
1624  const str_sparse_vector& str_sv)
1625 : parent_type(str_sv),
1626  remap_flags_(str_sv.remap_flags_),
1627  remap_matrix1_(str_sv.remap_matrix1_),
1628  remap_matrix2_(str_sv.remap_matrix2_)
1629 {
1630  static_assert(STR_SIZE > 1,
1631  "BM:: String vector size must be > 1 (to accomodate 0 terminator)");
1632 }
1633 
1634 //---------------------------------------------------------------------
1635 
1636 template<class CharType, class BV, unsigned STR_SIZE>
1638  const str_sparse_vector& str_sv, bm::remap_setup remap_mode)
1639 : parent_type(str_sv.get_null_support(), true),
1640  remap_flags_(str_sv.remap_flags_),
1641  remap_matrix1_(str_sv.remap_matrix1_),
1642  remap_matrix2_(str_sv.remap_matrix2_)
1643 {
1644  BM_ASSERT(str_sv.remap_flags_); // source vector should be remapped
1646  static_assert(STR_SIZE > 1,
1647  "BM:: String vector size must be > 1 (to accomodate 0 terminator)");
1648  (void) remap_mode;
1649 }
1650 
1651 //---------------------------------------------------------------------
1652 
1653 template<class CharType, class BV, unsigned STR_SIZE>
1655  str_sparse_vector& str_sv) BMNOEXCEPT
1656 {
1657  parent_type::swap(str_sv);
1658  bm::xor_swap(remap_flags_, str_sv.remap_flags_);
1659  remap_matrix1_.swap(str_sv.remap_matrix1_);
1660  remap_matrix2_.swap(str_sv.remap_matrix2_);
1661 }
1662 
1663 //---------------------------------------------------------------------
1664 
1665 template<class CharType, class BV, unsigned STR_SIZE>
1667  size_type idx, const value_type* str)
1668 {
1669  if (idx >= this->size())
1670  this->size_ = idx+1;
1671  set_value(idx, str);
1672 }
1673 
1674 //---------------------------------------------------------------------
1675 
1676 template<class CharType, class BV, unsigned STR_SIZE>
1678  size_type idx, const value_type* str)
1679 {
1680  if (idx >= this->size())
1681  {
1682  this->size_ = idx+1;
1683  set_value(idx, str);
1684  return;
1685  }
1686  insert_value(idx, str);
1687  this->size_++;
1688 }
1689 
1690 //---------------------------------------------------------------------
1691 
1692 template<class CharType, class BV, unsigned STR_SIZE>
1694  size_type idx2)
1695 {
1696  BM_ASSERT(idx1 < this->size());
1697  BM_ASSERT(idx2 < this->size());
1698 
1699  this->swap_elements(idx1, idx2);
1700 }
1701 
1702 //---------------------------------------------------------------------
1703 
1704 template<class CharType, class BV, unsigned STR_SIZE>
1706 {
1707  BM_ASSERT(idx < this->size_);
1708  if (idx >= this->size_)
1709  return;
1710  this->erase_column(idx, true);
1711  this->size_--;
1712 }
1713 
1714 //---------------------------------------------------------------------
1715 
1716 template<class CharType, class BV, unsigned STR_SIZE>
1718 {
1719  if (idx >= this->size_)
1720  this->size_ = idx + 1; // assumed nothing todo outside current size
1721  else
1722  this->bmatr_.clear_column(idx, 0);
1723 }
1724 //---------------------------------------------------------------------
1725 
1726 template<class CharType, class BV, unsigned STR_SIZE>
1728 {
1729  BM_ASSERT(count);
1730  BM_ASSERT(bm::id_max - count > this->size_);
1731  BM_ASSERT(this->is_nullable());
1732 
1733  this->size_ += count;
1734 }
1735 
1736 //---------------------------------------------------------------------
1737 
1738 template<class CharType, class BV, unsigned STR_SIZE>
1740  size_type idx, const value_type* str)
1741 {
1742  set_value_no_null(idx, str);
1743  if (bvector_type* bv_null = this->get_null_bvect())
1744  bv_null->set_bit_no_check(idx);
1745 }
1746 
1747 //---------------------------------------------------------------------
1748 
1749 template<class CharType, class BV, unsigned STR_SIZE>
1751  size_type idx, const value_type* str)
1752 {
1753  for (unsigned i = 0; true; ++i)
1754  {
1755  CharType ch = str[i];
1756  if (!ch)
1757  {
1758  this->clear_value_planes_from(i*8, idx);
1759  return;
1760  }
1761  if (remap_flags_) // compressional re-mapping is in effect
1762  {
1763  auto r = remap_matrix2_.rows();
1764  if (i >= r)
1765  {
1766  remap_matrix1_.resize(i + 1, remap_matrix1_.cols(), true);
1767  remap_matrix2_.resize(i + 1, remap_matrix2_.cols(), true);
1768  }
1769  unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
1770  BM_ASSERT(remap_value);
1771  if (!remap_value) // unknown dictionary element
1772  {
1773  this->clear_value_planes_from(i*8, idx);
1774  return;
1775  }
1776  ch = CharType(remap_value);
1777  }
1778  this->bmatr_.set_octet(idx, i, (unsigned char)ch);
1779  } // for i
1780 }
1781 
1782 //---------------------------------------------------------------------
1783 
1784 template<class CharType, class BV, unsigned STR_SIZE>
1786  size_type idx, const value_type* str)
1787 {
1788  insert_value_no_null(idx, str);
1789  this->insert_null(idx, true);
1790 }
1791 
1792 //---------------------------------------------------------------------
1793 
1794 template<class CharType, class BV, unsigned STR_SIZE>
1796  size_type idx, const value_type* str)
1797 {
1798  for (unsigned i = 0; true; ++i)
1799  {
1800  CharType ch = str[i];
1801  if (!ch)
1802  {
1803  this->insert_clear_value_planes_from(i*8, idx);
1804  return;
1805  }
1806 
1807  if (remap_flags_) // compressional re-mapping is in effect
1808  {
1809  unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
1810  BM_ASSERT(remap_value);
1811  if (!remap_value) // unknown dictionary element
1812  {
1813  this->insert_clear_value_planes_from(i*8, idx);
1814  return;
1815  }
1816  ch = CharType(remap_value);
1817  }
1818  this->bmatr_.insert_octet(idx, i, (unsigned char)ch);
1819  } // for i
1820 }
1821 
1822 
1823 //---------------------------------------------------------------------
1824 
1825 template<class CharType, class BV, unsigned STR_SIZE>
1828  size_type idx, value_type* str, size_type buf_size) const BMNOEXCEPT
1829 {
1830  size_type i = 0;
1831  for (; true; ++i)
1832  {
1833  if (i >= buf_size)
1834  break;
1835  CharType ch = CharType(this->bmatr_.get_octet(idx, i));
1836  str[i] = ch;
1837  if (!ch)
1838  break;
1839  }
1840  if (remap_flags_)
1841  remap_matrix1_.remap(str, i);
1842  return i;
1843 }
1844 
1845 //---------------------------------------------------------------------
1846 
1847 template<class CharType, class BV, unsigned STR_SIZE>
1849  bm::word_t* temp_block,
1850  typename bvector_type::optmode opt_mode,
1852 {
1853  typename bvector_type::statistics stbv;
1854  parent_type::optimize(temp_block, opt_mode, &stbv);
1855 
1856  if (st)
1857  st->add(stbv);
1858 }
1859 
1860 //---------------------------------------------------------------------
1861 
1862 template<class CharType, class BV, unsigned STR_SIZE>
1865  ) const BMNOEXCEPT
1866 {
1867  BM_ASSERT(st);
1868  typename bvector_type::statistics stbv;
1869  parent_type::calc_stat(&stbv);
1870 
1871  st->reset();
1872 
1873  st->bit_blocks += stbv.bit_blocks;
1874  st->gap_blocks += stbv.gap_blocks;
1875  st->ptr_sub_blocks += stbv.ptr_sub_blocks;
1876  st->bv_count += stbv.bv_count;
1877  st->max_serialize_mem += stbv.max_serialize_mem + 8;
1878  st->memory_used += stbv.memory_used;
1879  st->gap_cap_overhead += stbv.gap_cap_overhead;
1880 
1881  size_t remap_mem_usage = sizeof(remap_flags_);
1882  remap_mem_usage += remap_matrix1_.get_buffer().mem_usage();
1883  remap_mem_usage += remap_matrix2_.get_buffer().mem_usage();
1884 
1885  st->memory_used += remap_mem_usage;
1886  if (remap_flags_) // use of remapping requires some extra storage
1887  {
1888  st->max_serialize_mem += (remap_mem_usage * 2);
1889  }
1890 }
1891 
1892 //---------------------------------------------------------------------
1893 
1894 template<class CharType, class BV, unsigned STR_SIZE>
1896  const value_type* str1, const value_type* str2) BMNOEXCEPT
1897 {
1898  BM_ASSERT(str1 && str2);
1899  int res = 0;
1900  for (unsigned i = 0; true; ++i)
1901  {
1902  CharType octet2 = str2[i];
1903  CharType octet1 = str1[i];
1904  if (!octet1)
1905  {
1906  res = -octet2; // -1 || 0
1907  break;
1908  }
1909  res = (octet1 > octet2) - (octet1 < octet2);
1910  if (res || !octet2)
1911  break;
1912  } // for i
1913  return res;
1914 }
1915 
1916 //---------------------------------------------------------------------
1917 
1918 template<class CharType, class BV, unsigned STR_SIZE>
1920  const value_type* str1, const value_type* str2, size_t min_len) BMNOEXCEPT
1921 {
1922  BM_ASSERT(str1 && str2);
1923 
1924  int res = 0;
1925  size_t i = 0;
1926 
1927  CharType octet2, octet1;
1928  if (min_len >= 4)
1929  {
1930  for (; i < min_len-3; i+=4)
1931  {
1932  unsigned i2, i1;
1933  ::memcpy(&i2, &str2[i], sizeof(i2));
1934  ::memcpy(&i1, &str1[i], sizeof(i1));
1936  if (i1 != i2)
1937  {
1938  octet2 = str2[i];
1939  octet1 = str1[i];
1940  res = (octet1 > octet2) - (octet1 < octet2);
1941  if (res)
1942  return res;
1943  octet2 = str2[i+1];
1944  octet1 = str1[i+1];
1945  res = (octet1 > octet2) - (octet1 < octet2);
1946  if (res)
1947  return res;
1948  octet2 = str2[i+2];
1949  octet1 = str1[i+2];
1950  res = (octet1 > octet2) - (octet1 < octet2);
1951  if (res)
1952  return res;
1953  octet2 = str2[i+3];
1954  octet1 = str1[i+3];
1955  res = (octet1 > octet2) - (octet1 < octet2);
1956  if (res)
1957  return res;
1958  BM_ASSERT(0);
1959  break;
1960  }
1961  } // for i
1962  }
1963 
1964 
1965  for (; i < min_len; ++i)
1966  {
1967  octet2 = str2[i];
1968  octet1 = str1[i];
1969  BM_ASSERT(octet1 && octet2);
1970  res = (octet1 > octet2) - (octet1 < octet2);
1971  if (res)
1972  return res;
1973  } // for i
1974 
1975  for (; true; ++i)
1976  {
1977  octet2 = str2[i];
1978  octet1 = str1[i];
1979  if (!octet1)
1980  {
1981  res = -octet2; // -1 || 0
1982  break;
1983  }
1984  res = (octet1 > octet2) - (octet1 < octet2);
1985  if (res || !octet2)
1986  break;
1987  } // for i
1988  return res;
1989 }
1990 
1991 
1992 //---------------------------------------------------------------------
1993 
1994 template<class CharType, class BV, unsigned STR_SIZE>
1996  size_type idx, const value_type* str) const BMNOEXCEPT
1997 {
1998  BM_ASSERT(str);
1999  BM_ASSERT(is_remap()); // MUST guarantee remapping
2000 
2001  int res = 0;
2002  for (unsigned i = 0; true; ++i)
2003  {
2004  CharType octet2 = str[i];
2005  CharType octet1 = (CharType)this->bmatr_.get_octet(idx, i);
2006  if (!octet1)
2007  {
2008  res = -octet2; // -1 || 0
2009  break;
2010  }
2011  const unsigned char* remap_row = remap_matrix1_.row(i);
2012  CharType remap_value1 = (CharType)remap_row[unsigned(octet1)];
2013  BM_ASSERT(remap_value1);
2014  res = (remap_value1 > octet2) - (remap_value1 < octet2);
2015  if (res || !octet2)
2016  break;
2017  } // for i
2018  return res;
2019 }
2020 
2021 //---------------------------------------------------------------------
2022 
2023 template<class CharType, class BV, unsigned STR_SIZE>
2025  const value_type* str) const BMNOEXCEPT
2026 {
2027  BM_ASSERT(str);
2028  BM_ASSERT(!is_remap()); // MUST guarantee remapping
2029 
2030  int res = 0;
2031  for (unsigned i = 0; true; ++i)
2032  {
2033  CharType octet2 = str[i];
2034  CharType octet1 = (CharType)this->bmatr_.get_octet(idx, i);
2035  if (!octet1)
2036  {
2037  res = -octet2; // -1 || 0
2038  break;
2039  }
2040  res = (octet1 > octet2) - (octet1 < octet2);
2041  if (res || !octet2)
2042  break;
2043  } // for i
2044  return res;
2045 }
2046 
2047 
2048 //---------------------------------------------------------------------
2049 
2050 template<class CharType, class BV, unsigned STR_SIZE>
2052  size_type idx,
2053  const value_type* str) const BMNOEXCEPT
2054 {
2055  BM_ASSERT(str);
2056  int res = remap_flags_ ? compare_remap(idx, str)
2057  : compare_nomap(idx, str);
2058  return res;
2059 }
2060 
2061 //---------------------------------------------------------------------
2062 
2063 template<class CharType, class BV, unsigned STR_SIZE>
2065  size_type idx1,
2066  size_type idx2) const BMNOEXCEPT
2067 {
2068  BM_ASSERT(idx1 < size() && idx2 < size());
2069  int res = 0;
2070  if (idx1 == idx2)
2071  return 0;
2072  if (remap_flags_)
2073  {
2074  for (unsigned i = 0; true; ++i)
2075  {
2076  // TODO: implement function to return two octets at once
2077  CharType octet2 = (CharType)this->bmatr_.get_octet(idx2, i);
2078  CharType octet1 = (CharType)this->bmatr_.get_octet(idx1, i);
2079  if (!octet1)
2080  {
2081  res = -octet2; // -1 || 0
2082  break;
2083  }
2084  const unsigned char* remap_row = remap_matrix1_.row(i);
2085  unsigned char remap_value1 = remap_row[unsigned(octet1)];
2086  //BM_ASSERT(remap_value1);
2087  unsigned char remap_value2 = remap_row[unsigned(octet2)];
2088  //BM_ASSERT(remap_value2);
2089  res = (remap_value1 > remap_value2) - (remap_value1 < remap_value2);
2090  if (res || !octet2)
2091  break;
2092  } // for i
2093  }
2094  else
2095  {
2096  for (unsigned i = 0; true; ++i)
2097  {
2098  CharType octet2 = (CharType)this->bmatr_.get_octet(idx2, i);
2099  CharType octet1 = (CharType)this->bmatr_.get_octet(idx1, i);
2100  if (!octet1)
2101  {
2102  res = -octet2; // -1 || 0
2103  break;
2104  }
2105  res = (octet1 > octet2) - (octet1 < octet2);
2106  if (res || !octet2)
2107  break;
2108  } // for i
2109  }
2110  return res;
2111 
2112 }
2113 
2114 //---------------------------------------------------------------------
2115 
2116 template<class CharType, class BV, unsigned STR_SIZE>
2117 template<bool USE_PREFIX_BUF>
2119  size_type idx1, size_type idx2,
2120  value_type* prefix_buf) const BMNOEXCEPT
2121 {
2122  BM_ASSERT (!(prefix_buf && !USE_PREFIX_BUF));
2123  unsigned i = 0;
2124  CharType ch1 = CharType(this->bmatr_.get_octet(idx1, i));
2125  CharType ch2 = CharType(this->bmatr_.get_octet(idx2, i));
2126  if (ch1 == ch2 && (ch1|ch2))
2127  {
2128  if constexpr(USE_PREFIX_BUF)
2129  {
2130  BM_ASSERT(prefix_buf);
2131  *prefix_buf++ = ch1;
2132  }
2133  for (++i; true; ++i)
2134  {
2135  ch1 = CharType(this->bmatr_.get_octet(idx1, i));
2136  ch2 = CharType(this->bmatr_.get_octet(idx2, i));
2137  if (ch1 != ch2 || (!(ch1|ch2))) // chs not the same or both zero
2138  return i;
2139  if constexpr(USE_PREFIX_BUF)
2140  *prefix_buf++ = ch1;
2141  } // for i
2142  }
2143  return i;
2144 }
2145 
2146 
2147 //---------------------------------------------------------------------
2148 
2149 template<class CharType, class BV, unsigned STR_SIZE>
2150 bool
2152  size_type rank,
2153  size_type& pos) BMNOEXCEPT
2154 {
2155  BM_ASSERT(rank);
2156  pos = rank - 1;
2157  return true;
2158 }
2159 
2160 //---------------------------------------------------------------------
2161 
2162 template<class CharType, class BV, unsigned STR_SIZE>
2166 {
2167  return this->bmatr_.octet_size();
2168 }
2169 
2170 //---------------------------------------------------------------------
2171 
2172 template<class CharType, class BV, unsigned STR_SIZE>
2174  octet_freq_matrix_type& octet_matrix) const
2175 {
2176  size_type max_str_len = effective_max_str();
2177  octet_matrix.resize(max_str_len, 256, false);
2178  octet_matrix.set_zero(); //init(true);
2179  {
2180  const_iterator it(this);
2181  for(; it.valid(); ++it)
2182  {
2183  const value_type* s = *it; // get asciiz char*
2184  if(!s)
2185  continue;
2186  for (unsigned i = 0; true; ++i) // for each char in str
2187  {
2188  value_type ch = s[i];
2189  if (!ch)
2190  break;
2191  typename
2192  octet_freq_matrix_type::value_type* row = octet_matrix.row(i);
2193  unsigned ch_idx = (unsigned char)ch;
2194  row[ch_idx] += 1;
2195  } // for i
2196  } // for it
2197  }
2198 }
2199 
2200 //---------------------------------------------------------------------
2201 
2202 template<class CharType, class BV, unsigned STR_SIZE>
2204  slice_octet_matrix_type& octet_remap_matrix1,
2205  slice_octet_matrix_type& octet_remap_matrix2,
2206  octet_freq_matrix_type& octet_occupancy_matrix) const
2207 {
2208  size_type max_str_len = effective_max_str();
2209  octet_remap_matrix1.resize(max_str_len, 256, false);
2210  octet_remap_matrix1.set_zero();
2211  octet_remap_matrix2.resize(max_str_len, 256, false);
2212  octet_remap_matrix2.set_zero();
2213 
2214  for (unsigned i = 0; i < octet_occupancy_matrix.rows(); ++i)
2215  {
2216  typename octet_freq_matrix_type::value_type* frq_row =
2217  octet_occupancy_matrix.row(i);
2218 
2219  unsigned char* remap_row1 = octet_remap_matrix1.row(i);
2220  unsigned char* remap_row2 = octet_remap_matrix2.row(i);
2221 
2222  const typename slice_octet_matrix_type::size_type row_size =
2223  octet_occupancy_matrix.cols();
2224  for (unsigned remap_code = 1; true; ++remap_code)
2225  {
2226  typename octet_freq_matrix_type::size_type char_idx;
2227  bool found = bm::find_max_nz(frq_row, row_size, &char_idx);
2228  #if 0
2229  bool found = bm::find_first_nz(frq_row, row_size, &char_idx);
2230  #endif
2231  if (!found)
2232  break;
2233  BM_ASSERT(char_idx);
2234  unsigned char ch = (unsigned char)char_idx;
2235  remap_row1[remap_code] = ch;
2236  remap_row2[ch] = (unsigned char)remap_code;
2237  frq_row[char_idx] = 0; // clear the picked char
2238  } // for
2239  } // for i
2240 }
2241 
2242 //---------------------------------------------------------------------
2243 
2244 template<class CharType, class BV, unsigned STR_SIZE>
2246 {
2247  BM_ASSERT(remap_flags_);
2248 
2249  auto rows = remap_matrix1_.rows();
2250  remap_matrix2_.resize(rows, remap_matrix1_.cols(), false);
2251  if (rows)
2252  {
2253  remap_matrix2_.set_zero();
2254  //remap_matrix2_.init(true);
2255  for (unsigned i = 0; i < remap_matrix1_.rows(); ++i)
2256  {
2257  const unsigned char* remap_row1 = remap_matrix1_.row(i);
2258  unsigned char* remap_row2 = remap_matrix2_.row(i);
2259  for (unsigned j = 1; j < remap_matrix1_.cols(); ++j)
2260  {
2261  if (remap_row1[j])
2262  {
2263  unsigned ch_code = remap_row1[j];
2264  remap_row2[ch_code] = (unsigned char)j;
2265  BM_ASSERT(ch_code < 256);
2266  }
2267  } // for j
2268  } // for i
2269  } // if rows
2270 }
2271 
2272 //---------------------------------------------------------------------
2273 
2274 template<class CharType, class BV, unsigned STR_SIZE>
2276  value_type* BMRESTRICT sv_str,
2277  size_type buf_size,
2278  const value_type* BMRESTRICT str,
2279  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT
2280 {
2281  if (!octet_remap_matrix2.rows())
2282  return false;
2283 
2284  const unsigned char* remap_row = octet_remap_matrix2.row(0);
2285  for (unsigned i = 0; i < buf_size; ++i, remap_row += 256)
2286  {
2287  CharType ch = str[i];
2288  if (!ch)
2289  {
2290  sv_str[i] = ch;
2291  break;
2292  }
2293  unsigned char remap_value = remap_row[unsigned(ch)];
2294  sv_str[i] = CharType(remap_value);
2295  if (!remap_value) // unknown dictionary element
2296  return false;
2297  } // for i
2298  return true;
2299 }
2300 
2301 //---------------------------------------------------------------------
2302 
2303 template<class CharType, class BV, unsigned STR_SIZE>
2305  value_type* BMRESTRICT sv_str,
2306  value_type* BMRESTRICT str_cp,
2307  size_type buf_size,
2308  const value_type* BMRESTRICT str,
2309  size_t in_len,
2310  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT
2311 {
2312  BM_ASSERT(in_len <= buf_size); (void) buf_size;
2313 
2314  const unsigned char* remap_row = octet_remap_matrix2.row(0);
2315  for (unsigned i = 0; i < in_len; ++i, remap_row += 256)
2316  {
2317  CharType ch = str[i];
2318  str_cp[i] = value_type(ch);
2319  BM_ASSERT(ch);
2320  unsigned char remap_value = remap_row[unsigned(ch)];
2321  sv_str[i] = CharType(remap_value);
2322  if (!remap_value) // unknown dictionary element
2323  return false;
2324  } // for i
2325  sv_str[in_len] = str_cp[in_len] = 0; // gurantee zero termination
2326  return true;
2327 }
2328 
2329 
2330 //---------------------------------------------------------------------
2331 
2332 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2335  size_type buf_size,
2336  const value_type* BMRESTRICT sv_str,
2337  const slice_octet_matrix_type& BMRESTRICT octet_remap_matrix1
2338  ) BMNOEXCEPT
2339 {
2340  const unsigned char* remap_row = octet_remap_matrix1.row(0);
2341  for (unsigned i = 0; i < buf_size; ++i, remap_row += 256)
2342  {
2343  CharType ch = sv_str[i];
2344  if (!ch)
2345  {
2346  str[i] = ch;
2347  break;
2348  }
2349  unsigned char remap_value = remap_row[unsigned(ch)];
2350  str[i] = CharType(remap_value);
2351  if (!remap_value) // unknown dictionary element
2352  return false;
2353  } // for i
2354  return true;
2355 }
2356 
2357 //---------------------------------------------------------------------
2358 
2359 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2361 {
2363  sv_tmp(this->get_null_support());
2364  sv_tmp.remap_from_impl(*this, 0, true /*move data*/);
2365  sv_tmp.swap(*this);
2366 }
2367 
2368 //---------------------------------------------------------------------
2369 
2370 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2372  back_insert_iterator& iit)
2373 {
2374  if (iit.remap_flags_ && iit.omatrix_.rows())
2375  {
2377  sv_tmp(this->get_null_support());
2378  sv_tmp.remap_from_impl(*this, &iit.omatrix_, true /*move data*/);
2379  sv_tmp.swap(*this);
2380  }
2381  else
2382  remap();
2383 }
2384 
2385 //---------------------------------------------------------------------
2386 
2387 template<class CharType, class BV, unsigned STR_SIZE>
2388 void
2390  const str_sparse_vector& str_sv,
2391  octet_freq_matrix_type* omatrix)
2392 {
2393  remap_from_impl(str_sv, omatrix, false);
2394 }
2395 
2396 //---------------------------------------------------------------------
2397 
2398 template<class CharType, class BV, unsigned STR_SIZE>
2399 void
2401  const str_sparse_vector& str_sv,
2402  octet_freq_matrix_type* omatrix,
2403  bool move_data)
2404 {
2405  const unsigned buffer_size = ins_buf_size; // bm::gap_max_bits; // 65536;
2406 
2407  if (str_sv.is_remap())
2408  {
2409  *this = str_sv;
2410  return;
2411  }
2412 
2413  typename bvector_type::allocator_pool_type pool;
2414  typename
2416  if (move_data)
2417  {
2418  str_sparse_vector& sv = const_cast<str_sparse_vector&>(str_sv);
2419  g1.assign_if_not_set(pool, *this);
2420  g2.assign_if_not_set(pool, sv);
2421 
2422  auto r = sv.get_bmatrix().rows();
2423  pool.set_block_limit(r + 10);
2424  }
2425 
2426  this->clear_all(true);
2427  if (str_sv.empty()) // no content to remap
2428  return;
2429 
2430  octet_freq_matrix_type occ_matrix; // occupancy map
2431  if (!omatrix)
2432  {
2433  str_sv.calc_octet_stat(occ_matrix);
2434  omatrix = &occ_matrix;
2435  }
2436  str_sv.build_octet_remap(remap_matrix1_, remap_matrix2_, *omatrix);
2437  remap_flags_ = 1; // turn ON remapped mode
2438 
2439  typedef bm::dynamic_heap_matrix<CharType, allocator_type> buffer_matrix_type;
2440 
2441  size_type str_len = str_sv.effective_max_str()+1;
2442  buffer_matrix_type cmatr(buffer_size, str_len);
2443  cmatr.init(true); // init and set zero
2444 
2445  for (size_type i{0}, dsize; true; i += dsize)
2446  {
2447  dsize = str_sv.decode(cmatr, i, buffer_size, true);
2448  if (!dsize)
2449  break;
2450  if (move_data && (dsize == ins_buf_size)) // free the src.vect blocks
2451  {
2452  // here const_cast is OK, because we violate cosnt-ness only
2453  // in internal safe cases controlled by the upper level call
2454  //
2455  str_sparse_vector& sv = const_cast<str_sparse_vector&>(str_sv);
2456  sv.clear_range(i, i+dsize-1, false);
2457  }
2458 
2459  this->import(cmatr, i, dsize);
2460  } // for i
2461 
2462  if (bvector_type* bv_null = this->get_null_bvect())
2463  {
2464  if (const bvector_type* bv_null_arg = str_sv.get_null_bvector())
2465  if (move_data)
2466  {
2467  bvector_type* bv = const_cast<bvector_type*>(bv_null_arg);
2468  bv_null->swap(*bv);
2469  }
2470  else
2471  *bv_null = *bv_null_arg;
2472  else
2473  {
2474  // TODO: exception? assert? maybe it is OK...
2475  }
2476  }
2477 
2478 }
2479 
2480 //---------------------------------------------------------------------
2481 
2482 template<class CharType, class BV, unsigned STR_SIZE>
2484 {
2485  if (remap_flags_)
2486  recalc_remap_matrix2();
2487  this->sync_ro();
2488 }
2489 
2490 //---------------------------------------------------------------------
2491 
2492 template<class CharType, class BV, unsigned STR_SIZE>
2495  bm::null_support null_able) const BMNOEXCEPT
2496 {
2497  // at this point both vectors should have the same remap settings
2498  // to be considered "equal".
2499  // Strictly speaking this is incorrect, because re-map and non-remap
2500  // vectors may have the same content
2501 
2502  if (remap_flags_ != sv.remap_flags_)
2503  return false;
2504  if (remap_flags_)
2505  {
2506  // TODO: equal matrix dimention overlap may be not enough
2507  // (check the non-overlap to be zero)
2508  // dimentionality shrink is a result of de-serialization
2509  bool b;
2510  b = remap_matrix1_.equal_overlap(sv.remap_matrix1_);
2511  if (!b)
2512  return b;
2513  b = remap_matrix2_.equal_overlap(sv.remap_matrix2_);
2514  if (!b)
2515  return b;
2516  }
2517  return parent_type::equal(sv, null_able);
2518 }
2519 
2520 //---------------------------------------------------------------------
2521 
2522 template<class CharType, class BV, unsigned STR_SIZE>
2525  size_type left, size_type right,
2526  bm::null_support slice_null)
2527 {
2528  if (left > right)
2529  bm::xor_swap(left, right);
2530  this->clear_all(true);
2531 
2532  remap_flags_ = sv.remap_flags_;
2533  remap_matrix1_ = sv.remap_matrix1_;
2534  remap_matrix2_ = sv.remap_matrix2_;
2535 
2536  this->copy_range_slices(sv, left, right, slice_null);
2537  this->resize(sv.size());
2538 }
2539 
2540 //---------------------------------------------------------------------
2541 
2542 template<class CharType, class BV, unsigned STR_SIZE>
2546 {
2547  size_type arg_size = str_sv.size();
2548  if (this->size_ < arg_size)
2549  resize(arg_size);
2550 
2551  // there is an assumption here that we only need to copy remap flags once
2552  // because we merge matrices with the same remaps
2553  // otherwise - undefined behavior
2554  //
2555  if (remap_flags_ != str_sv.remap_flags_)
2556  {
2557  remap_flags_ = str_sv.remap_flags_;
2558  remap_matrix1_ = str_sv.remap_matrix1_;
2559  remap_matrix2_ = str_sv.remap_matrix2_;
2560  }
2561  bvector_type* bv_null = this->get_null_bvect();
2562 
2563  this->merge_matr(str_sv.bmatr_);
2564 
2565  // our vector is NULL-able but argument is not (assumed all values are real)
2566  if (bv_null && !str_sv.is_nullable())
2567  bv_null->set_range(0, arg_size-1);
2568 
2569  return *this;
2570 
2571 }
2572 
2573 //---------------------------------------------------------------------
2574 
2575 template<class CharType, class BV, unsigned STR_SIZE>
2577  size_type left, size_type right,
2578  bm::null_support slice_null)
2579 {
2580  if (right < left)
2581  bm::xor_swap(left, right);
2582  this->keep_range_no_check(left, right, slice_null);
2583 }
2584 
2585 //---------------------------------------------------------------------
2586 
2587 template<class CharType, class BV, unsigned STR_SIZE>
2590 {
2591  typedef typename
2593  return it_type(this);
2594 }
2595 
2596 //---------------------------------------------------------------------
2597 
2598 template<class CharType, class BV, unsigned STR_SIZE>
2600  bool free_mem, unsigned remap) BMNOEXCEPT
2601 {
2602  parent_type::clear_all(free_mem);
2603  if (remap_flags_ && (remap == 0))
2604  {
2605  remap_flags_ = 0;
2606  remap_matrix1_.free();
2607  remap_matrix2_.free();
2608  }
2609 }
2610 
2611 //---------------------------------------------------------------------
2612 
2613 template<class CharType, class BV, unsigned STR_SIZE>
2615  const char* err_msg)
2616 {
2617 #ifndef BM_NO_STL
2618  throw std::range_error(err_msg);
2619 #else
2620  BM_ASSERT_THROW(false, BM_ERR_RANGE);
2621 #endif
2622 }
2623 
2624 //---------------------------------------------------------------------
2625 
2626 template<class CharType, class BV, unsigned STR_SIZE>
2628  const char* err_msg)
2629 {
2630 #ifndef BM_NO_STL
2631  if (!err_msg)
2632  err_msg = "Unknown/incomparable dictionary character";
2633  throw std::domain_error(err_msg);
2634 #else
2635  BM_ASSERT_THROW(false, BM_BAD_VALUE);
2636 #endif
2637 }
2638 
2639 
2640 //---------------------------------------------------------------------
2641 //
2642 //---------------------------------------------------------------------
2643 
2644 
2645 template<class CharType, class BV, unsigned STR_SIZE>
2647 : sv_(0), substr_from_(0), substr_to_(STR_SIZE),
2648  pos_(bm::id_max), pos_in_buf_(~size_type(0))
2649 {
2650 }
2651 
2652 //---------------------------------------------------------------------
2653 
2654 template<class CharType, class BV, unsigned STR_SIZE>
2657 : sv_(it.sv_),
2658  substr_from_(it.substr_from_), substr_to_(it.substr_to_),
2659  pos_(it.pos_), pos_in_buf_(~size_type(0))
2660 {
2661 }
2662 
2663 //---------------------------------------------------------------------
2664 
2665 template<class CharType, class BV, unsigned STR_SIZE>
2668 : sv_(sv), pos_(sv->empty() ? bm::id_max : 0), pos_in_buf_(~size_type(0))
2669 {
2670  substr_from_ = 0;
2671  substr_to_ = (unsigned) sv_->effective_max_str();
2672  buf_matrix_.resize(n_rows, substr_to_+1);
2673 }
2674 
2675 //---------------------------------------------------------------------
2676 
2677 template<class CharType, class BV, unsigned STR_SIZE>
2681 : sv_(sv), pos_(pos >= sv->size() ? bm::id_max : pos), pos_in_buf_(~size_type(0))
2682 {
2683  substr_from_ = 0;
2684  substr_to_ = (unsigned) sv_->effective_max_str();
2685  buf_matrix_.resize(n_rows, substr_to_+1);
2686 }
2687 
2688 //---------------------------------------------------------------------
2689 
2690 template<class CharType, class BV, unsigned STR_SIZE>
2692  unsigned from, unsigned len) BMNOEXCEPT
2693 {
2694  unsigned max_str = sv_->effective_max_str();
2695  substr_from_ = from;
2696  if (!len)
2697  {
2698  len = 1 + max_str - from;
2699  substr_to_ = from + len;
2700  }
2701  else
2702  {
2703  // TODO: check for overflow
2704  substr_to_ = substr_from_ + (len - 1);
2705  }
2706  if (max_str < substr_to_)
2707  substr_to_ = max_str;
2708  buf_matrix_.resize(n_rows, len+1, false/*no content copy*/);
2709 }
2710 
2711 //---------------------------------------------------------------------
2712 
2713 template<class CharType, class BV, unsigned STR_SIZE>
2716 {
2717  BM_ASSERT(sv_);
2718  BM_ASSERT(this->valid());
2719 
2720  if (pos_in_buf_ == ~size_type(0))
2721  {
2722  if (!buf_matrix_.is_init())
2723  buf_matrix_.init();
2724  pos_in_buf_ = 0;
2725  size_type d = sv_->decode_substr(buf_matrix_, pos_, n_rows,
2726  substr_from_, substr_to_);
2727  if (!d)
2728  {
2729  pos_ = bm::id_max;
2730  return 0;
2731  }
2732  }
2733  if (is_null())
2734  return 0;
2735  return buf_matrix_.row(pos_in_buf_);
2736 }
2737 
2738 //---------------------------------------------------------------------
2739 
2740 template<class CharType, class BV, unsigned STR_SIZE>
2743 {
2744  BM_ASSERT(sv_);
2745  BM_ASSERT(this->valid());
2746 
2747  if (pos_in_buf_ == ~size_type(0))
2748  {
2749  if (!buf_matrix_.is_init())
2750  buf_matrix_.init();
2751  pos_in_buf_ = 0;
2752  size_type d = sv_->decode_substr(buf_matrix_, pos_, n_rows,
2753  substr_from_, substr_to_);
2754  if (!d)
2755  {
2756  pos_ = bm::id_max;
2757  return string_view_type();
2758  }
2759  }
2760  if (is_null())
2761  return string_view_type();
2762  return string_view_type(buf_matrix_.row(pos_in_buf_));
2763 }
2764 
2765 
2766 //---------------------------------------------------------------------
2767 
2768 template<class CharType, class BV, unsigned STR_SIZE>
2769 void
2772  ) BMNOEXCEPT
2773 {
2774  pos_ = (!sv_ || pos >= sv_->size()) ? bm::id_max : pos;
2775  pos_in_buf_ = ~size_type(0);
2776 }
2777 
2778 //---------------------------------------------------------------------
2779 
2780 template<class CharType, class BV, unsigned STR_SIZE>
2781 void
2783 {
2784  if (pos_ == bm::id_max) // nothing to do, we are at the end
2785  return;
2786  ++pos_;
2787 
2788  if (pos_ >= sv_->size())
2789  this->invalidate();
2790  else
2791  {
2792  if (pos_in_buf_ != ~size_type(0))
2793  {
2794  ++pos_in_buf_;
2795  if (pos_in_buf_ >= n_rows)
2796  pos_in_buf_ = ~size_type(0);
2797  }
2798  }
2799 }
2800 
2801 //---------------------------------------------------------------------
2802 //
2803 //---------------------------------------------------------------------
2804 
2805 template<class CharType, class BV, unsigned STR_SIZE>
2807 : sv_(0), bv_null_(0), pos_in_buf_(~size_type(0))
2808 {}
2809 
2810 //---------------------------------------------------------------------
2811 
2812 template<class CharType, class BV, unsigned STR_SIZE>
2815 : sv_(sv), pos_in_buf_(~size_type(0))
2816 {
2817  if (sv)
2818  {
2819  prev_nb_ = sv_->size() >> bm::set_block_shift;
2820  bv_null_ = sv_->get_null_bvect();
2821  unsigned esize = (unsigned) sv_->effective_max_str();
2822  if (esize < STR_SIZE)
2823  esize = STR_SIZE;
2824  buf_matrix_.init_resize(n_buf_size, esize);
2825  }
2826  else
2827  {
2828  bv_null_ = 0; prev_nb_ = 0;
2829  }
2830 }
2831 
2832 //---------------------------------------------------------------------
2833 
2834 template<class CharType, class BV, unsigned STR_SIZE>
2837 : sv_(bi.sv_), bv_null_(bi.bv_null_), buf_matrix_(bi.buf_matrix_.rows(), bi.buf_matrix_.cols()),
2838  pos_in_buf_(~size_type(0)), prev_nb_(bi.prev_nb_), opt_mode_(bi.opt_mode_),
2839  remap_flags_(bi.remap_flags_), omatrix_(bi.omatrix_)
2840 {
2841  BM_ASSERT(bi.empty());
2842 }
2843 
2844 //---------------------------------------------------------------------
2845 
2846 template<class CharType, class BV, unsigned STR_SIZE>
2848 {
2849  this->flush();
2850 }
2851 
2852 //---------------------------------------------------------------------
2853 
2854 template<class CharType, class BV, unsigned STR_SIZE>
2855 bool
2858 {
2859  return (pos_in_buf_ == ~size_type(0) || !sv_);
2860 }
2861 
2862 //---------------------------------------------------------------------
2863 
2864 template<class CharType, class BV, unsigned STR_SIZE>
2866 {
2867  flush_impl();
2868  if (remap_flags_)
2869  {
2870  buf_matrix_.free();
2871  sv_->remap(*this);
2872  remap_flags_ = 0;
2873  }
2874 }
2875 
2876 //---------------------------------------------------------------------
2877 
2878 template<class CharType, class BV, unsigned STR_SIZE>
2880 {
2881  if (this->empty())
2882  return;
2883 
2884  size_type imp_idx = sv_->size();
2885  sv_->import_no_check(buf_matrix_, imp_idx, pos_in_buf_+1, false);
2886  pos_in_buf_ = ~size_type(0);
2887  block_idx_type nb = sv_->size() >> bm::set_block_shift;
2888  if (nb != prev_nb_)
2889  {
2890  sv_->optimize_block(prev_nb_, opt_mode_);
2891  prev_nb_ = nb;
2892  }
2893 }
2894 
2895 
2896 //---------------------------------------------------------------------
2897 
2898 template<class CharType, class BV, unsigned STR_SIZE>
2901 {
2902  if (!v)
2903  {
2904  this->add_null();
2905  return;
2906  }
2907  size_type buf_idx = this->pos_in_buf_; // offset in
2908  size_type sz = sv_->size();
2909 
2910  this->add_value(v);
2911  if (bv_null_)
2912  bv_null_->set_bit_no_check(sz + buf_idx + 1);
2913 }
2914 
2915 //---------------------------------------------------------------------
2916 
2917 template<class CharType, class BV, unsigned STR_SIZE>
2919 {
2920  /*size_type buf_idx = */this->add_value("");
2921 }
2922 
2923 //---------------------------------------------------------------------
2924 
2925 template<class CharType, class BV, unsigned STR_SIZE>
2928 {
2929  for (size_type i = 0; i < count; ++i) // TODO: optimization
2930  this->add_value("");
2931 }
2932 
2933 //---------------------------------------------------------------------
2934 
2935 
2936 template<class CharType, class BV, unsigned STR_SIZE>
2937 void
2940 {
2942 
2943  size_t slen = ::strlen(v);
2944 
2945  auto orows = omatrix_.rows();
2946  if (slen > orows)
2947  {
2948  if (!orows)
2949  {
2950  omatrix_.resize(slen, 256, false);
2951  omatrix_.set_zero();
2952  }
2953  else
2954  {
2955  omatrix_.resize(slen, 256, true);
2956  for (; orows < omatrix_.rows(); ++orows)
2957  {
2958  typename
2959  octet_freq_matrix_type::value_type* r = omatrix_.row(orows);
2960  ::memset(r, 0, 256 * sizeof(r[0]));
2961  } // for orows
2962  }
2963  }
2964  for (size_t i = 0; i < slen; ++i)
2965  {
2966  value_type ch = v[i];
2967  typename
2968  octet_freq_matrix_type::value_type* row = omatrix_.row(i);
2969  unsigned ch_idx = (unsigned char)ch;
2970  row[ch_idx] += 1;
2971  } // for i
2972 }
2973 
2974 //---------------------------------------------------------------------
2975 
2976 template<class CharType, class BV, unsigned STR_SIZE>
2977 void
2980 {
2981  BM_ASSERT(sv_);
2982  BM_ASSERT(v);
2983  BM_ASSERT(buf_matrix_.rows()>0);
2984 
2985  if (pos_in_buf_ >= buf_matrix_.rows()-1)
2986  {
2987  if (pos_in_buf_ == ~size_type(0) && (!buf_matrix_.is_init()))
2988  buf_matrix_.init();
2989  else
2990  this->flush_impl();
2991  pos_in_buf_ = 0; buf_matrix_.set_zero();
2992  }
2993  else
2994  {
2995  ++pos_in_buf_;
2996  }
2997 
2998  if (remap_flags_)
2999  add_remap_stat(v);
3000 
3001  value_type* r = buf_matrix_.row(pos_in_buf_);
3002 
3004  typename buffer_matrix_type::size_type cols = buf_matrix_.cols();
3005  for (i = 0; i < cols; ++i)
3006  {
3007  r[i] = v[i];
3008  if (!r[i])
3009  return;
3010  } // for i
3011 
3012  // string is longer than the initial size, matrix resize is needed
3013  for (cols = i; true; ++cols) // find the new length
3014  {
3015  if (!v[cols])
3016  break;
3017  } // for cols
3018 
3019  // cols is now string length and the new mattrix size parameter
3020  buf_matrix_.resize(buf_matrix_.rows(), cols + 1);
3021 
3022  r = buf_matrix_.row(pos_in_buf_);
3023  cols = buf_matrix_.cols();
3024  for (; i < cols; ++i)
3025  {
3026  r[i] = v[i];
3027  if (!r[i])
3028  return;
3029  } // for i
3030  BM_ASSERT(0);
3031 }
3032 
3033 //---------------------------------------------------------------------
3034 
3035 template<class CharType, class BV, unsigned STR_SIZE>
3037 {
3038  const bvector_type* bv_null = this->get_null_bvector();
3039  if (!bv_null)
3040  return;
3041  bool found = bv_null->find_reverse(this->size_);
3042  this->size_ += found;
3043 }
3044 
3045 //---------------------------------------------------------------------
3046 
3047 } // namespace
3048 
3049 #endif
ncbi::TMaskedQueryRegions mask
Algorithms for bvector<> (main include)
basic bit-matrix class and utilities
Constants, lookup tables and typedefs.
Definitions(internal)
#define BMRESTRICT
Definition: bmdef.h:203
#define BMNOEXCEPT
Definition: bmdef.h:82
#define BM_ASSERT
Definition: bmdef.h:139
#define BM_ASSERT_THROW(x, xerrcode)
Definition: bmdef.h:338
#define BMNOEXCEPT2
Definition: bmdef.h:108
Utilities for bit transposition (internal) (experimental!)
#define true
Definition: bool.h:35
#define bool
Definition: bool.h:34
void assign_if_not_set(POOL &pool, PCLASS &obj) BMNOEXCEPT
check if vector has no assigned allocator and set one
Definition: bmblocks.h:2914
Base class for bit-transposed(bit-sliced) sparse vector construction.
Definition: bmbmatrix.h:352
void resize(size_type new_size, bool set_null)
Definition: bmbmatrix.h:1812
const bmatrix_type & get_bmatrix() const noexcept
Definition: bmbmatrix.h:553
void copy_from(const base_sparse_vector< CharType, BV, MAX_SIZE > &bsv)
Definition: bmbmatrix.h:1663
void clear_range(size_type left, size_type right, bool set_null)
Definition: bmbmatrix.h:1776
bvector_type_ptr get_create_slice(unsigned i)
get access to bit-plain, function checks and creates a plane
Definition: bmbmatrix.h:1851
bool is_nullable() const noexcept
check if container supports NULL(unassigned) values
Definition: bmbmatrix.h:439
bool is_null(size_type idx) const noexcept
test if specified element is NULL
Definition: bmbmatrix.h:1830
bmatrix_type bmatr_
bit-transposed matrix
Definition: bmbmatrix.h:701
void swap(base_sparse_vector< CharType, BV, MAX_SIZE > &bsv) noexcept
Definition: bmbmatrix.h:1744
void bit_and_rows(const bvector_type &bv)
Set AND (intersect) operation on all existing bit-slices.
Definition: bmbmatrix.h:672
std::make_unsigned< value_type >::type unsigned_value_type
Definition: bmbmatrix.h:376
void bit_sub_rows(const bvector_type &bv, bool use_null)
Set SUB (MINUS) operation on all existing bit-slices.
Definition: bmbmatrix.h:665
const bvector_type * get_null_bvector() const noexcept
Get bit-vector of assigned values or NULL (if not constructed that way)
Definition: bmbmatrix.h:451
void clear_value_planes_from(unsigned plane_idx, size_type idx)
Definition: bmbmatrix.h:1934
Basic dense bit-matrix class.
Definition: bmbmatrix.h:56
unsigned char get_octet(size_type pos, size_type octet_idx) const noexcept
Definition: bmbmatrix.h:1295
size_type rows_not_null() const noexcept
Definition: bmbmatrix.h:143
void allocate_rows(size_type rsize)
allocate matrix rows of bit-vectors (new rows are NULLs)
Definition: bmbmatrix.h:880
bvector_type_const_ptr get_row(size_type i) const noexcept
Definition: bmbmatrix.h:777
void set_octet(size_type pos, size_type octet_idx, unsigned char octet)
Definition: bmbmatrix.h:1193
size_type rows() const noexcept
Definition: bmbmatrix.h:140
unsigned char * data() noexcept
Get write access to buffer memory.
Definition: bmbuffer.h:60
size_t size() const noexcept
Get buffer size.
Definition: bmbuffer.h:54
const unsigned char * buf() const noexcept
Get read access to buffer memory.
Definition: bmbuffer.h:57
const value_type * row(size_type row_idx) const noexcept
Definition: bmbuffer.h:802
buffer_type & get_buffer() noexcept
Get low-level buffer access.
Definition: bmbuffer.h:869
void swap(dynamic_heap_matrix &other) noexcept
Definition: bmbuffer.h:852
value_type get(size_type row_idx, size_type col_idx) noexcept
Definition: bmbuffer.h:819
size_type rows() const noexcept
Definition: bmbuffer.h:756
size_type cols() const noexcept
Definition: bmbuffer.h:757
void remapz(VECT_TYPE *vect) const noexcept
Definition: bmbuffer.h:934
void set_zero() noexcept
memset all buffer to all zeroes
Definition: bmbuffer.h:844
void init_resize(size_type rows_in, size_type cols_in)
Definition: bmbuffer.h:773
void resize(size_type rows_in, size_type cols_in, bool copy_content=true)
Definition: bmbuffer.h:759
void init(bool set_z=false)
Post construction allocation, initialization.
Definition: bmbuffer.h:740
value_type * data() const noexcept
Definition: bmbuffer.h:381
void resize(size_type new_size)
vector resize
Definition: bmbuffer.h:462
sparse vector de-serializer
Serialize sparse vector into a memory buffer(s) structure.
Back insert iterator implements buffered insert, faster than generic access assignment.
bvector_type * bv_null_
!< pointer on the parent vector
back_insert_iterator & operator*()
noop
void add_remap_stat(const value_type *v)
account new value as remap statistics
void add(const value_type *v)
add value to the container
octet_freq_matrix_type omatrix_
octet frequency matrix
void add_value(const value_type *v)
add value to the buffer without changing the NULL vector
back_insert_iterator & operator++()
noop
void flush()
flush the accumulated buffer.
void set_remap(bool flag) noexcept
Method to configure back inserter to collect statistics on optimal character codes.
buffer_matrix_type buf_matrix_
!< not NULL vector pointer
back_insert_iterator & operator++(int)
noop
str_sparse_vector_type * str_sparse_vector_type_ptr
bvector_type::block_idx_type block_idx_type
str_sparse_vector_type::value_type value_type
str_sparse_vector_type::size_type size_type
void add_null()
add NULL (no-value) to the container
bvector_type::allocator_type allocator_type
back_insert_iterator & operator=(const value_type *v)
push value to the vector
bvector_type::optmode opt_mode_
!< previous block added
bm::dynamic_heap_matrix< CharType, allocator_type > buffer_matrix_type
block_idx_type prev_nb_
!< buffer position
str_sparse_vector< CharType, BV, STR_SIZE > str_sparse_vector_type
unsigned get_remap() const noexcept
Get curent remap state flags.
void set_optimize(typename bvector_type::optmode opt_mode) noexcept
Set optimization on load option (deafult: false)
const octet_freq_matrix_type & get_octet_matrix() const noexcept
Get octet frequence matrix.
void add_null(size_type count)
add a series of consequitve NULLs (no-value) to the container
allocator_type::allocator_pool_type allocator_pool_type
bool empty() const noexcept
return true if insertion buffer is empty
back_insert_iterator & operator=(const StrType &v)
push value to the vector
str_sparse_vector_type::bvector_type bvector_type
Const iterator to do quick traverse of the sparse vector.
str_sparse_vector_type * str_sparse_vector_type_ptr
std::input_iterator_tag iterator_category
dynamic_heap_matrix< CharType, allocator_type > buffer_matrix_type
bool operator==(const const_iterator &it) const noexcept
bool operator!=(const const_iterator &it) const noexcept
const value_type * operator*() const
Get current position (value)
bool operator<=(const const_iterator &it) const noexcept
const value_type * value() const
Get zero terminated string value at the current position.
allocator_type::allocator_pool_type allocator_pool_type
size_type pos_in_buf_
!< decode value buffer
const_iterator() noexcept
Construct iterator (not attached to any particular vector)
bvector_type::allocator_type allocator_type
void invalidate() noexcept
Invalidate current iterator.
buffer_matrix_type buf_matrix_
!< Position
size_type pos() const noexcept
Current position (index) in the vector.
unsigned substr_from_
!< ptr to parent
unsigned substr_to_
!< substring from
bool is_null() const noexcept
Get NULL status.
str_sparse_vector< CharType, BV, STR_SIZE > str_sparse_vector_type
void go_to(size_type pos) noexcept
re-position to a specified position
str_sparse_vector_type::bvector_type bvector_type
void set_substr(unsigned from, unsigned len=0) noexcept
setup iterator to retrieve a sub-string of a string
bool operator>=(const const_iterator &it) const noexcept
const str_sparse_vector_type * sv_
bool operator<(const const_iterator &it) const noexcept
str_sparse_vector_type::size_type size_type
str_sparse_vector_type::value_type value_type
string_view_type get_string_view() const
Get current string as string_view.
void advance() noexcept
advance iterator forward by one
bool valid() const noexcept
Returns true if iterator is at a valid position.
bool operator>(const const_iterator &it) const noexcept
const_iterator & operator++(int) noexcept
Advance to the next available value.
std::basic_string_view< CharType > string_view_type
const_iterator & operator++() noexcept
Advance to the next available value.
Reference class to access elements via common [] operator.
const_reference(const str_sparse_vector< CharType, BV, STR_SIZE > &str_sv, size_type idx)
const str_sparse_vector< CharType, BV, STR_SIZE > & str_sv_
const value_type * get() const noexcept
bool operator==(const const_reference &ref) const noexcept
bm::heap_vector< CharType, typename bvector_type::allocator_type, true > bufffer_type
Reference class to access elements via common [] operator.
str_sparse_vector< CharType, BV, STR_SIZE > & str_sv_
reference(str_sparse_vector< CharType, BV, STR_SIZE > &str_sv, size_type idx)
bool is_null() const noexcept
reference & operator=(const value_type *str)
const value_type * get() const noexcept
bool operator==(const reference &ref) const noexcept
reference & operator=(const reference &ref)
succinct sparse vector for strings with compression using bit-slicing ( transposition) method
void insert(size_type idx, const value_type *str)
insert the specified element
static bool remap_n_tosv_2way(value_type *sv_str, value_type *str_cp, size_type buf_size, const value_type *str, size_t in_len, const slice_octet_matrix_type &octet_remap_matrix2) noexcept
void calc_octet_stat(octet_freq_matrix_type &octet_matrix) const
void clear() noexcept
resize to zero, free memory, reset remapping
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename str_sparse_vector< CharType, BV, STR_SIZE >::statistics *stat=0)
run memory optimization for all vector planes
size_type get(size_type idx, value_type *str, size_type buf_size) const noexcept
get specified element
void set_value(size_type idx, const value_type *str)
set value without checking boundaries
void resize(size_type sz)
resize vector
bool empty() const
return true if vector is empty
static bool remap_tosv(value_type *sv_str, size_type buf_size, const value_type *str, const slice_octet_matrix_type &octet_remap_matrix2) noexcept
const_iterator get_const_iterator(size_type idx) const noexcept
Get const_itertor re-positioned to specific element.
int compare_remap(size_type idx, const value_type *str) const noexcept
Variant of compare for remapped vectors.
static constexpr bool is_str() noexcept
bool is_ro() const noexcept
Returns true if vector is read-only.
static size_type max_str()
get maximum string length capacity
unsigned char * init_remap_buffer()
void set_null(size_type idx)
set NULL status for the specified element Vector is resized automatically
void set_null(const bvector_type &bv_idx)
Set NULL all elements set as 1 in the argument vector.
void remap_from_impl(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix, bool move_data)
Remap from implementation, please note that move_data flag can violate cosnt-ness.
void sync(bool force)
syncronize internal structures
slice_octet_matrix_type remap_matrix1_
octet remap table 1
const bvector_type * bvector_type_const_ptr
str_sparse_vector< CharType, BV, STR_SIZE > & merge(str_sparse_vector< CharType, BV, STR_SIZE > &str_sv)
merge with another sparse vector using OR operation Merge is different from join(),...
void assign(size_type idx, const StrType &str)
set specified element with bounds checking and automatic resize
bool remap_tosv(value_type *sv_str, size_type buf_size, const value_type *str) const noexcept
str_sparse_vector(bm::null_support null_able=bm::no_null, allocation_policy_type ap=allocation_policy_type(), size_type bv_max_size=bm::id_max, const allocator_type &alloc=allocator_type())
Sparse vector constructor.
bm::dynamic_heap_matrix< unsigned char, allocator_type > slice_octet_matrix_type
Matrix of character remappings.
void remap_from(const str_sparse_vector &str_sv, octet_freq_matrix_type *omatrix=0)
Build remapping profile and load content from another sparse vector Remapped vector likely saves memo...
void sync_size() noexcept
recalculate size to exclude tail NULL elements After this call size() will return the true size of th...
size_type size() const
return size of the vector
void build_octet_remap(slice_octet_matrix_type &octet_remap_matrix1, slice_octet_matrix_type &octet_remap_matrix2, octet_freq_matrix_type &octet_occupancy_matrix) const
void keep(const bvector_type &bv_idx)
Set NULL all elements NOT set as 1 in the argument vector.
void import_char_slice(const unsigned_value_type *ch_slice, unsigned ch_acc, size_type char_slice_idx, size_type idx_from, size_type imp_size)
bool is_remap() const noexcept
Get character remapping status (true | false)
bm::basic_bmatrix< BV > bmatrix_type
unsigned common_prefix_length(size_type idx1, size_type idx2, value_type *prefix_buf=0) const noexcept
Find size of common prefix between two vector elements in octets.
void insert_value(size_type idx, const value_type *str)
insert value without checking boundaries
void push_back(const StrType &str)
push back a string
void import_no_check(CharMatrix &cmatr, size_type idx_from, size_type imp_size, bool set_not_null=true)
bvector_type::allocation_policy allocation_policy_type
void clear(const bvector_type &bv_idx)
Set vector elements spcified by argument bit-vector to empty Note that set to empty elements are NOT ...
void freeze()
Turn sparse vector into immutable mode Read-only (immutable) vector uses less memory and allows faste...
BV::allocator_type allocator_type
void insert_value_no_null(size_type idx, const value_type *str)
insert value without checking boundaries or support of NULL
bm::dynamic_heap_matrix< size_t, allocator_type > octet_freq_matrix_type
Matrix of character frequencies (for optimal code remap)
slice_octet_matrix_type remap_matrix2_
octet remap table 2
bool resolve_range(size_type from, size_type to, size_type *idx_from, size_type *idx_to) const
parent_type::unsigned_value_type unsigned_value_type
void import_back(CharMatrix &cmatr, size_type imp_size)
Bulk push-back import of strings from a C-style matrix of chars.
static int compare_str(const value_type *str1, const value_type *str2) noexcept
size_type decode(CharMatrix &cmatr, size_type idx_from, size_type dec_size, bool zero_mem=true) const
Bulk export strings to a C-style matrix of chars.
void resize_internal(size_type sz)
bool try_get(size_type idx, StrType &str) const
get specified string element if NOT NULL Template method expects an STL-compatible type basic_string<...
bool remap_n_tosv_2way(value_type *sv_str, value_type *str_cp, size_type buf_size, const value_type *str, size_t in_len) const noexcept
void erase(size_type idx)
erase the specified element
size_type effective_size() const noexcept
size of sparse vector (may be different for RSC)
const unsigned char * get_remap_buffer() const
bvector_type * bvector_type_ptr
void insert(size_type idx, const StrType &str)
insert STL string
static void throw_bad_value(const char *err_msg)
throw domain error
void get(size_type idx, StrType &str) const
get specified string element Template method expects an STL-compatible type basic_string<>
reference operator[](size_type idx)
Operator to get write access to an element.
void push_back(const value_type *str)
push back a string (zero terminated)
str_sparse_vector< CharType, BV, STR_SIZE > & operator=(const str_sparse_vector< CharType, BV, STR_SIZE > &str_sv)
static constexpr bool is_compressed() noexcept
various type traits
size_type decode_substr(CharMatrix &cmatr, size_type idx_from, size_type dec_size, unsigned substr_from, unsigned substr_to, bool zero_mem=true) const
Bulk export strings to a C-style matrix of chars.
const remap_matrix_type * get_remap_matrix() const
base_sparse_vector< CharType, BV, STR_SIZE > parent_type
void push_back_null()
push back NULL value
size_type effective_vector_max() const
get effective string length used in vector
static bool remap_fromsv(value_type *str, size_type buf_size, const value_type *sv_str, const slice_octet_matrix_type &octet_remap_matrix1) noexcept
int compare_nomap(size_type idx, const value_type *str) const noexcept
Variant of compare for non-mapped vectors.
void remap()
Build remapping profile and re-load content to save memory.
back_insert_iterator get_back_inserter()
Provide back insert iterator Back insert iterator implements buffered insertion, which is faster,...
const_iterator begin() const noexcept
Provide const iterator access to container content.
void calc_stat(struct str_sparse_vector< CharType, BV, STR_SIZE >::statistics *st) const noexcept
Calculates memory statistics.
slice_octet_matrix_type remap_matrix_type
bvector_type::enumerator bvector_enumerator_type
void set_value_no_null(size_type idx, const value_type *str)
set value without checking boundaries or support of NULL
void copy_range(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, size_type left, size_type right, bm::null_support slice_null=bm::use_null)
copy range of values from another sparse vector
size_t remap_size() const
bool equal(const str_sparse_vector< CharType, BV, STR_SIZE > &sv, bm::null_support null_able=bm::use_null) const noexcept
check if another sparse vector has the same content and size
static void throw_range_error(const char *err_msg)
throw range error
str_sparse_vector< CharType, BV, STR_SIZE > & clear_range(size_type left, size_type right, bool set_null=false)
clear range (assign bit 0 for all planes)
void swap(size_type idx1, size_type idx2)
swap two vector elements between each other
size_type size_internal() const
void set(size_type idx, const value_type *str)
set specified element with bounds checking and automatic resize
const_iterator end() const noexcept
Provide const iterator access to the end.
unsigned remap_flags_
remapping status
size_type effective_max_str() const noexcept
get effective string length used in vector Calculate and returns efficiency, how close are we to the ...
static bool find_rank(size_type rank, size_type &pos) noexcept
find position of compressed element by its rank
remap_matrix_type * get_remap_matrix()
int compare(size_type idx, const value_type *str) const noexcept
Compare vector element with argument lexicographically.
const const_reference operator[](size_type idx) const
Operator to get read access to an element.
void keep_range(size_type left, size_type right, bm::null_support slice_null=bm::use_null)
Keep only specified interval in the sparse vector, clear all other elements.
str_sparse_vector(str_sparse_vector< CharType, BV, STR_SIZE > &&str_sv) noexcept
allocator_type::allocator_pool_type allocator_pool_type
bvector_type::size_type size_type
void clear_all(bool free_mem, unsigned remap=0) noexcept
resize to zero, free memory
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
bm::id_t word_bitcount(bm::id_t w) noexcept
Definition: bmutil.h:582
unsigned bit_list(T w, B *bits) noexcept
Unpacks word into list of ON bit indexes.
Definition: bmfunc.h:595
null_support
NULL-able value support.
Definition: bmconst.h:229
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:230
@ no_null
do not support NULL values
Definition: bmconst.h:231
int i
int len
#include<zmmintrin.h>
Definition: bm.h:78
const unsigned id_max
Definition: bmconst.h:109
void xor_swap(W &x, W &y) noexcept
XOR swap two variables.
Definition: bmutil.h:534
unsigned int word_t
Definition: bmconst.h:39
int for_each_bit_range_no_check(const BV &bv, typename BV::size_type left, typename BV::size_type right, Func &bit_functor)
Implementation of for_each_bit_range without boilerplave checks.
Definition: bmalgo_impl.h:1925
bool find_first_nz(const VT *arr, SZ arr_size, SZ *found_idx) noexcept
Find max non-zero value in an array.
Definition: bmfunc.h:10135
bool find_max_nz(const VT *arr, SZ arr_size, SZ *found_idx) noexcept
Find max non-zero value in an array.
Definition: bmfunc.h:10114
unsigned long long int id64_t
Definition: bmconst.h:35
remap_setup
@ COPY_RTABLES
copy remap tables only (without data)
const unsigned gap_max_bits
Definition: bmconst.h:81
const unsigned set_block_shift
Definition: bmconst.h:56
bool has_zero_byte_u64(bm::id64_t v) noexcept
Returns true if INT64 contains 0 octet.
Definition: bmutil.h:571
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
void resize(vector< SMethodDef > &container)
static const BitmapCharRec ch3
Definition: ncbi_10x20.c:1809
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
static const BitmapCharRec ch0
Definition: ncbi_10x20.c:13
static const BitmapCharRec ch2
Definition: ncbi_10x20.c:1819
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static char tmp[2048]
Definition: utf8.c:42
static int buffer_size
Definition: pcretest.c:1050
static const char * str(char *buf, int n)
Definition: stats.c:84
Structure with statistical information about memory allocation footprint, serialization projection,...
Definition: bmfunc.h:56
void add(const bv_statistics &st) noexcept
Sum data from another sttructure.
Definition: bmfunc.h:103
#define const
Definition: zconf.h:230
Modified on Fri Jan 05 07:25:37 2024 by modify_doxy.py rev. 669887