NCBI C++ ToolKit
kmercounts.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
2 #define ALGO_COBALT___KMERCOUNTS__HPP
3 
4 /* $Id: kmercounts.hpp 92103 2020-12-22 14:59:31Z boratyng $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's offical duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: kmercounts.hpp
32 
33 Author: Greg Boratyn
34 
35 Contents: Interface for k-mer counting
36 
37 ******************************************************************************/
38 
39 
40 #include <util/math/matrix.hpp>
42 #include <objmgr/scope.hpp>
43 #include <algo/cobalt/base.hpp>
44 #include <algo/cobalt/links.hpp>
46 #include <vector>
47 #include <stack>
48 
49 
51 BEGIN_SCOPE(cobalt)
52 
53 
54 
55 // TO DO: Redesign K-mer counts classes
56 
57 /// Kmer counts for alignment free sequence similarity computation
58 /// implemented as a sparse vector
59 ///
61 {
62 public:
63  typedef Uint1 TCount;
64 
65 
66  /// Element of the sparse vector
67  struct SVectorElement {
68  Uint4 position; ///< position of non-zero element
69  TCount value; ///< value of non-zero element
70 
71  /// Default constructor
72  SVectorElement(void) {position = 0; value = 0;}
73 
74  /// Create vector element
75  /// @param pos Element position
76  /// @param val Element value
77  SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;}
78  };
79 
80  typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI;
81 
82 
83 public:
84  /// Create empty counts vector
85  ///
86  CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
87 
88  /// Create k-mer counts vector from SSeqLoc with defalut k-mer length and
89  /// alphabet size
90  /// @param seq The sequence to be represented as k-mer counts [in]
91  /// @param scope Scope
92  ///
93  CSparseKmerCounts(const objects::CSeq_loc& seq,
94  objects::CScope& scope);
95 
96  /// Reset the counts vector
97  /// @param seq Sequence [in]
98  /// @param scope Scope [in]
99  ///
100  void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
101 
102  /// Get sequence length
103  /// @return Sequence length
104  ///
105  unsigned int GetSeqLength(void) const {return m_SeqLength;}
106 
107  /// Get number of all k-mers found in the sequence
108  /// @return Number of all k-mers
109  ///
110  unsigned int GetNumCounts(void) const {return m_NumCounts;}
111 
112  /// Get default kmer length
113  /// @return Default k-mer length
114  ///
115  static unsigned int GetKmerLength(void)
117 
118  /// Get default alphabet size
119  /// @return Default alphabet size
120  ///
121  static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
122 
123  /// Get non-zero counts iterator
124  /// @return Non-zero counts iterator pointing to the begining
125  ///
126  TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();}
127 
128  /// Get non-zero counts iterator
129  /// @return Non-zero counts iterator pointing to the end
130  ///
131  TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();}
132 
133  /// Print counts
134  /// @param ostr Output stream [in|out]
135  /// @return Output stream
136  ///
137  CNcbiOstream& Print(CNcbiOstream& ostr) const;
138 
139  /// Set default k-mer length
140  /// @param len Default k-mer length [in]
141  ///
142  static void SetKmerLength(unsigned len)
143  {sm_KmerLength = len; sm_ForceSmallerMem = false;}
144 
145  /// Set Default alphabet size
146  /// @param size Default alphabet size [in]
147  ///
148  static void SetAlphabetSize(unsigned size)
149  {sm_AlphabetSize = size; sm_ForceSmallerMem = false;}
150 
151  /// Set default compressed alphabet letter translation table
152  /// @return Reference to translation table [in|out]
153  ///
154  static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
155 
156  /// Set default option for using compressed alphabet
157  /// @param use_comp Will compressed alphabet be used [in]
158  ///
159  static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
160 
161  /// Compute 1 - local fraction of common k-mers between two count vectors
162  /// normalized by length of shorter sequence
163  /// @param vect1 K-mer counts vector [in]
164  /// @param vect2 K-mer counts vector [in]
165  /// @return Local fraction of common k-mer as distance
166  ///
167  /// Computes 1 - F(v1, v2), where
168  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
169  /// - k + 1), where
170  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
171  /// excluding Xaa, k - k-mer length
172  /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
173  static double FractionCommonKmersDist(const CSparseKmerCounts& vect1,
174  const CSparseKmerCounts& vect2);
175 
176  /// Compute 1 - global fraction of common k-mers between two count vectors
177  /// normalized by length of longer sequence
178  /// @param vect1 K-mer counts vector [in]
179  /// @param vect2 K-mer counts vector [in]
180  /// @return Global fraction of common k-mers as distance
181  ///
182  /// Computes 1 - F(v1, v2), where
183  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
184  /// - k + 1), where
185  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
186  /// excluding Xaa, k - k-mer length
187  /// F(x, y) is modified version of measure presented
188  /// RC Edgar, BMC Bioinformatics 5:113, 2004
189  static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1,
190  const CSparseKmerCounts& v2);
191 
192  /// Copmute number of common kmers between two count vectors
193  /// @param v1 K-mer counts vector [in]
194  /// @param v2 K-mer counts vecotr [in]
195  /// @param repetitions Should multiple copies of the same k-mer be counted
196  /// @return Number of k-mers that are present in both counts vectors
197  ///
198  static unsigned int CountCommonKmers(const CSparseKmerCounts& v1,
199  const CSparseKmerCounts& v2,
200  bool repetitions = true);
201 
202  /// Perform preparations before k-mer counting common to all sequences.
203  /// Allocate buffer for storing temporary counts
204  ///
205  static void PreCount(void);
206 
207  /// Perform post-kmer counting tasks. Free buffer.
208  ///
209  static void PostCount(void);
210 
211 
212 private:
213  static TCount* ReserveCountsMem(unsigned int num_bits);
214 
216  {
217  _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
218  return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
219  }
220 
221  /// Initializes element index as bit vector for first k letters,
222  /// skipping Xaa
223  /// @param sv Sequence [in]
224  /// @param pos Element index in sparse vector [out]
225  /// @param index Index of letter in the sequence where k-mer counting
226  /// starts. At exit index points to the next letter after first
227  /// k-mer [in|out]
228  /// @param num_bits Number of bits in pos per letter [in]
229  /// @param kmer_len K-mer length [in]
230  /// @return True if pos was initialized, false otherwise (if no k-mer
231  /// without X was found)
232  static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
233  unsigned int& index, Uint4 num_bits,
234  Uint4 kmer_len);
235 
236 
237 protected:
238  vector<SVectorElement> m_Counts;
239  unsigned int m_SeqLength;
240  unsigned int m_NumCounts;
241  static unsigned int sm_KmerLength;
242  static unsigned int sm_AlphabetSize;
243  static vector<Uint1> sm_TransTable;
244  static bool sm_UseCompressed;
245  static TCount* sm_Buffer;
246  static bool sm_ForceSmallerMem;
247  static const unsigned int kLengthBitsThreshold = 32;
248 };
249 
250 
251 /// K-mer counts implemented as bit vectors
252 ///
254 {
255 public:
256 
257  /// Constructor
258  ///
259  CBinaryKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
260 
261 
262  /// Constructor
263  /// @param seq Sequence [in]
264  /// @param scop Scope [in]
265  ///
266  CBinaryKmerCounts(const objects::CSeq_loc& seq,
267  objects::CScope& scope);
268 
269  /// Compute counts
270  /// @param seq Sequence [in]
271  /// @param scope Scope [in]
272  ///
273  void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
274 
275  /// Get sequence length
276  /// @return Sequence length
277  ///
278  unsigned int GetSeqLength(void) const {return m_SeqLength;}
279 
280  /// Get number of k-mers
281  /// @return Number of k-mers
282  ///
283  unsigned int GetNumCounts(void) const {return m_NumCounts;}
284 
285  /// Get k-mer length
286  /// @return K-mer length
287  ///
288  static unsigned int GetKmerLength(void)
290 
291  /// Get alphabet size
292  /// @return Alphabet size
293  ///
294  static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
295 
296  /// Set default k-mer length
297  /// @param len Default k-mer length [in]
298  ///
299  static void SetKmerLength(unsigned len)
300  {sm_KmerLength = len;}
301 
302  /// Set Default alphabet size
303  /// @param size Default alphabet size [in]
304  ///
305  static void SetAlphabetSize(unsigned size)
306  {sm_AlphabetSize = size;}
307 
308  /// Set default compressed alphabet letter translation table
309  /// @return Reference to translation table [in|out]
310  ///
311  static vector<Uint1>& SetTransTable(void) {return sm_TransTable;}
312 
313  /// Set default option for using compressed alphabet
314  /// @param use_comp Will compressed alphabet be used [in]
315  ///
316  static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
317 
318  /// Compute 1 - local fraction of common k-mers between two count vectors
319  /// normalized by length of shorter sequence
320  /// @param vect1 K-mer counts vector [in]
321  /// @param vect2 K-mer counts vector [in]
322  /// @return Local fraction of common k-mer as distance
323  ///
324  /// Computes 1 - F(v1, v2), where
325  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
326  /// - k + 1), where
327  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
328  /// excluding Xaa, k - k-mer length
329  /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
330  static double FractionCommonKmersDist(const CBinaryKmerCounts& vect1,
331  const CBinaryKmerCounts& vect2);
332 
333  /// Compute 1 - global fraction of common k-mers between two count vectors
334  /// normalized by length of longer sequence
335  /// @param vect1 K-mer counts vector [in]
336  /// @param vect2 K-mer counts vector [in]
337  /// @return Global fraction of common k-mers as distance
338  ///
339  /// Computes 1 - F(v1, v2), where
340  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
341  /// - k + 1), where
342  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
343  /// excluding Xaa, k - k-mer length
344  /// F(x, y) is modified version of measure presented
345  /// RC Edgar, BMC Bioinformatics 5:113, 2004
346  static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts& v1,
347  const CBinaryKmerCounts& v2);
348 
349  /// Copmute number of common kmers between two count vectors
350  /// @param v1 K-mer counts vector [in]
351  /// @param v2 K-mer counts vecotr [in]
352  /// @param repetitions Should multiple copies of the same k-mer be counted
353  /// @return Number of k-mers that are present in both counts vectors
354  ///
355  static unsigned int CountCommonKmers(const CBinaryKmerCounts& v1,
356  const CBinaryKmerCounts& v2);
357 
358 
359  /// Perform preparations before k-mer counting common to all sequences.
360  ///
361  static void PreCount(void) {}
362 
363  /// Perform post-kmer counting tasks.
364  ///
365  static void PostCount(void) {}
366 
367 
368 protected:
370  {
371  _ASSERT(!sm_UseCompressed || letter < sm_TransTable.size());
372  return (Uint4)(sm_UseCompressed ? sm_TransTable[(int)letter] : letter);
373  }
374 
375  /// Get number of set bits (adapted
376  /// from http://graphics.stanford.edu/~seander/bithacks.html)
377  /// @param v Bit vector [in]
378  /// @return Number of set bits
379  ///
381  {
382  if (v==0) return 0; // early bailout for sparse vectors
383  v = v - ((v >> 1) & 0x55555555);
384  v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
385  v = ((v + (v >> 4)) & 0xF0F0F0F);
386  v = v*0x1010101;
387 
388  return v >> 24; // count
389  }
390 
391 
392 protected:
393  vector<Uint4> m_Counts;
396  static unsigned int sm_KmerLength;
397  static unsigned int sm_AlphabetSize;
398  static vector<Uint1> sm_TransTable;
399  static bool sm_UseCompressed;
400 };
401 
402 
403 
404 /// Exception class for Kmer counts
406 {
407 public:
408  enum EErrCode {
415  };
416 
418 };
419 
420 /// Interface for computing and manipulating k-mer counts vectors that allows
421 /// for different implementations of K-mer counts vectors
422 ///
423 template <class TKmerCounts>
425 {
426 public:
431  };
432 
436  };
437 
439 
440 public:
441 
442  /// Set default counts vector parameters
443  /// @param kmer_len K-mer length [in]
444  /// @param alphabet_size Alphabet size [in]
445  ///
446  static void SetParams(unsigned kmer_len, unsigned alphabet_size)
447  {
448  TKmerCounts::SetKmerLength(kmer_len);
449  TKmerCounts::SetAlphabetSize(alphabet_size);
450  TKmerCounts::SetTransTable().clear();
451  TKmerCounts::SetUseCompressed(false);
452  }
453 
454  /// Creates translation table for compressed alphabets
455  /// @param trans_string String with groupped letters [in]
456  /// @param trans_table Translation table [out]
457  /// @param alphabet_len Number of letters in compressed alphabet
458  ///
460  vector<Uint1>& trans_table,
461  unsigned alphabet_len)
462 
463  {
464  // Compressed alphabets taken from
465  // Shiryev et al.(2007), Bioinformatics, 23:2949-2951
466  const char* kCompAlphabets[] = {
467  // 23-to-10 letter compressed alphabet. Based on SE-V(10)
468  "IJLMV AST BDENZ KQR G FY P H C W",
469  // 23-to-15 letter compressed alphabet. Based on SE_B(14)
470  "ST IJV LM KR EQZ A G BD P N F Y H C W"
471  };
472 
473  _ASSERT(alph_index >= eFirstCompressed && alph_index <= eLastAlphabet);
474  const char* trans_string = kCompAlphabets[alph_index
475  - (int)eFirstCompressed];
476 
477  Uint4 compressed_letter = 1; // this allows for gaps
478  trans_table.clear();
479  trans_table.resize(alphabet_len + 1, 0);
480  for (Uint4 i = 0; i < strlen(trans_string);i++) {
481  if (isspace(trans_string[i])) {
482  compressed_letter++;
483  }
484  else if (isalpha(trans_string[i])) {
485  Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
486 
487  _ASSERT(aa_letter < trans_table.size());
488 
489  trans_table[aa_letter] = compressed_letter;
490  }
491  }
492  }
493 
494  /// Set default counts vector parameters for use with compressed alphabet
495  /// @param kmer_len K-mer length [in]
496  /// @param alph Compressed alphabet to use [in]
497  ///
498  static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) {
499  TKmerCounts::SetKmerLength(kmer_len);
500  unsigned int len;
501  unsigned int compressed_len;
502  switch (alph) {
503  case eSE_V10:
504  len = 28;
505  compressed_len = 11; //including gap
507  TKmerCounts::SetTransTable(),
508  len);
509  TKmerCounts::SetAlphabetSize(compressed_len);
510  TKmerCounts::SetUseCompressed(true);
511  break;
512 
513  case eSE_B15:
514  len = 28;
515  compressed_len = 16; //including gap
517  TKmerCounts::SetTransTable(),
518  len);
519  TKmerCounts::SetAlphabetSize(compressed_len);
520  TKmerCounts::SetUseCompressed(true);
521  break;
522 
523  case eRegular:
524  TKmerCounts::SetAlphabetSize(kAlphabetSize);
525  TKmerCounts::SetTransTable().clear();
526  TKmerCounts::SetUseCompressed(false);
527  }
528  }
529 
530  /// Create k-mer counts vectors for given sequences
531  /// @param seqs List of sequences [in]
532  /// @param counts List of k-mer counts vectors [out]
533  ///
534  static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs,
535  objects::CScope& scope,
536  vector<TKmerCounts>& counts)
537  {
538  if (seqs.empty()) {
539  NCBI_THROW(CKmerCountsException, eInvalidOptions,
540  "Empty list of sequences");
541  }
542 
543  counts.clear();
544 
545  TKmerCounts::PreCount();
546 
547  ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) {
548  counts.push_back(TKmerCounts(**it, scope));
549  }
550 
551  TKmerCounts::PostCount();
552  }
553 
554  /// Compute matrix of distances between given counts vectors
555  /// @param counts List of k-mer counts vectors [in]
556  /// @param fsim Function that computes distance betwee two vectors [in]
557  /// @param dmat Distance matrix [out]
558  ///
559  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
560  double(*fsim)(const TKmerCounts&, const TKmerCounts&),
561  TDistMatrix& dmat)
562 
563  {
564  if (counts.empty()) {
565  NCBI_THROW(CKmerCountsException, eBadSequence,
566  "The list of k-mer counts vectors is empty");
567  }
568 
569  dmat.Resize(counts.size(), counts.size(), 0.0);
570  for (int i=0;i < (int)counts.size() - 1;i++) {
571  for (int j=i+1;j < (int)counts.size();j++) {
572  dmat(i, j) = fsim(counts[i], counts[j]);
573  dmat(j, i) = dmat(i, j);
574  }
575  }
576  }
577 
578  /// Compute matrix of distances between given list of counts vectors
579  /// using distance function with additional normalizing values
580  /// @param counts List of k-mer counts vectors [in]
581  /// @param dmat Distance matrix [out]
582  /// @param fsim Function that computes distance betwee two vectors [in]
583  /// @param normalizers List of normalizing arguments [in]
584  ///
585  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
586  TDistMatrix& dmat,
587  double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double),
588  const vector<double>& normalizers);
589 
590 
591  /// Compute distance matrix for given counts vectors and distance measure
592  /// @param counts List of k-mer counts vecotrs [in]
593  /// @param dist_method Distance measure [in]
594  /// @param dmat Distance matrix [out]
595  ///
596  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
597  EDistMeasures dist_method,
598  TDistMatrix& dmat)
599  {
600  switch (dist_method) {
602  ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist,
603  dmat);
604  break;
605 
607  ComputeDistMatrix(counts,
608  TKmerCounts::FractionCommonKmersGlobalDist,
609  dmat);
610  break;
611 
612  default:
613  NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod,
614  "Unrecognised distance measure");
615  }
616  }
617 
618 
619  /// Compute distance matrix for given counts vectors and distance measure
620  /// and avoid copying
621  /// @param counts List of k-mer counts vecotrs [in]
622  /// @param dist_method Distance measure [in]
623  /// @return Distance matrix
624  ///
625  static unique_ptr<TDistMatrix> ComputeDistMatrix(
626  const vector<TKmerCounts>& counts,
627  EDistMeasures dist_method)
628  {
629  unique_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(),
630  counts.size(), 0));
631  ComputeDistMatrix(counts, dist_method, *dmat.get());
632  return dmat;
633  }
634 
635 
636  /// Compute distances between k-mer counts as graph where nodes are
637  /// sequences and edges represent distances. Distances above given
638  /// threshold will not have edges.
639  /// @param counts List of k-mer counts vectors [in]
640  /// @param dist_method Distance measure [in]
641  /// @param max_dist Maxium distance that will be represented with a graph
642  /// edge [in]
643  /// @param mark_links If true, existings links will be marked in binary
644  /// matrix [in]
645  /// @return Disatances between k-mer counts vectors represented as a graph
646  ///
647  static CRef<CLinks> ComputeDistLinks(const vector<TKmerCounts>& counts,
648  EDistMeasures dist_method,
649  double max_dist)
650  {
651  if (counts.size() < 2) {
652  NCBI_THROW(CKmerCountsException, eInvalid, "Distance links can be"
653  " computed for at least two k-mer counts vectors");
654  }
655 
656  CRef<CLinks> links(new CLinks(counts.size()));
657  double dist;
658  for (int i=0;i < (int)counts.size()-1;i++) {
659  for (int j=i+1;j < (int)counts.size();j++) {
660  if (dist_method == eFractionCommonKmersLocal) {
661  dist = TKmerCounts::FractionCommonKmersDist(counts[i],
662  counts[j]);
663  }
664  else {
665  dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[i],
666  counts[j]);
667  }
668 
669  if (dist <= max_dist) {
670  links->AddLink(i, j, dist);
671  }
672  }
673  }
674 
675  return links;
676  }
677 };
678 
679 
680 
681 END_SCOPE(cobalt)
683 
684 #endif /* ALGO_COBALT___KMERCOUNTS__HPP */
Definitions used by all COBALT aligner components.
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
K-mer counts implemented as bit vectors.
Definition: kmercounts.hpp:254
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
Definition: kmercounts.hpp:316
static unsigned int GetKmerLength(void)
Get k-mer length.
Definition: kmercounts.hpp:288
static vector< Uint1 > sm_TransTable
Definition: kmercounts.hpp:398
unsigned int GetSeqLength(void) const
Get sequence length.
Definition: kmercounts.hpp:278
static void PostCount(void)
Perform post-kmer counting tasks.
Definition: kmercounts.hpp:365
static unsigned int GetAlphabetSize(void)
Get alphabet size.
Definition: kmercounts.hpp:294
static void SetKmerLength(unsigned len)
Set default k-mer length.
Definition: kmercounts.hpp:299
CBinaryKmerCounts(void)
Constructor.
Definition: kmercounts.hpp:259
unsigned int GetNumCounts(void) const
Get number of k-mers.
Definition: kmercounts.hpp:283
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:397
static void SetAlphabetSize(unsigned size)
Set Default alphabet size.
Definition: kmercounts.hpp:305
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
Definition: kmercounts.hpp:380
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
Definition: kmercounts.hpp:361
static bool sm_UseCompressed
Definition: kmercounts.hpp:399
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Definition: kmercounts.hpp:311
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:369
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:396
vector< Uint4 > m_Counts
Definition: kmercounts.hpp:393
Exception class for Kmer counts.
Definition: kmercounts.hpp:406
NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException)
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
Definition: kmercounts.hpp:61
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:242
static unsigned int GetKmerLength(void)
Get default kmer length.
Definition: kmercounts.hpp:115
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
Definition: kmercounts.hpp:159
CSparseKmerCounts(void)
Create empty counts vector.
Definition: kmercounts.hpp:86
vector< SVectorElement > m_Counts
Definition: kmercounts.hpp:238
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
Definition: kmercounts.hpp:80
static vector< Uint1 > sm_TransTable
Definition: kmercounts.hpp:243
unsigned int m_NumCounts
Definition: kmercounts.hpp:240
static TCount * sm_Buffer
Definition: kmercounts.hpp:245
unsigned int m_SeqLength
Definition: kmercounts.hpp:239
static unsigned int GetAlphabetSize(void)
Get default alphabet size.
Definition: kmercounts.hpp:121
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:126
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:241
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:131
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:215
static void SetAlphabetSize(unsigned size)
Set Default alphabet size.
Definition: kmercounts.hpp:148
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
Definition: kmercounts.hpp:110
static bool sm_UseCompressed
Definition: kmercounts.hpp:244
static bool sm_ForceSmallerMem
Definition: kmercounts.hpp:246
unsigned int GetSeqLength(void) const
Get sequence length.
Definition: kmercounts.hpp:105
static void SetKmerLength(unsigned len)
Set default k-mer length.
Definition: kmercounts.hpp:142
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Definition: kmercounts.hpp:154
Interface for computing and manipulating k-mer counts vectors that allows for different implementatio...
Definition: kmercounts.hpp:425
static CRef< CLinks > ComputeDistLinks(const vector< TKmerCounts > &counts, EDistMeasures dist_method, double max_dist)
Compute distances between k-mer counts as graph where nodes are sequences and edges represent distanc...
Definition: kmercounts.hpp:647
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
Definition: kmercounts.hpp:559
static void SetParams(unsigned kmer_len, ECompressedAlphabet alph)
Set default counts vector parameters for use with compressed alphabet.
Definition: kmercounts.hpp:498
static unique_ptr< TDistMatrix > ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method)
Compute distance matrix for given counts vectors and distance measure and avoid copying.
Definition: kmercounts.hpp:625
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, TDistMatrix &dmat, double(*fsim)(const TKmerCounts &, const TKmerCounts &, double, double), const vector< double > &normalizers)
Compute matrix of distances between given list of counts vectors using distance function with additio...
static void BuildCompressedTranslation(ECompressedAlphabet alph_index, vector< Uint1 > &trans_table, unsigned alphabet_len)
Creates translation table for compressed alphabets.
Definition: kmercounts.hpp:459
CNcbiMatrix< double > TDistMatrix
Definition: kmercounts.hpp:438
@ eFractionCommonKmersLocal
Definition: kmercounts.hpp:435
@ eFractionCommonKmersGlobal
Definition: kmercounts.hpp:434
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
Definition: kmercounts.hpp:534
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method, TDistMatrix &dmat)
Compute distance matrix for given counts vectors and distance measure.
Definition: kmercounts.hpp:596
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
Definition: kmercounts.hpp:446
void Print(const CCompactSAMApplication::AlignInfo &ai)
static ulg compressed_len
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NCBI_COBALT_EXPORT
Definition: ncbi_export.h:977
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
Element of the sparse vector.
Definition: kmercounts.hpp:67
SVectorElement(Uint4 pos, TCount val)
Create vector element.
Definition: kmercounts.hpp:77
Uint4 position
position of non-zero element
Definition: kmercounts.hpp:68
SVectorElement(void)
Default constructor.
Definition: kmercounts.hpp:72
TCount value
value of non-zero element
Definition: kmercounts.hpp:69
#define _ASSERT
static Uint4 letter(char c)
Modified on Wed May 15 15:06:33 2024 by modify_doxy.py rev. 669887