NCBI C++ ToolKit
kmercounts.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
2 #define ALGO_COBALT___KMERCOUNTS__HPP
3 
4 /* $Id: kmercounts.hpp 102838 2024-07-29 14:19:45Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's offical duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: kmercounts.hpp
32 
33 Author: Greg Boratyn
34 
35 Contents: Interface for k-mer counting
36 
37 ******************************************************************************/
38 
39 
40 #include <util/math/matrix.hpp>
42 #include <objmgr/scope.hpp>
43 #include <algo/cobalt/base.hpp>
44 #include <algo/cobalt/links.hpp>
47 #include <vector>
48 #include <stack>
49 
50 
52 BEGIN_SCOPE(cobalt)
53 
54 
55 
56 // TO DO: Redesign K-mer counts classes
57 
58 /// Kmer counts for alignment free sequence similarity computation
59 /// implemented as a sparse vector
60 ///
62 {
63 public:
64  typedef Uint1 TCount;
65 
66 
67  /// Element of the sparse vector
68  struct SVectorElement {
69  Uint4 position; ///< position of non-zero element
70  TCount value; ///< value of non-zero element
71 
72  /// Default constructor
73  SVectorElement(void) {position = 0; value = 0;}
74 
75  /// Create vector element
76  /// @param pos Element position
77  /// @param val Element value
78  SVectorElement(Uint4 pos, TCount val) {position = pos; value = val;}
79  };
80 
81  typedef vector<SVectorElement>::const_iterator TNonZeroCounts_CI;
82 
83 
84 public:
85  /// Create empty counts vector
86  ///
87  CSparseKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
88 
89  /// Create k-mer counts vector from SSeqLoc with default k-mer length and
90  /// alphabet size
91  /// @param seq The sequence to be represented as k-mer counts [in]
92  /// @param scope Scope
93  ///
94  CSparseKmerCounts(const objects::CSeq_loc& seq,
95  objects::CScope& scope);
96 
97  /// Reset the counts vector
98  /// @param seq Sequence [in]
99  /// @param scope Scope [in]
100  ///
101  void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
102 
103  /// Get sequence length
104  /// @return Sequence length
105  ///
106  unsigned int GetSeqLength(void) const {return m_SeqLength;}
107 
108  /// Get number of all k-mers found in the sequence
109  /// @return Number of all k-mers
110  ///
111  unsigned int GetNumCounts(void) const {return m_NumCounts;}
112 
113  /// Get default kmer length
114  /// @return Default k-mer length
115  ///
116  static unsigned int GetKmerLength(void)
118 
119  /// Get default alphabet size
120  /// @return Default alphabet size
121  ///
122  static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
123 
124  /// Get non-zero counts iterator
125  /// @return Non-zero counts iterator pointing to the beginning
126  ///
127  TNonZeroCounts_CI BeginNonZero(void) const {return m_Counts.begin();}
128 
129  /// Get non-zero counts iterator
130  /// @return Non-zero counts iterator pointing to the end
131  ///
132  TNonZeroCounts_CI EndNonZero(void) const {return m_Counts.end();}
133 
134  /// Print counts
135  /// @param ostr Output stream [in|out]
136  /// @return Output stream
137  ///
138  CNcbiOstream& Print(CNcbiOstream& ostr) const;
139 
140  /// Set default k-mer length
141  /// @param len Default k-mer length [in]
142  ///
143  static void SetKmerLength(unsigned len)
144  {sm_KmerLength = len; sm_ForceSmallerMem = false;}
145 
146  /// Set Default alphabet size
147  /// @param size Default alphabet size [in]
148  ///
149  static void SetAlphabetSize(unsigned size)
150  {sm_AlphabetSize = size; sm_ForceSmallerMem = false;}
151 
152  /// Set default compressed alphabet letter translation table
153  /// @return Reference to translation table [in|out]
154  ///
155  static vector<Uint1>& SetTransTable(void) {return sm_TransTable.Get();}
156 
157  /// Set default option for using compressed alphabet
158  /// @param use_comp Will compressed alphabet be used [in]
159  ///
160  static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
161 
162  /// Compute 1 - local fraction of common k-mers between two count vectors
163  /// normalized by length of shorter sequence
164  /// @param vect1 K-mer counts vector [in]
165  /// @param vect2 K-mer counts vector [in]
166  /// @return Local fraction of common k-mer as distance
167  ///
168  /// Computes 1 - F(v1, v2), where
169  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
170  /// - k + 1), where
171  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
172  /// excluding Xaa, k - k-mer length
173  /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
174  static double FractionCommonKmersDist(const CSparseKmerCounts& vect1,
175  const CSparseKmerCounts& vect2);
176 
177  /// Compute 1 - global fraction of common k-mers between two count vectors
178  /// normalized by length of longer sequence
179  /// @param v1 K-mer counts vector [in]
180  /// @param v2 K-mer counts vector [in]
181  /// @return Global fraction of common k-mers as distance
182  ///
183  /// Computes 1 - F(v1, v2), where
184  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
185  /// - k + 1), where
186  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
187  /// excluding Xaa, k - k-mer length
188  /// F(x, y) is modified version of measure presented
189  /// RC Edgar, BMC Bioinformatics 5:113, 2004
190  static double FractionCommonKmersGlobalDist(const CSparseKmerCounts& v1,
191  const CSparseKmerCounts& v2);
192 
193  /// Compute number of common kmers between two count vectors
194  /// @param v1 K-mer counts vector [in]
195  /// @param v2 K-mer counts vector [in]
196  /// @param repetitions Should multiple copies of the same k-mer be counted
197  /// @return Number of k-mers that are present in both counts vectors
198  ///
199  static unsigned int CountCommonKmers(const CSparseKmerCounts& v1,
200  const CSparseKmerCounts& v2,
201  bool repetitions = true);
202 
203  /// Perform preparations before k-mer counting common to all sequences.
204  /// Allocate buffer for storing temporary counts
205  ///
206  static void PreCount(void);
207 
208  /// Perform post-kmer counting tasks. Free buffer.
209  ///
210  static void PostCount(void);
211 
212 
213 private:
214  static TCount* ReserveCountsMem(unsigned int num_bits);
215 
217  {
218  _ASSERT(!sm_UseCompressed || letter < sm_TransTable->size());
219  return (Uint4)(sm_UseCompressed ? (sm_TransTable.Get())[(int)letter] : letter);
220  }
221 
222  /// Initializes element index as bit vector for first k letters,
223  /// skipping Xaa
224  /// @param sv Sequence [in]
225  /// @param pos Element index in sparse vector [out]
226  /// @param index Index of letter in the sequence where k-mer counting
227  /// starts. At exit index points to the next letter after first
228  /// k-mer [in|out]
229  /// @param num_bits Number of bits in pos per letter [in]
230  /// @param kmer_len K-mer length [in]
231  /// @return True if pos was initialized, false otherwise (if no k-mer
232  /// without X was found)
233  static bool InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
234  unsigned int& index, Uint4 num_bits,
235  Uint4 kmer_len);
236 
237 
238 protected:
239  vector<SVectorElement> m_Counts;
240  unsigned int m_SeqLength;
241  unsigned int m_NumCounts;
242  static unsigned int sm_KmerLength;
243  static unsigned int sm_AlphabetSize;
245  static bool sm_UseCompressed;
246  static TCount* sm_Buffer;
247  static bool sm_ForceSmallerMem;
248  static const unsigned int kLengthBitsThreshold = 32;
249 };
250 
251 
252 /// K-mer counts implemented as bit vectors
253 ///
255 {
256 public:
257 
258  /// Constructor
259  ///
260  CBinaryKmerCounts(void) : m_SeqLength(0), m_NumCounts(0) {}
261 
262 
263  /// Constructor
264  /// @param seq Sequence [in]
265  /// @param scope Scope [in]
266  ///
267  CBinaryKmerCounts(const objects::CSeq_loc& seq,
268  objects::CScope& scope);
269 
270  /// Compute counts
271  /// @param seq Sequence [in]
272  /// @param scope Scope [in]
273  ///
274  void Reset(const objects::CSeq_loc& seq, objects::CScope& scope);
275 
276  /// Get sequence length
277  /// @return Sequence length
278  ///
279  unsigned int GetSeqLength(void) const {return m_SeqLength;}
280 
281  /// Get number of k-mers
282  /// @return Number of k-mers
283  ///
284  unsigned int GetNumCounts(void) const {return m_NumCounts;}
285 
286  /// Get k-mer length
287  /// @return K-mer length
288  ///
289  static unsigned int GetKmerLength(void)
291 
292  /// Get alphabet size
293  /// @return Alphabet size
294  ///
295  static unsigned int GetAlphabetSize(void) {return sm_AlphabetSize;}
296 
297  /// Set default k-mer length
298  /// @param len Default k-mer length [in]
299  ///
300  static void SetKmerLength(unsigned len)
301  {sm_KmerLength = len;}
302 
303  /// Set Default alphabet size
304  /// @param size Default alphabet size [in]
305  ///
306  static void SetAlphabetSize(unsigned size)
307  {sm_AlphabetSize = size;}
308 
309  /// Set default compressed alphabet letter translation table
310  /// @return Reference to translation table [in|out]
311  ///
312  static vector<Uint1>& SetTransTable(void) {return sm_TransTable.Get();}
313 
314  /// Set default option for using compressed alphabet
315  /// @param use_comp Will compressed alphabet be used [in]
316  ///
317  static void SetUseCompressed(bool use_comp) {sm_UseCompressed = use_comp;}
318 
319  /// Compute 1 - local fraction of common k-mers between two count vectors
320  /// normalized by length of shorter sequence
321  /// @param vect1 K-mer counts vector [in]
322  /// @param vect2 K-mer counts vector [in]
323  /// @return Local fraction of common k-mer as distance
324  ///
325  /// Computes 1 - F(v1, v2), where
326  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\min \{L_x, L_y\}
327  /// - k + 1), where
328  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
329  /// excluding Xaa, k - k-mer length
330  /// F(x, y) is described in RC Edgar, BMC Bioinformatics 5:113, 2004
331  static double FractionCommonKmersDist(const CBinaryKmerCounts& vect1,
332  const CBinaryKmerCounts& vect2);
333 
334  /// Compute 1 - global fraction of common k-mers between two count vectors
335  /// normalized by length of longer sequence
336  /// @param v1 K-mer counts vector [in]
337  /// @param v2 K-mer counts vector [in]
338  /// @return Global fraction of common k-mers as distance
339  ///
340  /// Computes 1 - F(v1, v2), where
341  /// F(x, y) = \sum_{t} \min \{n_x(t), n_y(t)\} / (\max \{L_x, L_y\}
342  /// - k + 1), where
343  /// t - k-mer, n_x(t) - number of k-mer t in x, L_x - length of x
344  /// excluding Xaa, k - k-mer length
345  /// F(x, y) is modified version of measure presented
346  /// RC Edgar, BMC Bioinformatics 5:113, 2004
347  static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts& v1,
348  const CBinaryKmerCounts& v2);
349 
350  /// Compute number of common kmers between two count vectors
351  /// @param v1 K-mer counts vector [in]
352  /// @param v2 K-mer counts vector [in]
353  /// @param repetitions Should multiple copies of the same k-mer be counted
354  /// @return Number of k-mers that are present in both counts vectors
355  ///
356  static unsigned int CountCommonKmers(const CBinaryKmerCounts& v1,
357  const CBinaryKmerCounts& v2);
358 
359 
360  /// Perform preparations before k-mer counting common to all sequences.
361  ///
362  static void PreCount(void) {}
363 
364  /// Perform post-kmer counting tasks.
365  ///
366  static void PostCount(void) {}
367 
368 
369 protected:
371  {
372  _ASSERT(!sm_UseCompressed || letter < sm_TransTable->size());
373  return (Uint4)(sm_UseCompressed ? (sm_TransTable.Get())[(int)letter] : letter);
374  }
375 
376  /// Get number of set bits (adapted
377  /// from http://graphics.stanford.edu/~seander/bithacks.html)
378  /// @param v Bit vector [in]
379  /// @return Number of set bits
380  ///
382  {
383  if (v==0) return 0; // early bailout for sparse vectors
384  v = v - ((v >> 1) & 0x55555555);
385  v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
386  v = ((v + (v >> 4)) & 0xF0F0F0F);
387  v = v*0x1010101;
388 
389  return v >> 24; // count
390  }
391 
392 
393 protected:
394  vector<Uint4> m_Counts;
397  static unsigned int sm_KmerLength;
398  static unsigned int sm_AlphabetSize;
400  static bool sm_UseCompressed;
401 };
402 
403 
404 
405 /// Exception class for Kmer counts
407 {
408 public:
409  enum EErrCode {
416  };
417 
419 };
420 
421 /// Interface for computing and manipulating k-mer counts vectors that allows
422 /// for different implementations of K-mer counts vectors
423 ///
424 template <class TKmerCounts>
426 {
427 public:
432  };
433 
437  };
438 
440 
441 public:
442 
443  /// Set default counts vector parameters; disables the compressed alphabet
444  /// @param kmer_len K-mer length [in]
445  /// @param alphabet_size Alphabet size [in]
446  ///
447  static void SetParams(unsigned kmer_len, unsigned alphabet_size)
448  {
449  TKmerCounts::SetKmerLength(kmer_len);
450  TKmerCounts::SetAlphabetSize(alphabet_size);
451  TKmerCounts::SetTransTable().clear();
452  TKmerCounts::SetUseCompressed(false);
453  }
454 
455  /// Creates translation table for compressed alphabets
456  /// @param alph_index Compressed alphabet whose letter groups are used [in]
457  /// @param trans_table Translation table [out]
458  /// @param alphabet_len Number of letters in compressed alphabet
459  ///
461  vector<Uint1>& trans_table,
462  unsigned alphabet_len)
463 
464  {
465  // Compressed alphabets taken from
466  // Shiryev et al.(2007), Bioinformatics, 23:2949-2951
467  const char* kCompAlphabets[] = {
468  // 23-to-10 letter compressed alphabet. Based on SE-V(10)
469  "IJLMV AST BDENZ KQR G FY P H C W",
470  // 23-to-15 letter compressed alphabet. Based on SE_B(14)
471  "ST IJV LM KR EQZ A G BD P N F Y H C W"
472  };
473 
474  _ASSERT(alph_index >= eFirstCompressed && alph_index <= eLastAlphabet);
475  const char* trans_string = kCompAlphabets[alph_index
476  - (int)eFirstCompressed];
477 
478  Uint4 compressed_letter = 1; // this allows for gaps
479  trans_table.clear();
480  trans_table.resize(alphabet_len + 1, 0);
481  for (Uint4 i = 0; i < strlen(trans_string);i++) {
482  if (isspace(trans_string[i])) {
483  compressed_letter++;
484  }
485  else if (isalpha(trans_string[i])) {
486  Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
487 
488  _ASSERT(aa_letter < trans_table.size());
489 
490  trans_table[aa_letter] = compressed_letter;
491  }
492  }
493  }
494 
495  /// Set default counts vector parameters for use with compressed alphabet
496  /// @param kmer_len K-mer length [in]
497  /// @param alph Compressed alphabet to use [in]
498  ///
499  static void SetParams(unsigned kmer_len, ECompressedAlphabet alph) {
500  TKmerCounts::SetKmerLength(kmer_len);
501  unsigned int len;
502  unsigned int compressed_len;
503  switch (alph) {
504  case eSE_V10:
505  len = 28;
506  compressed_len = 11; //including gap
508  TKmerCounts::SetTransTable(),
509  len);
510  TKmerCounts::SetAlphabetSize(compressed_len);
511  TKmerCounts::SetUseCompressed(true);
512  break;
513 
514  case eSE_B15:
515  len = 28;
516  compressed_len = 16; //including gap
518  TKmerCounts::SetTransTable(),
519  len);
520  TKmerCounts::SetAlphabetSize(compressed_len);
521  TKmerCounts::SetUseCompressed(true);
522  break;
523 
524  case eRegular:
525  TKmerCounts::SetAlphabetSize(kAlphabetSize);
526  TKmerCounts::SetTransTable().clear();
527  TKmerCounts::SetUseCompressed(false);
528  }
529  }
530 
531  /// Create k-mer counts vectors for given sequences
532  /// @param seqs List of sequences [in]
533  /// @param counts List of k-mer counts vectors [out]
534  ///
535  static void ComputeCounts(const vector< CRef<objects::CSeq_loc> >& seqs,
536  objects::CScope& scope,
537  vector<TKmerCounts>& counts)
538  {
539  if (seqs.empty()) {
540  NCBI_THROW(CKmerCountsException, eInvalidOptions,
541  "Empty list of sequences");
542  }
543 
544  counts.clear();
545 
546  TKmerCounts::PreCount();
547 
548  ITERATE(vector< CRef<objects::CSeq_loc> >, it, seqs) {
549  counts.push_back(TKmerCounts(**it, scope));
550  }
551 
552  TKmerCounts::PostCount();
553  }
554 
555  /// Compute matrix of distances between given counts vectors
556  /// @param counts List of k-mer counts vectors [in]
557  /// @param fsim Function that computes distance betwee two vectors [in]
558  /// @param dmat Distance matrix [out]
559  ///
560  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
561  double(*fsim)(const TKmerCounts&, const TKmerCounts&),
562  TDistMatrix& dmat)
563 
564  {
565  if (counts.empty()) {
566  NCBI_THROW(CKmerCountsException, eBadSequence,
567  "The list of k-mer counts vectors is empty");
568  }
569 
570  dmat.Resize(counts.size(), counts.size(), 0.0);
571  for (int i=0;i < (int)counts.size() - 1;i++) {
572  for (int j=i+1;j < (int)counts.size();j++) {
573  dmat(i, j) = fsim(counts[i], counts[j]);
574  dmat(j, i) = dmat(i, j);
575  }
576  }
577  }
578 
579  /// Compute matrix of distances between given list of counts vectors
580  /// using distance function with additional normalizing values
581  /// @param counts List of k-mer counts vectors [in]
582  /// @param dmat Distance matrix [out]
583  /// @param fsim Function that computes distance between two vectors [in]
584  /// @param normalizers List of normalizing arguments [in]
585  ///
586  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
587  TDistMatrix& dmat,
588  double(*fsim)(const TKmerCounts&, const TKmerCounts&, double, double),
589  const vector<double>& normalizers);
590 
591 
592  /// Compute distance matrix for given counts vectors and distance measure
593  /// @param counts List of k-mer counts vectors [in]
594  /// @param dist_method Distance measure [in]
595  /// @param dmat Distance matrix [out]
596  ///
597  static void ComputeDistMatrix(const vector<TKmerCounts>& counts,
598  EDistMeasures dist_method,
599  TDistMatrix& dmat)
600  {
601  switch (dist_method) {
603  ComputeDistMatrix(counts, TKmerCounts::FractionCommonKmersDist,
604  dmat);
605  break;
606 
608  ComputeDistMatrix(counts,
609  TKmerCounts::FractionCommonKmersGlobalDist,
610  dmat);
611  break;
612 
613  default:
614  NCBI_THROW(CKmerCountsException, eUnsuportedDistMethod,
615  "Unrecognised distance measure");
616  }
617  }
618 
619 
620  /// Compute distance matrix for given counts vectors and distance measure
621  /// and avoid copying
622  /// @param counts List of k-mer counts vecotrs [in]
623  /// @param dist_method Distance measure [in]
624  /// @return Distance matrix
625  ///
626  static unique_ptr<TDistMatrix> ComputeDistMatrix(
627  const vector<TKmerCounts>& counts,
628  EDistMeasures dist_method)
629  {
630  unique_ptr<TDistMatrix> dmat(new TDistMatrix(counts.size(),
631  counts.size(), 0));
632  ComputeDistMatrix(counts, dist_method, *dmat.get());
633  return dmat;
634  }
635 
636 
637  /// Compute distances between k-mer counts as graph where nodes are
638  /// sequences and edges represent distances. Distances above given
639  /// threshold will not have edges.
640  /// @param counts List of k-mer counts vectors [in]
641  /// @param dist_method Distance measure [in]
642  /// @param max_dist Maxium distance that will be represented with a graph
643  /// edge [in]
644  /// @param mark_links If true, existings links will be marked in binary
645  /// matrix [in]
646  /// @return Disatances between k-mer counts vectors represented as a graph
647  ///
648  static CRef<CLinks> ComputeDistLinks(const vector<TKmerCounts>& counts,
649  EDistMeasures dist_method,
650  double max_dist)
651  {
652  if (counts.size() < 2) {
653  NCBI_THROW(CKmerCountsException, eInvalid, "Distance links can be"
654  " computed for at least two k-mer counts vectors");
655  }
656 
657  CRef<CLinks> links(new CLinks(counts.size()));
658  double dist;
659  for (int i=0;i < (int)counts.size()-1;i++) {
660  for (int j=i+1;j < (int)counts.size();j++) {
661  if (dist_method == eFractionCommonKmersLocal) {
662  dist = TKmerCounts::FractionCommonKmersDist(counts[i],
663  counts[j]);
664  }
665  else {
666  dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[i],
667  counts[j]);
668  }
669 
670  if (dist <= max_dist) {
671  links->AddLink(i, j, dist);
672  }
673  }
674  }
675 
676  return links;
677  }
678 };
679 
680 
681 
682 END_SCOPE(cobalt)
684 
685 #endif /* ALGO_COBALT___KMERCOUNTS__HPP */
Definitions used by all COBALT aligner components.
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
K-mer counts implemented as bit vectors.
Definition: kmercounts.hpp:255
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
Definition: kmercounts.hpp:317
static unsigned int GetKmerLength(void)
Get k-mer length.
Definition: kmercounts.hpp:289
unsigned int GetSeqLength(void) const
Get sequence length.
Definition: kmercounts.hpp:279
static void PostCount(void)
Perform post-kmer counting tasks.
Definition: kmercounts.hpp:366
static unsigned int GetAlphabetSize(void)
Get alphabet size.
Definition: kmercounts.hpp:295
static void SetKmerLength(unsigned len)
Set default k-mer length.
Definition: kmercounts.hpp:300
CBinaryKmerCounts(void)
Constructor.
Definition: kmercounts.hpp:260
unsigned int GetNumCounts(void) const
Get number of k-mers.
Definition: kmercounts.hpp:284
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:398
static void SetAlphabetSize(unsigned size)
Set Default alphabet size.
Definition: kmercounts.hpp:306
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
Definition: kmercounts.hpp:381
static CSafeStatic< vector< Uint1 > > sm_TransTable
Definition: kmercounts.hpp:399
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
Definition: kmercounts.hpp:362
static bool sm_UseCompressed
Definition: kmercounts.hpp:400
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Definition: kmercounts.hpp:312
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:370
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:397
vector< Uint4 > m_Counts
Definition: kmercounts.hpp:394
Exception class for Kmer counts.
Definition: kmercounts.hpp:407
NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException)
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
CSafeStatic<>::
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
Definition: kmercounts.hpp:62
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:243
static unsigned int GetKmerLength(void)
Get default kmer length.
Definition: kmercounts.hpp:116
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
Definition: kmercounts.hpp:160
CSparseKmerCounts(void)
Create empty counts vector.
Definition: kmercounts.hpp:87
vector< SVectorElement > m_Counts
Definition: kmercounts.hpp:239
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
Definition: kmercounts.hpp:81
unsigned int m_NumCounts
Definition: kmercounts.hpp:241
static TCount * sm_Buffer
Definition: kmercounts.hpp:246
static CSafeStatic< vector< Uint1 > > sm_TransTable
Definition: kmercounts.hpp:244
unsigned int m_SeqLength
Definition: kmercounts.hpp:240
static unsigned int GetAlphabetSize(void)
Get default alphabet size.
Definition: kmercounts.hpp:122
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:127
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:242
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:132
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:216
static void SetAlphabetSize(unsigned size)
Set Default alphabet size.
Definition: kmercounts.hpp:149
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
Definition: kmercounts.hpp:111
static bool sm_UseCompressed
Definition: kmercounts.hpp:245
static bool sm_ForceSmallerMem
Definition: kmercounts.hpp:247
unsigned int GetSeqLength(void) const
Get sequence length.
Definition: kmercounts.hpp:106
static void SetKmerLength(unsigned len)
Set default k-mer length.
Definition: kmercounts.hpp:143
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Definition: kmercounts.hpp:155
Interface for computing and manipulating k-mer counts vectors that allows for different implementatio...
Definition: kmercounts.hpp:426
static CRef< CLinks > ComputeDistLinks(const vector< TKmerCounts > &counts, EDistMeasures dist_method, double max_dist)
Compute distances between k-mer counts as graph where nodes are sequences and edges represent distanc...
Definition: kmercounts.hpp:648
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
Definition: kmercounts.hpp:560
static void SetParams(unsigned kmer_len, ECompressedAlphabet alph)
Set default counts vector parameters for use with compressed alphabet.
Definition: kmercounts.hpp:499
static unique_ptr< TDistMatrix > ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method)
Compute distance matrix for given counts vectors and distance measure and avoid copying.
Definition: kmercounts.hpp:626
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, TDistMatrix &dmat, double(*fsim)(const TKmerCounts &, const TKmerCounts &, double, double), const vector< double > &normalizers)
Compute matrix of distances between given list of counts vectors using distance function with additio...
static void BuildCompressedTranslation(ECompressedAlphabet alph_index, vector< Uint1 > &trans_table, unsigned alphabet_len)
Creates translation table for compressed alphabets.
Definition: kmercounts.hpp:460
CNcbiMatrix< double > TDistMatrix
Definition: kmercounts.hpp:439
@ eFractionCommonKmersLocal
Definition: kmercounts.hpp:436
@ eFractionCommonKmersGlobal
Definition: kmercounts.hpp:435
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
Definition: kmercounts.hpp:535
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method, TDistMatrix &dmat)
Compute distance matrix for given counts vectors and distance measure.
Definition: kmercounts.hpp:597
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
Definition: kmercounts.hpp:447
void Print(const CCompactSAMApplication::AlignInfo &ai)
static ulg compressed_len
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NCBI_COBALT_EXPORT
Definition: ncbi_export.h:977
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Static variables safety - create on demand, destroy on application termination.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
Element of the sparse vector.
Definition: kmercounts.hpp:68
SVectorElement(Uint4 pos, TCount val)
Create vector element.
Definition: kmercounts.hpp:78
Uint4 position
position of non-zero element
Definition: kmercounts.hpp:69
SVectorElement(void)
Default constructor.
Definition: kmercounts.hpp:73
TCount value
value of non-zero element
Definition: kmercounts.hpp:70
#define _ASSERT
static Uint4 letter(char c)
Modified on Fri Sep 20 14:57:41 2024 by modify_doxy.py rev. 669887