NCBI C++ ToolKit
kmercounts.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: kmercounts.cpp
29 
30 Author: Greg Boratyn
31 
32 Contents: Implementation of k-mer counting classes
33 
34 ******************************************************************************/
35 
36 
37 #include <ncbi_pch.hpp>
38 
39 #include <math.h>
40 
41 #include <corelib/ncbistre.hpp>
42 #include <corelib/ncbi_limits.hpp>
43 #include <corelib/ncbiexpt.hpp>
44 
45 #include <objmgr/seq_vector.hpp>
46 
48 
49 
51 USING_SCOPE(cobalt);
52 
53 // Default values for default params
54 unsigned int CSparseKmerCounts::sm_KmerLength = 4;
60 
61 unsigned int CBinaryKmerCounts::sm_KmerLength = 3;
65 
66 static const Uint1 kXaa = 21;
67 
68 
69 CSparseKmerCounts::CSparseKmerCounts(const objects::CSeq_loc& seq,
70  objects::CScope& scope)
71 {
72  Reset(seq, scope);
73 }
74 
75 // Initialize position bit vector for k-mer counting
76 // @param sv Sequence [in]
77 // @param pos Index of the k-mer count [out]
78 // @param index Index of letter in sequence [in|out]
79 // @param num_bits Number of bits per letter in pos [in]
80 // @param kmer_len K-mer length [in]
81 // @return True if a valid k-mer was found, false otherwise
82 bool CSparseKmerCounts::InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
83  unsigned int& index,
84  Uint4 num_bits, Uint4 kmer_len)
85 
86 {
87  pos = 0;
88  unsigned i = 0;
89  while (index + kmer_len - 1 < sv.size() && i < kmer_len) {
90 
91  // Skip kmers that contain X (unspecified aa)
92  if (sv[index + i] == kXaa) {
93  index += i + 1;
94 
95  pos = 0;
96  i = 0;
97  continue;
98  }
99  pos |= GetAALetter(sv[index + i]) << (num_bits * (kmer_len - i - 1));
100  i++;
101  }
102 
103  if (i < kmer_len) {
104  return false;
105  }
106 
107  index += i;
108  return true;
109 }
110 
111 
112 static void MarkUsed(Uint4 pos, vector<Uint4>& entries, int chunk)
113 {
114  int index = pos / chunk;
115  int offset = pos - index * chunk;
116  Uint4 mask = 0x80000000 >> offset;
117  entries[index] |= mask;
118 }
119 
120 
122  unsigned int num_bits)
123 {
124  Uint4 num_elements;
125  TCount* counts = NULL;
126 
127  // Reserve memory for storing counts
128  // there are two methods for indexing counts (see the Reset() method)
129  // if memory cannot be allocated try to allocate for the second method
130  // that requires less memory
131  if (!sm_ForceSmallerMem && sm_KmerLength * num_bits
133 
134  num_elements = 1 << (num_bits * sm_KmerLength);
135  try {
136  counts = new TCount[num_elements];
137  }
138  catch (std::bad_alloc) {
139  sm_ForceSmallerMem = true;
140  num_elements = (Uint4)pow((double)sm_AlphabetSize,
141  (double)sm_KmerLength);
142 
143  try {
144  counts = new TCount[num_elements];
145  }
146  catch (std::bad_alloc) {
147  NCBI_THROW(CKmerCountsException, eMemoryAllocation,
148  "Memory cannot be allocated for k-mer counting."
149  " Try using compressed alphabet or smaller k.");
150  }
151 
152  }
153  }
154  return counts;
155 }
156 
157 void CSparseKmerCounts::Reset(const objects::CSeq_loc& seq,
158  objects::CScope& scope)
159 {
160  unsigned int kmer_len = sm_KmerLength;
161  unsigned int alphabet_size = sm_AlphabetSize;
162 
163  _ASSERT(kmer_len > 0 && alphabet_size > 0);
164 
165  if (sm_UseCompressed && sm_TransTable->empty()) {
166  NCBI_THROW(CKmerCountsException, eInvalidOptions,
167  "Compressed alphabet selected, but translation table not"
168  " specified");
169  }
170 
171  if (!seq.IsWhole() && !seq.IsInt()) {
172  NCBI_THROW(CKmerCountsException, eUnsupportedSeqLoc,
173  "Unsupported SeqLoc encountered");
174  }
175 
176  _ASSERT(seq.GetId());
177  objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
178 
179  unsigned int num_elements;
180  unsigned int seq_len = sv.size();
181 
182  m_SeqLength = sv.size();
183  m_Counts.clear();
184  m_NumCounts = 0;
185 
186  if (m_SeqLength < kmer_len) {
187  NCBI_THROW(CKmerCountsException, eBadSequence,
188  "Sequence shorter than desired k-mer length");
189  }
190 
191  // Compute number of bits needed to represent all letters
192  unsigned int mask = 1;
193  int num = 0;
194  while (alphabet_size > mask) {
195  mask <<= 1;
196  num++;
197  }
198  const int kNumBits = num;
199 
200  TCount * counts;
201  AutoArray<TCount> tmp_counts;
202  if (sm_Buffer == NULL) {
203  tmp_counts.reset(ReserveCountsMem(kNumBits));
204  counts = tmp_counts.get();
205  } else {
206  counts = sm_Buffer;
207  }
208 
209  // Vector of counts is first computed using regular vector that is later
210  // converted to the sparse vector (list of position-value pairs).
211  // Positions are calculated as binary representations of k-mers, if they
212  // fit in 32 bits. Otherwise as numbers in system with base alphabet size.
213  if (!sm_ForceSmallerMem && kmer_len * kNumBits < kLengthBitsThreshold) {
214 
215  num_elements = 1 << (kNumBits * kmer_len);
216  const Uint4 kMask = num_elements - (1 << kNumBits);
217 
218  _ASSERT(counts);
219  memset(counts, 0, num_elements * sizeof(TCount));
220 
221  const int kBitChunk = sizeof(Uint4) * 8;
222 
223  // Vector indicating non-zero elements
224  vector<Uint4> used_entries(num_elements / kBitChunk + 1);
225 
226  //first k-mer
227  Uint4 i = 0;
228  Uint4 pos;
229  bool is_pos = InitPosBits(sv, pos, i, kNumBits, kmer_len);
230  if (is_pos) {
231  _ASSERT(pos < num_elements);
232  counts[pos]++;
233  MarkUsed(pos, used_entries, kBitChunk);
234  m_NumCounts++;
235 
236  //for each next kmer
237  for (;i < seq_len && is_pos;i++) {
238 
239  if (GetAALetter(sv[i]) >= alphabet_size) {
240  NCBI_THROW(CKmerCountsException, eBadSequence,
241  "Letter out of alphabet in sequnece");
242  }
243 
244  // Kmers that contain unspecified amino acid X are not
245  // considered
246  if (sv[i] == kXaa) {
247  i++;
248  is_pos = InitPosBits(sv, pos, i, kNumBits, kmer_len);
249 
250  if (i >= seq_len || !is_pos) {
251  break;
252  }
253  }
254 
255  pos <<= kNumBits;
256  pos &= kMask;
257  pos |= GetAALetter(sv[i]);
258  _ASSERT(pos < num_elements);
259  counts[pos]++;
260  MarkUsed(pos, used_entries, kBitChunk);
261  m_NumCounts++;
262  }
263  }
264 
265  // Convert to sparse vector
266  m_Counts.reserve(m_SeqLength - kmer_len + 1);
267  Uint4 ind = 0;
268  Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
269  while (ind < num_elements / kBitChunk + 1) {
270 
271  // find next chunk with at least one non-zero count
272  while (ind < num_bit_chunks && used_entries[ind] == 0) {
273  ind++;
274  }
275 
276  if (ind == num_bit_chunks) {
277  break;
278  }
279 
280  // find the set bit and get position in the counts vector
281  for (Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
282  j++, mask>>=1) {
283  _ASSERT(j < 32);
284  if ((used_entries[ind] & mask) != 0) {
285  pos = ind * kBitChunk + j;
286 
287  _ASSERT(counts[pos] > 0);
288  m_Counts.push_back(SVectorElement(pos, counts[pos]));
289 
290  used_entries[ind] ^= mask;
291  }
292  }
293  ind++;
294  }
295 
296  }
297  else {
298  _ASSERT(pow((double)alphabet_size, (double)kmer_len)
300 
301  AutoArray<double> base(kmer_len);
302  for (Uint4 i=0;i < kmer_len;i++) {
303  base[i] = pow((double)alphabet_size, (double)i);
304  }
305 
306  num_elements = (Uint4)pow((double)alphabet_size, (double)kmer_len);
307 
308  _ASSERT(counts);
309  memset(counts, 0, num_elements * sizeof(TCount));
310 
311  // Vector indicating non-zero elements
312  const int kBitChunk = sizeof(Uint4) * 8;
313  vector<Uint4> used_entries(num_elements / kBitChunk + 1);
314 
315  Uint4 pos;
316  for (unsigned i=0;i < seq_len - kmer_len + 1;i++) {
317 
318  // Kmers that contain unspecified amino acid X are not considered
319  if (sv[i + kmer_len - 1] == kXaa) {
320  i += kmer_len - 1;
321  continue;
322  }
323 
324  pos = GetAALetter(sv[i]) - 1;
325  _ASSERT(GetAALetter(sv[i]) <= alphabet_size);
326  for (Uint4 j=1;j < kmer_len;j++) {
327  pos += (Uint4)(((double)GetAALetter(sv[i + j]) - 1) * base[j]);
328  _ASSERT(GetAALetter(sv[i + j]) <= alphabet_size);
329  }
330  counts[pos]++;
331  MarkUsed(pos, used_entries, kBitChunk);
332  m_NumCounts++;
333  }
334 
335  // Convert to sparse vector
336  m_Counts.reserve(m_SeqLength - kmer_len + 1);
337  Uint4 ind = 0;
338  Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
339  while (ind < num_elements / kBitChunk + 1) {
340 
341  // find next chunk with at least one non-zero count
342  while (ind < num_bit_chunks && used_entries[ind] == 0) {
343  ind++;
344  }
345 
346  if (ind == num_bit_chunks) {
347  break;
348  }
349 
350  // find the set bit and get position in the counts vector
351  for (Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
352  j++, mask>>=1) {
353  _ASSERT(j < 32);
354  if ((used_entries[ind] & mask) != 0) {
355  pos = ind * kBitChunk + j;
356 
357  _ASSERT(counts[pos] > 0);
358  m_Counts.push_back(SVectorElement(pos, counts[pos]));
359 
360  used_entries[ind] ^= mask;
361  }
362  }
363  ind++;
364  }
365  }
366 
367 }
368 
370  const CSparseKmerCounts& v1,
371  const CSparseKmerCounts& v2)
372 {
373  _ASSERT(GetKmerLength() > 0);
374 
375  unsigned int num_common = CountCommonKmers(v1, v2, true);
376 
377  unsigned int num_counts1 = v1.GetNumCounts();
378  unsigned int num_counts2 = v2.GetNumCounts();
379  unsigned int fewer_counts
380  = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
381 
382  // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
383  // SeqLen - k + 1 that is equal to number of counts only if sequence
384  // does not contain Xaa.
385  return 1.0 - (double)num_common / (double)fewer_counts;
386 }
387 
388 
390  const CSparseKmerCounts& v1,
391  const CSparseKmerCounts& v2)
392 {
393  _ASSERT(GetKmerLength() > 0);
394 
395  unsigned int num_common = CountCommonKmers(v1, v2, true);
396 
397  unsigned int num_counts1 = v1.GetNumCounts();
398  unsigned int num_counts2 = v2.GetNumCounts();
399  unsigned int more_counts
400  = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
401 
402  // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
403  // SeqLen - k + 1 that is equal to number of counts only if sequence
404  // does not contain Xaa.
405  return 1.0 - (double)num_common / (double)more_counts;
406 }
407 
408 
410  const CSparseKmerCounts& vect1,
411  const CSparseKmerCounts& vect2,
412  bool repetitions)
413 
414 {
415 
416  unsigned int result = 0;
417  TNonZeroCounts_CI it1 = vect1.m_Counts.begin();
418  TNonZeroCounts_CI it2 = vect2.m_Counts.begin();
419 
420  // Iterating through non zero counts in both vectors
421  do {
422  // For each vector element that is non zero in vect1 and vect2
423  while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
424  && it1->position == it2->position) {
425 
426  // Increase number of common kmers found
427  if (repetitions) {
428  result += (unsigned)(it1->value < it2->value
429  ? it1->value : it2->value);
430  }
431  else {
432  result++;
433  }
434  ++it1;
435  ++it2;
436  }
437 
438  //Finding the next pair of non-zero element in both vect1 and vect2
439 
440  while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
441  && it1->position < it2->position) {
442  ++it1;
443  }
444 
445  while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
446  && it2->position < it1->position) {
447  ++it2;
448  }
449 
450 
451  } while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end());
452 
453  return result;
454 }
455 
456 
458 {
459  // Reserve memory for storing counts of all possible k-mers
460  // compute number of bits needed to represent all letters
461  unsigned int mask = 1;
462  int num_bits = 0;
463  while (sm_AlphabetSize > mask) {
464  mask <<= 1;
465  num_bits++;
466  }
467 
468  sm_Buffer = ReserveCountsMem(num_bits);
469 }
470 
472 {
473  if (sm_Buffer) {
474  delete [] sm_Buffer;
475  }
476  sm_Buffer = NULL;
477  sm_ForceSmallerMem = false;
478 }
479 
480 
482 {
484  it != EndNonZero();++it) {
485  ostr << it->position << ":" << (int)it->value << " ";
486  }
487  ostr << NcbiEndl;
488 
489  return ostr;
490 }
491 
492 
493 CBinaryKmerCounts::CBinaryKmerCounts(const objects::CSeq_loc& seq,
494  objects::CScope& scope)
495 {
496  Reset(seq, scope);
497 }
498 
499 
500 void CBinaryKmerCounts::Reset(const objects::CSeq_loc& seq,
501  objects::CScope& scope)
502 {
503  unsigned int kmer_len = sm_KmerLength;
504  unsigned int alphabet_size = sm_AlphabetSize;
505 
506  _ASSERT(kmer_len > 0 && alphabet_size > 0);
507 
508  if (sm_UseCompressed && sm_TransTable->empty()) {
509  NCBI_THROW(CKmerCountsException, eInvalidOptions,
510  "Compressed alphabet selected, but translation table not"
511  " specified");
512  }
513 
514  if (!seq.IsWhole() && !seq.IsInt()) {
515  NCBI_THROW(CKmerCountsException, eUnsupportedSeqLoc,
516  "Unsupported SeqLoc encountered");
517  }
518 
519  _ASSERT(seq.GetId());
520  objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
521 
522  unsigned int num_elements;
523  unsigned int seq_len = sv.size();
524 
525  m_SeqLength = sv.size();
526  m_Counts.clear();
527  m_NumCounts = 0;
528 
529  if (m_SeqLength < kmer_len) {
530  NCBI_THROW(CKmerCountsException, eBadSequence,
531  "Sequence shorter than desired k-mer length");
532  }
533 
534  const int kBitChunk = sizeof(Uint4) * 8;
535 
536  // Vector of counts is first computed using regular vector that is later
537  // converted to the sparse vector (list of position-value pairs).
538  // Positions are calculated as binary representations of k-mers, if they
539  // fit in 32 bits. Otherwise as numbers in system with base alphabet size.
540 
541  _ASSERT(pow((double)alphabet_size, (double)kmer_len)
543 
544  AutoArray<double> base(kmer_len);
545  for (Uint4 i=0;i < kmer_len;i++) {
546  base[i] = pow((double)alphabet_size, (double)i);
547  }
548 
549  num_elements = (Uint4)pow((double)alphabet_size, (double)kmer_len);
550 
551  m_Counts.resize(num_elements / 32 + 1, (Uint4)0);
552 
553  Uint4 pos;
554  unsigned i = 0;
555 
556  // find the first k-mer that does not contain Xaa
557  bool is_xaa;
558  do {
559  is_xaa = false;
560  for (unsigned j=0;j < kmer_len && i < seq_len - kmer_len + 1;j++) {
561 
562  if (sv[i + j] == kXaa) {
563  i += kmer_len;
564  is_xaa = true;
565  break;
566  }
567  }
568  } while (i < seq_len - kmer_len + 1 && is_xaa);
569  // if sequences contains only Xaa's then exit
570  if (i >= seq_len - kmer_len + 1) {
571  return;
572  }
573 
574  // for each subsequence of kmer_len residues
575  for (;i < seq_len - kmer_len + 1;i++) {
576 
577  // k-mers that contain unspecified amino acid X are not considered
578  if (sv[i + kmer_len - 1] == kXaa) {
579 
580  // move k-mer window past Xaa
581  i += kmer_len;
582 
583  // find first k-mer that does not contain Xaa
584  do {
585  is_xaa = false;
586  for (unsigned j=0;j < kmer_len && i < seq_len - kmer_len + 1;
587  j++) {
588 
589  if (sv[i + j] == kXaa) {
590  i += kmer_len;
591  is_xaa = true;
592  break;
593  }
594  }
595  } while (i < seq_len - kmer_len + 1 && is_xaa);
596 
597  // if Xaa are found till the end of sequence exit
598  if (i >= seq_len - kmer_len + 1) {
599  break;
600  }
601  }
602 
603  pos = GetAALetter(sv[i]) - 1;
604  _ASSERT(GetAALetter(sv[i]) <= alphabet_size);
605  for (Uint4 j=1;j < kmer_len;j++) {
606  pos += (Uint4)(((double)GetAALetter(sv[i + j]) - 1) * base[j]);
607  _ASSERT(GetAALetter(sv[i + j]) <= alphabet_size);
608  }
609  MarkUsed(pos, m_Counts, kBitChunk);
610  }
611 
612  m_NumCounts = 0;
613  for (size_t i=0;i < m_Counts.size();i++) {
615  }
616 }
617 
618 
620  const CBinaryKmerCounts& v2)
621 {
622  _ASSERT(GetKmerLength() > 0);
623 
624  unsigned int num_common = CountCommonKmers(v1, v2);
625 
626  unsigned int num_counts1 = v1.GetNumCounts();
627  unsigned int num_counts2 = v2.GetNumCounts();
628  unsigned int fewer_counts
629  = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
630 
631  // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
632  // SeqLen - k + 1 that is equal to number of counts only if sequence
633  // does not contain Xaa.
634  return 1.0 - (double)num_common / (double)fewer_counts;
635 }
636 
637 
639  const CBinaryKmerCounts& v1,
640  const CBinaryKmerCounts& v2)
641 {
642  _ASSERT(GetKmerLength() > 0);
643 
644  unsigned int num_common = CountCommonKmers(v1, v2);
645 
646  unsigned int num_counts1 = v1.GetNumCounts();
647  unsigned int num_counts2 = v2.GetNumCounts();
648  unsigned int more_counts
649  = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
650 
651  // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
652  // SeqLen - k + 1 that is equal to number of counts only if sequence
653  // does not contain Xaa.
654  return 1.0 - (double)num_common / (double)more_counts;
655 }
656 
657 
659  const CBinaryKmerCounts& vect1,
660  const CBinaryKmerCounts& vect2)
661 {
662  unsigned int result = 0;
663  const Uint4* counts1 = &vect1.m_Counts[0];
664  const Uint4* counts2 = &vect2.m_Counts[0];
665  size_t size = vect1.m_Counts.size();
666 
667  for (size_t i=0;i < size;i++) {
668  result += x_Popcount(counts1[i] & counts2[i]);
669  }
670 
671  return result;
672 }
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
ncbi::TMaskedQueryRegions mask
AutoArray –.
Definition: ncbimisc.hpp:527
K-mer counts implemented as bit vectors.
Definition: kmercounts.hpp:255
static unsigned int GetKmerLength(void)
Get k-mer length.
Definition: kmercounts.hpp:289
static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
Definition: kmercounts.cpp:638
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Compute counts.
Definition: kmercounts.cpp:500
CBinaryKmerCounts(void)
Constructor.
Definition: kmercounts.hpp:260
unsigned int GetNumCounts(void) const
Get number of k-mers.
Definition: kmercounts.hpp:284
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:398
static double FractionCommonKmersDist(const CBinaryKmerCounts &vect1, const CBinaryKmerCounts &vect2)
Definition: kmercounts.cpp:619
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
Definition: kmercounts.hpp:381
static CSafeStatic< vector< Uint1 > > sm_TransTable
Definition: kmercounts.hpp:399
static bool sm_UseCompressed
Definition: kmercounts.hpp:400
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:370
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:397
vector< Uint4 > m_Counts
Definition: kmercounts.hpp:394
static unsigned int CountCommonKmers(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
Compute number of common kmers between two count vectors.
Definition: kmercounts.cpp:658
Exception class for Kmer counts.
Definition: kmercounts.hpp:407
CSafeStatic<>::
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
Definition: kmercounts.hpp:62
static unsigned int sm_AlphabetSize
Definition: kmercounts.hpp:243
static unsigned int GetKmerLength(void)
Get default kmer length.
Definition: kmercounts.hpp:116
CSparseKmerCounts(void)
Create empty counts vector.
Definition: kmercounts.hpp:87
static TCount * ReserveCountsMem(unsigned int num_bits)
Definition: kmercounts.cpp:121
vector< SVectorElement > m_Counts
Definition: kmercounts.hpp:239
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
Definition: kmercounts.hpp:81
static double FractionCommonKmersGlobalDist(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2)
Definition: kmercounts.cpp:389
unsigned int m_NumCounts
Definition: kmercounts.hpp:241
static TCount * sm_Buffer
Definition: kmercounts.hpp:246
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
Definition: kmercounts.cpp:457
static CSafeStatic< vector< Uint1 > > sm_TransTable
Definition: kmercounts.hpp:244
unsigned int m_SeqLength
Definition: kmercounts.hpp:240
static void PostCount(void)
Perform post-kmer counting tasks.
Definition: kmercounts.cpp:471
static const unsigned int kLengthBitsThreshold
Definition: kmercounts.hpp:248
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:127
static unsigned int sm_KmerLength
Definition: kmercounts.hpp:242
static double FractionCommonKmersDist(const CSparseKmerCounts &vect1, const CSparseKmerCounts &vect2)
Definition: kmercounts.cpp:369
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator.
Definition: kmercounts.hpp:132
static Uint4 GetAALetter(Uint1 letter)
Definition: kmercounts.hpp:216
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Reset the counts vector.
Definition: kmercounts.cpp:157
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
Definition: kmercounts.hpp:111
static bool sm_UseCompressed
Definition: kmercounts.hpp:245
static unsigned int CountCommonKmers(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2, bool repetitions=true)
Compute number of common kmers between two count vectors.
Definition: kmercounts.cpp:409
static bool sm_ForceSmallerMem
Definition: kmercounts.hpp:247
static bool InitPosBits(const objects::CSeqVector &sv, Uint4 &pos, unsigned int &index, Uint4 num_bits, Uint4 kmer_len)
Initializes element index as bit vector for first k letters, skipping Xaa.
Definition: kmercounts.cpp:82
CNcbiOstream & Print(CNcbiOstream &ostr) const
Print counts.
Definition: kmercounts.cpp:481
int offset
Definition: replacements.h:160
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:581
void reset(element_type *p=0)
Reset will delete the old pointer, set content to the new value, and assume the ownership upon the ne...
Definition: ncbimisc.hpp:598
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
USING_SCOPE(cobalt)
static void MarkUsed(Uint4 pos, vector< Uint4 > &entries, int chunk)
Definition: kmercounts.cpp:112
USING_NCBI_SCOPE
Definition: kmercounts.cpp:50
static const Uint1 kXaa
Definition: kmercounts.cpp:66
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines NCBI C++ exception handling.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
T max(T x_, T y_)
Element of the sparse vector.
Definition: kmercounts.hpp:68
#define _ASSERT
else result
Definition: token2.c:20
static wxAcceleratorEntry entries[3]
Modified on Fri Sep 20 14:57:56 2024 by modify_doxy.py rev. 669887