1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
2 #define ALGO_COBALT___KMERCOUNTS__HPP
94 objects::CScope& scope);
100 void Reset(
const objects::CSeq_loc& seq, objects::CScope& scope);
143 {sm_KmerLength =
len; sm_ForceSmallerMem =
false;}
149 {sm_AlphabetSize =
size; sm_ForceSmallerMem =
false;}
200 bool repetitions =
true);
205 static void PreCount(
void);
209 static void PostCount(
void);
213 static TCount* ReserveCountsMem(
unsigned int num_bits);
232 static bool InitPosBits(
const objects::CSeqVector& sv,
Uint4& pos,
233 unsigned int& index,
Uint4 num_bits,
247 static const unsigned int kLengthBitsThreshold = 32;
267 objects::CScope& scope);
273 void Reset(
const objects::CSeq_loc& seq, objects::CScope& scope);
300 {sm_KmerLength =
len;}
306 {sm_AlphabetSize =
size;}
383 v = v - ((v >> 1) & 0x55555555);
384 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
385 v = ((v + (v >> 4)) & 0xF0F0F0F);
423 template <
class TKmerCounts>
446 static void SetParams(
unsigned kmer_len,
unsigned alphabet_size)
448 TKmerCounts::SetKmerLength(kmer_len);
449 TKmerCounts::SetAlphabetSize(alphabet_size);
450 TKmerCounts::SetTransTable().clear();
451 TKmerCounts::SetUseCompressed(
false);
460 vector<Uint1>& trans_table,
461 unsigned alphabet_len)
466 const char* kCompAlphabets[] = {
468 "IJLMV AST BDENZ KQR G FY P H C W",
470 "ST IJV LM KR EQZ A G BD P N F Y H C W"
474 const char* trans_string = kCompAlphabets[alph_index
477 Uint4 compressed_letter = 1;
479 trans_table.resize(alphabet_len + 1, 0);
480 for (
Uint4 i = 0;
i < strlen(trans_string);
i++) {
484 else if (
isalpha(trans_string[
i])) {
487 _ASSERT(aa_letter < trans_table.size());
489 trans_table[aa_letter] = compressed_letter;
499 TKmerCounts::SetKmerLength(kmer_len);
507 TKmerCounts::SetTransTable(),
510 TKmerCounts::SetUseCompressed(
true);
517 TKmerCounts::SetTransTable(),
520 TKmerCounts::SetUseCompressed(
true);
525 TKmerCounts::SetTransTable().clear();
526 TKmerCounts::SetUseCompressed(
false);
535 objects::CScope& scope,
536 vector<TKmerCounts>& counts)
540 "Empty list of sequences");
545 TKmerCounts::PreCount();
548 counts.push_back(TKmerCounts(**it, scope));
551 TKmerCounts::PostCount();
560 double(*fsim)(
const TKmerCounts&,
const TKmerCounts&),
564 if (counts.empty()) {
566 "The list of k-mer counts vectors is empty");
569 dmat.
Resize(counts.size(), counts.size(), 0.0);
570 for (
int i=0;
i < (
int)counts.size() - 1;
i++) {
571 for (
int j=
i+1;j < (
int)counts.size();j++) {
572 dmat(
i, j) = fsim(counts[
i], counts[j]);
573 dmat(j,
i) = dmat(
i, j);
587 double(*fsim)(
const TKmerCounts&,
const TKmerCounts&,
double,
double),
588 const vector<double>& normalizers);
600 switch (dist_method) {
608 TKmerCounts::FractionCommonKmersGlobalDist,
614 "Unrecognised distance measure");
626 const vector<TKmerCounts>& counts,
629 unique_ptr<TDistMatrix> dmat(
new TDistMatrix(counts.size(),
651 if (counts.size() < 2) {
653 " computed for at least two k-mer counts vectors");
658 for (
int i=0;
i < (
int)counts.size()-1;
i++) {
659 for (
int j=
i+1;j < (
int)counts.size();j++) {
661 dist = TKmerCounts::FractionCommonKmersDist(counts[
i],
665 dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[
i],
669 if (dist <= max_dist) {
Definitions used by all COBALT aligner components.
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
K-mer counts implemented as bit vectors.
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
static unsigned int GetKmerLength(void)
Get k-mer length.
static vector< Uint1 > sm_TransTable
unsigned int GetSeqLength(void) const
Get sequence length.
static void PostCount(void)
Perform post-kmer counting tasks.
static unsigned int GetAlphabetSize(void)
Get alphabet size.
static void SetKmerLength(unsigned len)
Set default k-mer length.
CBinaryKmerCounts(void)
Constructor.
unsigned int GetNumCounts(void) const
Get number of k-mers.
static unsigned int sm_AlphabetSize
static void SetAlphabetSize(unsigned size)
Set default alphabet size.
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks.html).
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
static bool sm_UseCompressed
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
static Uint4 GetAALetter(Uint1 letter)
static unsigned int sm_KmerLength
Exception class for Kmer counts.
NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException)
Set of edges with weights between nodes represented by zero-based positive integers.
void AddLink(int first, int second, double weight)
Add link.
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
static unsigned int sm_AlphabetSize
static unsigned int GetKmerLength(void)
Get default kmer length.
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
CSparseKmerCounts(void)
Create empty counts vector.
vector< SVectorElement > m_Counts
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
static vector< Uint1 > sm_TransTable
static TCount * sm_Buffer
static unsigned int GetAlphabetSize(void)
Get default alphabet size.
TNonZeroCounts_CI BeginNonZero(void) const
Get iterator to the beginning of the non-zero counts.
static unsigned int sm_KmerLength
TNonZeroCounts_CI EndNonZero(void) const
Get iterator to the end of the non-zero counts.
static Uint4 GetAALetter(Uint1 letter)
static void SetAlphabetSize(unsigned size)
Set default alphabet size.
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
static bool sm_UseCompressed
static bool sm_ForceSmallerMem
unsigned int GetSeqLength(void) const
Get sequence length.
static void SetKmerLength(unsigned len)
Set default k-mer length.
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Interface for computing and manipulating k-mer counts vectors that allows for different implementatio...
static CRef< CLinks > ComputeDistLinks(const vector< TKmerCounts > &counts, EDistMeasures dist_method, double max_dist)
Compute distances between k-mer counts as graph where nodes are sequences and edges represent distanc...
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
static void SetParams(unsigned kmer_len, ECompressedAlphabet alph)
Set default counts vector parameters for use with compressed alphabet.
static unique_ptr< TDistMatrix > ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method)
Compute distance matrix for given counts vectors and distance measure and avoid copying.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, TDistMatrix &dmat, double(*fsim)(const TKmerCounts &, const TKmerCounts &, double, double), const vector< double > &normalizers)
Compute matrix of distances between given list of counts vectors using distance function with additio...
static void BuildCompressedTranslation(ECompressedAlphabet alph_index, vector< Uint1 > &trans_table, unsigned alphabet_len)
Creates translation table for compressed alphabets.
CNcbiMatrix< double > TDistMatrix
@ eFractionCommonKmersLocal
@ eFractionCommonKmersGlobal
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method, TDistMatrix &dmat)
Compute distance matrix for given counts vectors and distance measure.
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
void Print(const CCompactSAMApplication::AlignInfo &ai)
static ulg compressed_len
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
#define NCBI_COBALT_EXPORT
unsigned int
A callback function used to compare two keys in a database.
const struct ncbi::grid::netcache::search::fields::SIZE size
Element of the sparse vector.
SVectorElement(Uint4 pos, TCount val)
Create vector element.
Uint4 position
position of non-zero element
SVectorElement(void)
Default constructor.
TCount value
value of non-zero element
static Uint4 letter(char c)