1 #ifndef ALGO_COBALT___KMERCOUNTS__HPP
2 #define ALGO_COBALT___KMERCOUNTS__HPP
95 objects::CScope& scope);
101 void Reset(
const objects::CSeq_loc& seq, objects::CScope& scope);
144 {sm_KmerLength =
len; sm_ForceSmallerMem =
false;}
150 {sm_AlphabetSize =
size; sm_ForceSmallerMem =
false;}
201 bool repetitions =
true);
206 static void PreCount(
void);
210 static void PostCount(
void);
214 static TCount* ReserveCountsMem(
unsigned int num_bits);
218 _ASSERT(!sm_UseCompressed || letter < sm_TransTable->
size());
233 static bool InitPosBits(
const objects::CSeqVector& sv,
Uint4& pos,
234 unsigned int& index,
Uint4 num_bits,
248 static const unsigned int kLengthBitsThreshold = 32;
268 objects::CScope& scope);
274 void Reset(
const objects::CSeq_loc& seq, objects::CScope& scope);
301 {sm_KmerLength =
len;}
307 {sm_AlphabetSize =
size;}
372 _ASSERT(!sm_UseCompressed || letter < sm_TransTable->
size());
384 v = v - ((v >> 1) & 0x55555555);
385 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
386 v = ((v + (v >> 4)) & 0xF0F0F0F);
424 template <
class TKmerCounts>
447 static void SetParams(
unsigned kmer_len,
unsigned alphabet_size)
449 TKmerCounts::SetKmerLength(kmer_len);
450 TKmerCounts::SetAlphabetSize(alphabet_size);
451 TKmerCounts::SetTransTable().clear();
452 TKmerCounts::SetUseCompressed(
false);
461 vector<Uint1>& trans_table,
462 unsigned alphabet_len)
467 const char* kCompAlphabets[] = {
469 "IJLMV AST BDENZ KQR G FY P H C W",
471 "ST IJV LM KR EQZ A G BD P N F Y H C W"
475 const char* trans_string = kCompAlphabets[alph_index
478 Uint4 compressed_letter = 1;
480 trans_table.resize(alphabet_len + 1, 0);
481 for (
Uint4 i = 0;
i < strlen(trans_string);
i++) {
485 else if (
isalpha(trans_string[
i])) {
488 _ASSERT(aa_letter < trans_table.size());
490 trans_table[aa_letter] = compressed_letter;
500 TKmerCounts::SetKmerLength(kmer_len);
508 TKmerCounts::SetTransTable(),
511 TKmerCounts::SetUseCompressed(
true);
518 TKmerCounts::SetTransTable(),
521 TKmerCounts::SetUseCompressed(
true);
526 TKmerCounts::SetTransTable().clear();
527 TKmerCounts::SetUseCompressed(
false);
536 objects::CScope& scope,
537 vector<TKmerCounts>& counts)
541 "Empty list of sequences");
546 TKmerCounts::PreCount();
549 counts.push_back(TKmerCounts(**it, scope));
552 TKmerCounts::PostCount();
561 double(*fsim)(
const TKmerCounts&,
const TKmerCounts&),
565 if (counts.empty()) {
567 "The list of k-mer counts vectors is empty");
570 dmat.
Resize(counts.size(), counts.size(), 0.0);
571 for (
int i=0;
i < (
int)counts.size() - 1;
i++) {
572 for (
int j=
i+1;j < (
int)counts.size();j++) {
573 dmat(
i, j) = fsim(counts[
i], counts[j]);
574 dmat(j,
i) = dmat(
i, j);
588 double(*fsim)(
const TKmerCounts&,
const TKmerCounts&,
double,
double),
589 const vector<double>& normalizers);
601 switch (dist_method) {
609 TKmerCounts::FractionCommonKmersGlobalDist,
615 "Unrecognised distance measure");
627 const vector<TKmerCounts>& counts,
630 unique_ptr<TDistMatrix> dmat(
new TDistMatrix(counts.size(),
652 if (counts.size() < 2) {
654 " computed for at least two k-mer counts vectors");
659 for (
int i=0;
i < (
int)counts.size()-1;
i++) {
660 for (
int j=
i+1;j < (
int)counts.size();j++) {
662 dist = TKmerCounts::FractionCommonKmersDist(counts[
i],
666 dist = TKmerCounts::FractionCommonKmersGlobalDist(counts[
i],
670 if (dist <= max_dist) {
Definitions used by all COBALT aligner components.
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
K-mer counts implemented as bit vectors.
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
static unsigned int GetKmerLength(void)
Get k-mer length.
unsigned int GetSeqLength(void) const
Get sequence length.
static void PostCount(void)
Perform post-kmer counting tasks.
static unsigned int GetAlphabetSize(void)
Get alphabet size.
static void SetKmerLength(unsigned len)
Set default k-mer length.
CBinaryKmerCounts(void)
Constructor.
unsigned int GetNumCounts(void) const
Get number of k-mers.
static unsigned int sm_AlphabetSize
static void SetAlphabetSize(unsigned size)
Set default alphabet size.
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
static CSafeStatic< vector< Uint1 > > sm_TransTable
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
static bool sm_UseCompressed
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
static Uint4 GetAALetter(Uint1 letter)
static unsigned int sm_KmerLength
Exception class for k-mer counts.
NCBI_EXCEPTION_DEFAULT(CKmerCountsException, CException)
Set of edges with weights between nodes represented by zero-based positive integers.
void AddLink(int first, int second, double weight)
Add link.
void Resize(size_t i, size_t j, T val=T())
Resize this matrix, filling the empty cells with a known value.
K-mer counts for alignment-free sequence similarity computation, implemented as a sparse vector.
static unsigned int sm_AlphabetSize
static unsigned int GetKmerLength(void)
Get default k-mer length.
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
CSparseKmerCounts(void)
Create empty counts vector.
vector< SVectorElement > m_Counts
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
static TCount * sm_Buffer
static CSafeStatic< vector< Uint1 > > sm_TransTable
static unsigned int GetAlphabetSize(void)
Get default alphabet size.
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator pointing to the beginning.
static unsigned int sm_KmerLength
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator pointing to the end.
static Uint4 GetAALetter(Uint1 letter)
static void SetAlphabetSize(unsigned size)
Set default alphabet size.
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
static bool sm_UseCompressed
static bool sm_ForceSmallerMem
unsigned int GetSeqLength(void) const
Get sequence length.
static void SetKmerLength(unsigned len)
Set default k-mer length.
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Interface for computing and manipulating k-mer counts vectors that allows for different implementatio...
static CRef< CLinks > ComputeDistLinks(const vector< TKmerCounts > &counts, EDistMeasures dist_method, double max_dist)
Compute distances between k-mer counts as graph where nodes are sequences and edges represent distanc...
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, double(*fsim)(const TKmerCounts &, const TKmerCounts &), TDistMatrix &dmat)
Compute matrix of distances between given counts vectors.
static void SetParams(unsigned kmer_len, ECompressedAlphabet alph)
Set default counts vector parameters for use with compressed alphabet.
static unique_ptr< TDistMatrix > ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method)
Compute distance matrix for given counts vectors and distance measure and avoid copying.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, TDistMatrix &dmat, double(*fsim)(const TKmerCounts &, const TKmerCounts &, double, double), const vector< double > &normalizers)
Compute matrix of distances between given list of counts vectors using distance function with additio...
static void BuildCompressedTranslation(ECompressedAlphabet alph_index, vector< Uint1 > &trans_table, unsigned alphabet_len)
Creates translation table for compressed alphabets.
CNcbiMatrix< double > TDistMatrix
@ eFractionCommonKmersLocal
@ eFractionCommonKmersGlobal
static void ComputeCounts(const vector< CRef< objects::CSeq_loc > > &seqs, objects::CScope &scope, vector< TKmerCounts > &counts)
Create k-mer counts vectors for given sequences.
static void ComputeDistMatrix(const vector< TKmerCounts > &counts, EDistMeasures dist_method, TDistMatrix &dmat)
Compute distance matrix for given counts vectors and distance measure.
static void SetParams(unsigned kmer_len, unsigned alphabet_size)
Set default counts vector parameters.
void Print(const CCompactSAMApplication::AlignInfo &ai)
static ulg compressed_len
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
#define NCBI_COBALT_EXPORT
unsigned int
A callback function used to compare two keys in a database.
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Static variables safety - create on demand, destroy on application termination.
Element of the sparse vector.
SVectorElement(Uint4 pos, TCount val)
Create vector element.
Uint4 position
position of non-zero element
SVectorElement(void)
Default constructor.
TCount value
value of non-zero element
static Uint4 letter(char c)