70 objects::CScope& scope)
89 while (index + kmer_len - 1 < sv.size() &&
i < kmer_len) {
92 if (sv[index +
i] ==
kXaa) {
99 pos |=
GetAALetter(sv[index +
i]) << (num_bits * (kmer_len -
i - 1));
114 int index = pos / chunk;
115 int offset = pos - index * chunk;
122 unsigned int num_bits)
136 counts =
new TCount[num_elements];
138 catch (std::bad_alloc) {
144 counts =
new TCount[num_elements];
146 catch (std::bad_alloc) {
148 "Memory cannot be allocated for k-mer counting."
149 " Try using compressed alphabet or smaller k.");
158 objects::CScope& scope)
163 _ASSERT(kmer_len > 0 && alphabet_size > 0);
167 "Compressed alphabet selected, but translation table not"
171 if (!seq.IsWhole() && !seq.IsInt()) {
173 "Unsupported SeqLoc encountered");
177 objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
179 unsigned int num_elements;
180 unsigned int seq_len = sv.size();
188 "Sequence shorter than desired k-mer length");
192 unsigned int mask = 1;
194 while (alphabet_size >
mask) {
198 const int kNumBits = num;
204 counts = tmp_counts.
get();
215 num_elements = 1 << (kNumBits * kmer_len);
216 const Uint4 kMask = num_elements - (1 << kNumBits);
219 memset(counts, 0, num_elements *
sizeof(
TCount));
221 const int kBitChunk =
sizeof(
Uint4) * 8;
224 vector<Uint4> used_entries(num_elements / kBitChunk + 1);
229 bool is_pos =
InitPosBits(sv, pos,
i, kNumBits, kmer_len);
233 MarkUsed(pos, used_entries, kBitChunk);
237 for (;
i < seq_len && is_pos;
i++) {
241 "Letter out of alphabet in sequnece");
250 if (
i >= seq_len || !is_pos) {
260 MarkUsed(pos, used_entries, kBitChunk);
268 Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
269 while (ind < num_elements / kBitChunk + 1) {
272 while (ind < num_bit_chunks && used_entries[ind] == 0) {
276 if (ind == num_bit_chunks) {
281 for (
Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
284 if ((used_entries[ind] &
mask) != 0) {
285 pos = ind * kBitChunk + j;
290 used_entries[ind] ^=
mask;
298 _ASSERT(pow((
double)alphabet_size, (
double)kmer_len)
303 base[
i] = pow((
double)alphabet_size, (
double)
i);
306 num_elements = (
Uint4)pow((
double)alphabet_size, (double)kmer_len);
309 memset(counts, 0, num_elements *
sizeof(
TCount));
312 const int kBitChunk =
sizeof(
Uint4) * 8;
313 vector<Uint4> used_entries(num_elements / kBitChunk + 1);
316 for (
unsigned i=0;
i < seq_len - kmer_len + 1;
i++) {
319 if (sv[
i + kmer_len - 1] ==
kXaa) {
326 for (
Uint4 j=1;j < kmer_len;j++) {
331 MarkUsed(pos, used_entries, kBitChunk);
338 Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
339 while (ind < num_elements / kBitChunk + 1) {
342 while (ind < num_bit_chunks && used_entries[ind] == 0) {
346 if (ind == num_bit_chunks) {
351 for (
Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
354 if ((used_entries[ind] &
mask) != 0) {
355 pos = ind * kBitChunk + j;
360 used_entries[ind] ^=
mask;
378 unsigned int num_counts2 =
v2.GetNumCounts();
379 unsigned int fewer_counts
380 = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
385 return 1.0 - (double)num_common / (
double)fewer_counts;
398 unsigned int num_counts2 =
v2.GetNumCounts();
399 unsigned int more_counts
400 = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
405 return 1.0 - (double)num_common / (
double)more_counts;
424 && it1->position == it2->position) {
428 result += (unsigned)(it1->value < it2->value
429 ? it1->value : it2->value);
441 && it1->position < it2->position) {
446 && it2->position < it1->position) {
461 unsigned int mask = 1;
485 ostr << it->position <<
":" << (
int)it->value <<
" ";
494 objects::CScope& scope)
501 objects::CScope& scope)
506 _ASSERT(kmer_len > 0 && alphabet_size > 0);
510 "Compressed alphabet selected, but translation table not"
514 if (!seq.IsWhole() && !seq.IsInt()) {
516 "Unsupported SeqLoc encountered");
520 objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
522 unsigned int num_elements;
523 unsigned int seq_len = sv.size();
531 "Sequence shorter than desired k-mer length");
534 const int kBitChunk =
sizeof(
Uint4) * 8;
541 _ASSERT(pow((
double)alphabet_size, (
double)kmer_len)
546 base[
i] = pow((
double)alphabet_size, (
double)
i);
549 num_elements = (
Uint4)pow((
double)alphabet_size, (double)kmer_len);
560 for (
unsigned j=0;j < kmer_len &&
i < seq_len - kmer_len + 1;j++) {
562 if (sv[
i + j] ==
kXaa) {
568 }
while (
i < seq_len - kmer_len + 1 && is_xaa);
570 if (
i >= seq_len - kmer_len + 1) {
575 for (;
i < seq_len - kmer_len + 1;
i++) {
578 if (sv[
i + kmer_len - 1] ==
kXaa) {
586 for (
unsigned j=0;j < kmer_len &&
i < seq_len - kmer_len + 1;
589 if (sv[
i + j] ==
kXaa) {
595 }
while (
i < seq_len - kmer_len + 1 && is_xaa);
598 if (
i >= seq_len - kmer_len + 1) {
605 for (
Uint4 j=1;j < kmer_len;j++) {
627 unsigned int num_counts2 =
v2.GetNumCounts();
628 unsigned int fewer_counts
629 = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
634 return 1.0 - (double)num_common / (
double)fewer_counts;
647 unsigned int num_counts2 =
v2.GetNumCounts();
648 unsigned int more_counts
649 = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
654 return 1.0 - (double)num_common / (
double)more_counts;
667 for (
size_t i=0;
i <
size;
i++) {
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
K-mer counts implemented as bit vectors.
static unsigned int GetKmerLength(void)
Get k-mer length.
static double FractionCommonKmersGlobalDist(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Compute counts.
CBinaryKmerCounts(void)
Constructor.
unsigned int GetNumCounts(void) const
Get number of k-mers.
static unsigned int sm_AlphabetSize
static double FractionCommonKmersDist(const CBinaryKmerCounts &vect1, const CBinaryKmerCounts &vect2)
static Uint4 x_Popcount(Uint4 v)
Get number of set bits (adapted from http://graphics.stanford.edu/~seander/bithacks....
static CSafeStatic< vector< Uint1 > > sm_TransTable
static bool sm_UseCompressed
static Uint4 GetAALetter(Uint1 letter)
static unsigned int sm_KmerLength
static unsigned int CountCommonKmers(const CBinaryKmerCounts &v1, const CBinaryKmerCounts &v2)
Copmute number of common kmers between two count vectors.
Exception class for Kmer counts.
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
static unsigned int sm_AlphabetSize
static unsigned int GetKmerLength(void)
Get default kmer length.
CSparseKmerCounts(void)
Create empty counts vector.
static TCount * ReserveCountsMem(unsigned int num_bits)
vector< SVectorElement > m_Counts
vector< SVectorElement >::const_iterator TNonZeroCounts_CI
static double FractionCommonKmersGlobalDist(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2)
static TCount * sm_Buffer
static void PreCount(void)
Perform preparations before k-mer counting common to all sequences.
static CSafeStatic< vector< Uint1 > > sm_TransTable
static void PostCount(void)
Perform post-kmer counting tasks.
static const unsigned int kLengthBitsThreshold
TNonZeroCounts_CI BeginNonZero(void) const
Get non-zero counts iterator.
static unsigned int sm_KmerLength
static double FractionCommonKmersDist(const CSparseKmerCounts &vect1, const CSparseKmerCounts &vect2)
TNonZeroCounts_CI EndNonZero(void) const
Get non-zero counts iterator.
static Uint4 GetAALetter(Uint1 letter)
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Reset the counts vector.
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
static bool sm_UseCompressed
static unsigned int CountCommonKmers(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2, bool repetitions=true)
Copmute number of common kmers between two count vectors.
static bool sm_ForceSmallerMem
static bool InitPosBits(const objects::CSeqVector &sv, Uint4 &pos, unsigned int &index, Uint4 num_bits, Uint4 kmer_len)
Initializes element index as bit vector for first k letters, skipping Xaa.
CNcbiOstream & Print(CNcbiOstream &ostr) const
Print counts.
element_type * get(void) const
Get pointer.
void reset(element_type *p=0)
Reset will delete the old pointer, set content to the new value, and assume the ownership upon the ne...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
unsigned int
A callback function used to compare two keys in a database.
static void MarkUsed(Uint4 pos, vector< Uint4 > &entries, int chunk)
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines NCBI C++ exception handling.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Element of the sparse vector.
static wxAcceleratorEntry entries[3]