NCBI C++ ToolKit
blastkmerutils.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastkmerutils.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  * File Description:
29  * Utilities for the BLAST kmer search.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
39 #include <math.h>
40 
43 
44 #include <util/random_gen.hpp>
45 
46 #include "pearson.hpp"
47 
50 USING_SCOPE(blast);
51 
53 
54 
55 // universal hashing mod 1048583.
57 {
59  return ( (a*x + b) % p );
60 }
61 
62 
63 /// FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html
65 {
66  const Uint4 fnv_prime = 16777619u;
67  const Uint4 fnv_offset_basis = 2166136261u;
68  Int4 i;
69  Uint4 hash;
70 
71  Uint1 key[4];
72  key[0] = num & 0xff;
73  key[1] = (num >> 8) & 0xff;
74  key[2] = (num >> 16) & 0xff;
75  key[3] = (num >> 24) & 0xff;
76 
77 
78  hash = fnv_offset_basis;
79  for (i = 0;i < 4;i++) {
80  hash *= fnv_prime;
81  hash ^= key[i];
82  }
83 
84  return hash;
85 }
86 
87 
88 // estimate Jaccard similarity using minhashes
89 inline double estimate_jaccard(vector<uint32_t>& query_hash, vector<uint32_t>& subject, int num_hashes)
90 {
91  int score=0;
92 
93  // help the compiler vectorize
94  uint32_t *a=&(query_hash[0]);
95  uint32_t *b=&(subject[0]);
96 
97  for(int h=0;h<num_hashes;h++)
98  {
99  if (a[h] == b[h])
100  score++;
101  }
102 
103  return (double) score / num_hashes;
104 }
105 
106 
107 // estimate Jaccard similarity with one hash function
108 inline double estimate_jaccard2(vector<uint32_t>& query_hash, vector<uint32_t>& subject, int num_hashes)
109 {
110  int score=0;
111 
112  // help the compiler vectorize
113  uint32_t *a=&(query_hash[0]);
114  uint32_t *b=&(subject[0]);
115 
116  int bindex=0;
117  for(int h=0;h<num_hashes;h++)
118  { // DOes a[h] < b[bindex] make any sense?
119  while (bindex < num_hashes && a[h] > b[bindex])
120  {
121  bindex++;
122  }
123  if (bindex == num_hashes)
124  break;
125  if (a[h] == b[bindex])
126  score++;
127  }
128 
129  return (double) score / num_hashes;
130 }
131 
132 set<uint32_t> BlastKmerGetKmerSetStats(const string& query_sequence, int kmerNum, map<string, int>& kmerCount, map<string, int>& kmerCountPlus, int alphabetChoice, bool perQuery)
133 {
134  // the set of unique kmers
135  set<uint32_t> kmer_set;
136 
137  vector<Uint1> trans_table;
138  BlastKmerGetCompressedTranslationTable(trans_table, alphabetChoice);
139 
140  int seq_length = static_cast<int>(query_sequence.length());
141  const char* query = query_sequence.c_str();
142 
143  // bail out if the sequence is too short
144  if (seq_length < kmerNum)
145  return kmer_set;
146 
147  map<string, int> kmerCountPriv;
148  map<string, int> kmerCountPlusPriv;
149 
150  for(int i=0;i<seq_length-kmerNum+1;i++)
151  {
152 
153  // compute its index
154  uint32_t index=0;
155  for (int kindex=0; kindex<kmerNum; kindex++)
156  { // Check for X in the kmer
157  if (query[i+kindex] == 21)
158  {
159  index=0;
160  break;
161  }
162  index = (index << 4);
163  index += trans_table[query[i+kindex]];
164  }
165  if (index != 0)
166  kmer_set.insert(index);
167  if (index != 0)
168  {
169  string kmer_str="";
170  kmer_str += NStr::NumericToString(index) + " ";
171 /*
172  for (int myindex=0; myindex<kmerNum; myindex++)
173  kmer_str += NStr::NumericToString((int)query[i+myindex]) + " ";
174 */
175  kmerCountPriv[kmer_str]++;
176  }
177  if (index != 0 && i<seq_length-kmerNum && query[i+kmerNum] != 21)
178  {
179  int index_short = index;
180  index = (index << 4);
181  index += trans_table[query[i+kmerNum]];
182  string kmer_str="";
183  kmer_str += NStr::NumericToString(index) + " " + NStr::NumericToString(index_short);
184  kmerCountPlusPriv[kmer_str]++;
185  }
186  }
187 
188  if (perQuery)
189  {
190  for (map<string, int>::iterator i=kmerCountPriv.begin(); i != kmerCountPriv.end(); ++i)
191  kmerCount[(*i).first]++;
192  for (map<string, int>::iterator i=kmerCountPlusPriv.begin(); i != kmerCountPlusPriv.end(); ++i)
193  kmerCountPlus[(*i).first]++;
194  }
195  else
196  {
197  for (map<string, int>::iterator i=kmerCountPriv.begin(); i != kmerCountPriv.end(); ++i)
198  kmerCount[(*i).first] += (*i).second;
199  for (map<string, int>::iterator i=kmerCountPlusPriv.begin(); i != kmerCountPlusPriv.end(); ++i)
200  kmerCountPlus[(*i).first] += (*i).second;
201  }
202 
203  return kmer_set;
204 }
205 
206 set<uint32_t> BlastKmerGetKmerSet(const string& query_sequence, bool do_seg, TSeqRange& range, int kmerNum, int alphabetChoice)
207 {
208  // the set of unique kmers
209  set<uint32_t> kmer_set;
210 
211  vector<Uint1> trans_table;
212  BlastKmerGetCompressedTranslationTable(trans_table, alphabetChoice);
213 
214  int seq_length = static_cast<int>(query_sequence.length());
215  const char* query = query_sequence.c_str();
216 
217  // bail out if the sequence is too short
218  if (seq_length < kmerNum)
219  return kmer_set;
220 
221  int chunk_length=range.GetLength();
222  char *query_private=(char*)malloc(chunk_length);
223  int private_index=0;
224  for (unsigned int i=range.GetFrom(); i<=range.GetTo(); i++, private_index++)
225  query_private[private_index] = query[i];
226 
227  if (do_seg)
228  {
229  // filter the query to remove overrepresented regions
231  BlastSeqLoc* seq_locs = NULL;
232  SeqBufferSeg((unsigned char *)query_private, chunk_length, 0, sp, &seq_locs);
233  SegParametersFree(sp);
234 
235  // mask out low complexity regions with X residues
236  for(BlastSeqLoc *itr = seq_locs; itr; itr = itr->next)
237  {
238  for(int i=itr->ssr->left; i <= itr->ssr->right; i++)
239  query_private[i]=21;
240  }
241 
242  BlastSeqLocFree(seq_locs);
243  }
244 
245  for(int i=0;i<chunk_length-kmerNum+1;i++)
246  {
247 
248  // compute its index
249  uint32_t index=0;
250  for (int kindex=0; kindex<kmerNum; kindex++)
251  { // Check for X in the kmer
252  if (query_private[i+kindex] == 21)
253  {
254  index=0;
255  break;
256  }
257  index = (index << 4);
258  index += trans_table[query_private[i+kindex]];
259  }
260  if (index != 0)
261  kmer_set.insert(index);
262  }
263 
264  // free the sequence
265  free(query_private);
266 
267  return kmer_set;
268 }
269 
270 set<uint32_t> BlastKmerGetKmerSet2(const string& query_sequence, TSeqRange& range, int kmerNum, int alphabetChoice, vector<int> badMers)
271 {
272  // the set of unique kmers
273  set<uint32_t> kmer_set;
274 
275  vector<Uint1> trans_table;
276  BlastKmerGetCompressedTranslationTable(trans_table, alphabetChoice);
277 
278  int seq_length = static_cast<int>(query_sequence.length());
279  const char* query = query_sequence.c_str();
280 
281  // bail out if the sequence is too short
282  if (seq_length < kmerNum)
283  return kmer_set;
284 
285  int chunk_length=range.GetLength();
286  char *query_private=(char*)malloc(chunk_length);
287  int private_index=0;
288  for (unsigned int i=range.GetFrom(); i<=range.GetTo(); i++, private_index++)
289  query_private[private_index] = query[i];
290 
291  for(int i=0;i<chunk_length-kmerNum+1;i++)
292  {
293 
294  // compute its index
295  uint32_t index=0;
296  for (int kindex=0; kindex<kmerNum; kindex++)
297  { // Check for X in the kmer
298  if (query_private[i+kindex] == 21)
299  {
300  index=0;
301  break;
302  }
303  index = (index << 4);
304  index += trans_table[query_private[i+kindex]];
305  }
306  if (index != 0)
307  {
308  if (i < chunk_length-kmerNum && !badMers.empty())
309  {
310  std::vector<int>::iterator it;
311  it = std::find(badMers.begin(), badMers.end(), index);
312  if (it != badMers.end() && i < chunk_length-1)
313  {
314  index = (index << 4);
315  index += trans_table[query_private[i+kmerNum]];
316  }
317  }
318  kmer_set.insert(index);
319  }
320  }
321 
322  // free the sequence
323  free(query_private);
324 
325  return kmer_set;
326 }
327 
328 
329 
330 void BlastKmerGetCompressedTranslationTable(vector<Uint1>& trans_table, int alphabetChoice)
331 {
332  const unsigned int kAlphabetLen = 28;
333  // Compressed alphabets taken from
334  // Shiryev et al.(2007), Bioinformatics, 23:2949-2951
335  const char* kCompAlphabets[] = {
336  // 23-to-15 letter compressed alphabet. Based on SE_B(14)
337  "ST IJV LM KR EQZ A G BD P N F Y H C W",
338  // 23-to-10 letter compressed alphabet. Based on SE-V(10)
339  "IJLMV AST BDENZ KQR G FY P H C W"
340  };
341 
342  // Use second alphabet only
343  const char* trans_string = kCompAlphabets[alphabetChoice];
344 
345  Uint4 compressed_letter = 1; // this allows for gaps
346  trans_table.clear();
347  trans_table.resize(kAlphabetLen + 1, 0);
348  for (Uint4 i = 0; i < strlen(trans_string);i++) {
349  if (isspace(trans_string[i])) {
350  compressed_letter++;
351  }
352  else if (isalpha(trans_string[i])) {
353  Uint1 aa_letter = AMINOACID_TO_NCBISTDAA[(int)trans_string[i]];
354 
355  _ASSERT(aa_letter < trans_table.size());
356 
357  trans_table[aa_letter] = compressed_letter;
358  }
359  }
360 }
361 
362 int BlastKmerBreakUpSequence(int length, vector<TSeqRange>& range_v, int ChunkSize)
363 {
364  int numChunks=0;
365  int newChunkSize=0;
366  const int kOverlap=50;
367  // Adjust chunk size so chunks are all about same length.
368  if(length <= ChunkSize)
369  {
370  numChunks=1;
371  newChunkSize=length;
372  } else {
373  if (ChunkSize <= kOverlap)
374  numChunks=1;
375  else
376  numChunks = MAX(1, (length - kOverlap)/(ChunkSize - kOverlap));
377  newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
378  if(newChunkSize > 1.1*ChunkSize)
379  {
380  numChunks++;
381  newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
382  }
383  }
384  ChunkSize=newChunkSize;
385  TSeqPos last=0;
386  for (int i=0; i<numChunks; i++)
387  {
388  TSeqPos end = last + ChunkSize;
389  end = MIN(end, (TSeqPos)(length-1));
391  range.SetFrom(last);
392  range.SetTo(end);
393  range_v.push_back(range);
394  last = last + ChunkSize - kOverlap;
395  }
396  return numChunks;
397 }
398 
399 int BlastKmerGetDistance(const vector<uint32_t>& minhash1, const vector<uint32_t>& minhash2)
400 {
401 
402  int distance=0;
403  int num_hashes = static_cast<int>(minhash1.size());
404 
405  for (int index=0; index<num_hashes; index++)
406  {
407  if (minhash1[index] != minhash2[index])
408  distance++;
409  }
410 
411  return distance;
412 }
413 
414 // same as above without database
415 bool minhash_query(const string& query,
416  vector < vector <uint32_t> >& seq_hash,
417  int num_hashes,
418  uint32_t *a,
419  uint32_t *b,
420  int do_seg,
421  int kmerNum,
422  int alphabetChoice,
423  int chunkSize)
424 {
425  bool kmersFound=false; // return value;
426  int seq_length = static_cast<int>(query.length());
427  vector<TSeqRange> range_v;
428  int chunk_num = BlastKmerBreakUpSequence(seq_length, range_v, chunkSize);
429  seq_hash.resize(chunk_num);
430  bool seg = (do_seg > 0) ? true : false;
431 
432  vector<uint32_t> idx_tmp(num_hashes);
433  vector<uint32_t> hash_tmp(num_hashes);
434  int chunk_iter=0;
435  for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
436  {
437 
438  seq_hash[chunk_iter].resize(num_hashes);
439 
440  set<uint32_t> seq_kmer = BlastKmerGetKmerSet(query, seg, *iter, kmerNum, alphabetChoice);
441 
442  if (seq_kmer.empty())
443  continue;
444 
445  kmersFound = true;
446 
447  for(int h=0;h<num_hashes;h++)
448  {
449  hash_tmp[h]=0xffffffff;
450  idx_tmp[h]=0xffffffff;
451  }
452 
453  // for each kmer in the sequence,
454  for(set<uint32_t>::iterator i=seq_kmer.begin(); i != seq_kmer.end(); ++i)
455  {
456  // compute hashes and track their minima
457  for(int h=0;h<num_hashes;h++)
458  {
459  uint32_t hashval = uhash(*i, a[h], b[h]);
460 
461  if (hashval < hash_tmp[h])
462  {
463  hash_tmp[h] = hashval;
464  idx_tmp[h] = *i;
465  }
466  }
467  } // end each kmer
468 
469  // save the kmers with the minimum hash values
470  for(int h=0;h<num_hashes;h++)
471  {
472  seq_hash[chunk_iter][h] = idx_tmp[h];
473  }
474  }
475  return kmersFound;
476 }
477 
478 // Hash of kmers by one hash function
479 bool minhash_query2(const string& query,
480  vector < vector <uint32_t> >& seq_hash,
481  int kmerNum,
482  int numHashes,
483  int alphabetChoice,
484  vector<int> badMers,
485  int chunkSize)
486 {
487  bool kmersFound=false; // return value;
488  int seq_length = static_cast<int>(query.length());
489  vector<TSeqRange> range_v;
490  int chunk_num = BlastKmerBreakUpSequence(seq_length, range_v, chunkSize);
491  seq_hash.resize(chunk_num);
492 
493  vector<uint32_t> hash_values;
494  int chunk_iter=0;
495  for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
496  {
497  hash_values.clear();
498  seq_hash[chunk_iter].resize(numHashes);
499 
500  set<uint32_t> seq_kmer = BlastKmerGetKmerSet2(query, *iter, kmerNum, alphabetChoice, badMers);
501 
502  if (seq_kmer.empty())
503  continue;
504 
505  kmersFound = true;
506 
507  // for each kmer in the sequence,
508  for(set<uint32_t>::iterator i=seq_kmer.begin(); i != seq_kmer.end(); ++i)
509  {
510  uint32_t hashval = FNV_hash(*i);
511  hash_values.push_back(hashval);
512 
513  } // end each kmer
514 
515  if (hash_values.size() < static_cast<size_t>(numHashes))
516  {
517  int rem = 1 + numHashes - static_cast<int>(hash_values.size());
518  uint32_t hashval = 0xffffffff; // Fill in empties
519  for (int i=0; i<rem; i++)
520  hash_values.push_back(hashval);
521  }
522  std::sort(hash_values.begin(), hash_values.end());
523 
524  // save the kmers with the minimum hash values
525  for(int h=0;h<numHashes;h++)
526  seq_hash[chunk_iter][h] = hash_values[h];
527  }
528  return kmersFound;
529 }
530 
531 // Find candidate matches with LSH (based on array in index)
532 void get_LSH_hashes(vector < vector <uint32_t> >& query_hash,
533  vector < vector <uint32_t> >&lsh_hash_vec,
534  int num_bands,
535  int rows_per_band)
536 {
537  int num_chunks=static_cast<int>(query_hash.size());
538  uint32_t temp_hash=0;
539  for (int n=0; n<num_chunks; n++)
540  {
541  vector <uint32_t> lsh_chunk_vec;
542  for(int b=0;b<num_bands;b++)
543  {
544 
545  // LSH
546  unsigned char key[9];
547  for(int r=0;r<rows_per_band;r++)
548  {
549  temp_hash = query_hash[n][b*rows_per_band+r];
550  key[r*4] = (temp_hash) & 0xff;
551  key[1+r*4] = ((temp_hash) >> 8) & 0xff;
552  key[2+r*4] = ((temp_hash) >> 16) & 0xff;
553  key[3+r*4] = ((temp_hash) >> 24) & 0xff;
554  }
555  key[8] = (unsigned char) b;
556  uint32_t foo = do_pearson_hash(key, 9);
557  lsh_chunk_vec.push_back(foo);
558  }
559  std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
560  lsh_hash_vec.push_back(lsh_chunk_vec);
561  }
562 
563  return;
564 }
565 
566 void get_LSH_hashes5(vector < vector <uint32_t> >& query_hash,
567  vector < vector <uint32_t> >&lsh_hash_vec,
568  int numHashes,
569  int numRows)
570 {
571  int num_chunks=static_cast<int>(query_hash.size());
572  uint32_t temp_hash=0;
573  int numHashMax = numHashes - numRows + 1;
574  for (int n=0; n<num_chunks; n++)
575  {
576  vector <uint32_t> lsh_chunk_vec;
577  for(int b=0;b<numHashMax;b++)
578  {
579  // LSH
580  unsigned char key[8];
581  for(int r=0;r<numRows;r++)
582  {
583  temp_hash = query_hash[n][b+r];
584  key[r*4] = (temp_hash) & 0xff;
585  key[1+r*4] = ((temp_hash) >> 8) & 0xff;
586  key[2+r*4] = ((temp_hash) >> 16) & 0xff;
587  key[3+r*4] = ((temp_hash) >> 24) & 0xff;
588  }
589  uint32_t foo = do_pearson_hash(key, 4*numRows);
590  lsh_chunk_vec.push_back(foo);
591  }
592  for(int b=0;b<numHashMax-1;b++)
593  {
594  unsigned char key[8];
595  for(int r=0;r<numRows;r++)
596  {
597  temp_hash = query_hash[n][b+2*r];
598  key[r*4] = (temp_hash) & 0xff;
599  key[1+r*4] = ((temp_hash) >> 8) & 0xff;
600  key[2+r*4] = ((temp_hash) >> 16) & 0xff;
601  key[3+r*4] = ((temp_hash) >> 24) & 0xff;
602  }
603  uint32_t foo = do_pearson_hash(key, 4*numRows);
604  lsh_chunk_vec.push_back(foo);
605  }
606  std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
607  lsh_hash_vec.push_back(lsh_chunk_vec);
608  }
609 
610  return;
611 }
612 
613 void GetRandomNumbers(uint32_t* a, uint32_t* b, int numHashes)
614 {
615  CMutexGuard guard(randMutex);
616  CRandom random(1); // Always have the same random numbers.
617  for(int i=0;i<numHashes;i++)
618  {
619  do
620  {
621  a[i]=(random.GetRand()%PKMER_PRIME);
622  }
623  while (a[i] == 0);
624 
625  b[i]=(random.GetRand()%PKMER_PRIME);
626  }
627 }
628 
629 void GetKValues(vector< vector <int> >& kvector, int k_value, int l_value, int array_size)
630 {
631  CMutexGuard guard(randMutex);
632  CRandom random(10);
633 
634  for (int i=0; i<l_value; i++)
635  {
636  vector<int> ktemp;
637  for (int k=0; k<k_value; k++)
638  {
639  Uint1 temp=(Uint1) random.GetRand()%array_size;
640  ktemp.push_back(temp);
641  }
642  kvector.push_back(ktemp);
643  }
644  return;
645 }
646 
647 
648 // Find candidate matches with LSH.
649 // Based on article by Buhler (PMID:11331236) but applied to our arrays
650 // of hash functions rather than sequences.
651 void get_LSH_hashes2(vector < vector <uint32_t> >& query_hash,
652  vector < vector <uint32_t> >& lsh_hash_vec,
653  int num_k,
654  int num_l,
655  vector< vector<int> >& kvector)
656 {
657  int max=4*num_k+1;
658  vector<unsigned char> key(max);
659  int num_chunks=static_cast<int>(query_hash.size());
660  uint32_t temp_hash=0;
661  int temp_index=0;
662  for (int n=0; n<num_chunks; n++)
663  {
664  vector <uint32_t> lsh_chunk_vec;
665  for (int r=0; r<num_l; r++)
666  {
667  for (int i=0; i<num_k; i++)
668  {
669  temp_index=kvector[r][i];
670  temp_hash = query_hash[n][temp_index];
671  key[i*4] = (temp_hash) & 0xff;
672  key[1+i*4] = ((temp_hash) >> 8) & 0xff;
673  key[2+i*4] = ((temp_hash) >> 16) & 0xff;
674  key[3+i*4] = ((temp_hash) >> 24) & 0xff;
675  }
676  key[max-1] = r;
677  uint32_t foo = do_pearson_hash(key.data(), max);
678  lsh_chunk_vec.push_back(foo);
679  }
680  std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
681  lsh_hash_vec.push_back(lsh_chunk_vec);
682  }
683 }
684 
685 // Find candidate matches with LSH (based on array in index)
686 void get_LSH_match_from_hash(const vector< vector <uint32_t> >& query_LSH_hash,
687  const uint64_t* lsh_array,
688  vector< set<uint32_t > >& candidates)
689 {
690 
691  int num=static_cast<int>(query_LSH_hash.size());
692  for (int i=0; i<num; i++)
693  {
694  for(vector<uint32_t>::const_iterator iter=query_LSH_hash[i].begin(); iter != query_LSH_hash[i].end(); ++iter)
695  {
696  if (lsh_array[*iter] != 0)
697  candidates[i].insert(*iter);
698  }
699  }
700 
701  return;
702 }
703 
704 
705 void
706 s_HashHashQuery(const vector < vector <uint32_t> > & query_hash, vector < vector <uint32_t> >& query_hash_hash, int compress, int version)
707 {
708  int hash_value=0;
709  int num_chunks=static_cast<int>(query_hash.size());
710  const uint32_t kBig=0xffffffff;
711  for (int n=0; n<num_chunks; n++)
712  {
713  vector<uint32_t> tmp_hash;
714  for(vector<uint32_t>::const_iterator i = query_hash[n].begin(); i != query_hash[n].end(); ++i)
715  {
716  if (compress == 2)
717  hash_value = (int) pearson_hash_int2short(*i);
718  else if (compress == 1)
719  hash_value = (int) pearson_hash_int2byte(*i);
720  else
721  hash_value = *i;
722 
723  if (version == 3 && *i == kBig)
724  {
725  if (compress == 2)
726  tmp_hash.push_back(0xffff);
727  else if (compress == 1)
728  tmp_hash.push_back(0xff);
729  else
730  tmp_hash.push_back(kBig);
731  }
732  else
733  tmp_hash.push_back(hash_value);
734  }
735  if (version == 3) // kBig et al gets sorted to the end anyway.
736  std::sort(tmp_hash.begin(), tmp_hash.end());
737  query_hash_hash.push_back(tmp_hash);
738  }
739 }
740 
741 
742 
743 // find candidate matches with LSH
744 void neighbor_query(const vector < vector <uint32_t> >& query_hash,
745  const uint64_t* lsh,
746  vector < set<uint32_t> >& candidates,
747  CMinHashFile& mhfile,
748  int num_hashes,
749  int min_hits,
750  double thresh,
751  TBlastKmerPrelimScoreVector& score_vector,
752  BlastKmerStats& kmer_stats,
753  int kmerVer)
754 {
755  int num_chunks=static_cast<int>(query_hash.size());
756 
757  vector< vector<uint32_t> > candidate_oids;
758  candidate_oids.resize(num_chunks);
759 
760  for (int num=0; num<num_chunks; num++)
761  {
762  for(set<uint32_t>::iterator i=candidates[num].begin(); i != candidates[num].end(); ++i)
763  {
764  uint64_t offset = lsh[(*i)];
765  if(offset != 0)
766  {
767  int index=(*i)+1;
768  while (lsh[index] == 0)
769  index++;
770  uint64_t read_size = (lsh[index] - offset)/4;
771  int remaining = (int) read_size;
772  int* oid_offset = mhfile.GetHits(offset);
773  index=0;
774  while (index < remaining)
775  {
776  int oid = *(oid_offset+index);
777  candidate_oids[num].push_back(oid);
778  index++;
779  }
780  }
781  }
782  }
783 
784  for (int index=0; index<num_chunks; index++)
785  {
786  kmer_stats.hit_count += candidates[index].size();
787  kmer_stats.oids_considered += candidate_oids[index].size();
788  }
789 
790  vector < vector <uint32_t> > query_hash_hash;
791  s_HashHashQuery(query_hash, query_hash_hash, mhfile.GetDataWidth(), mhfile.GetVersion());
792 
793  vector <uint32_t> subject_hash;
794  subject_hash.resize(num_hashes);
795 
796  map< int, double > score_map;
797  typedef map< int, double >::value_type score_mapValType;
798  // for each candidate with hits estimate Jaccard distance.
799  for (int n=0; n<num_chunks; n++)
800  {
801  std::sort(candidate_oids[n].begin(), candidate_oids[n].end());
802  int last_oid = -1;
803  int oid_count=1;
804  for(vector<uint32_t>::iterator i = candidate_oids[n].begin(); i != candidate_oids[n].end(); ++i)
805  { // check that min_hits criteria satisfied.
806  if (last_oid == *i)
807  {
808  oid_count++;
809  if (min_hits != oid_count)
810  continue;
811  }
812  else
813  {
814  last_oid = *i;
815  oid_count = 1;
816  if (min_hits > 1)
817  continue;
818  }
819 
820  kmer_stats.jd_oid_count++;
821 
822  int oid = *i;
823  int subject_oid=0;
824  mhfile.GetMinHits(oid, subject_oid, subject_hash);
825  double current_score=0;
826  if (kmerVer < 3)
827  current_score = estimate_jaccard(query_hash_hash[n], subject_hash, num_hashes);
828  else
829  current_score = estimate_jaccard2(query_hash_hash[n], subject_hash, num_hashes);
830  kmer_stats.jd_count++;
831  if (current_score < thresh)
832  continue;
833  // cerr << "OID and score " << oid << " " << total_score << '\n';
834 
835  if (!score_map.count(subject_oid))
836  score_map.insert(score_mapValType(subject_oid, current_score));
837  else if (current_score > score_map[subject_oid])
838  score_map[subject_oid] = current_score;
839  }
840  }
841 
842  // Print out the score
843  for(map<int, double>::iterator i = score_map.begin(); i != score_map.end(); ++i)
844  {
845  double total_score = (*i).second;
846  if (total_score > thresh)
847  {
848  score_vector.push_back((*i));
849  kmer_stats.total_matches++;
850  }
851  }
852 }
853 
854 static int
855 s_BlastKmerVerifyVolume(CMinHashFile& mhfile, string& error_msg, int volume)
856 {
857  int num_hashes=mhfile.GetNumHashes();
858  if(num_hashes < 32)
859  {
860  error_msg = "Only " + NStr::NumericToString(num_hashes) + " hashes in volume " + NStr::NumericToString(volume);
861  return 1;
862  }
863  int kmerNum = mhfile.GetKmerSize();
864  if (kmerNum < 4)
865  {
866  error_msg = "Kmer size is only " + NStr::NumericToString(kmerNum) + " in volume " + NStr::NumericToString(volume);
867  return 1;
868  }
869  int numSeqs = mhfile.GetNumSeqs();
870  if (numSeqs < 1)
871  {
872  error_msg = "Only " + NStr::NumericToString(numSeqs) + " sequences in volume " + NStr::NumericToString(volume);
873  return 1;
874  }
875  int lshSize = mhfile.GetLSHSize();
876  if (lshSize < 1000)
877  {
878  error_msg = "LSH size is only " + NStr::NumericToString(lshSize) + " in volume " + NStr::NumericToString(volume);
879  return 1;
880  }
881  uint64_t* lsh_array = mhfile.GetLSHArray();
882  int i=0;
883  for (i=0; i<lshSize; i++)
884  {
885  if(lsh_array[i] > 0)
886  break;
887  }
888  if (i == lshSize)
889  {
890  error_msg = "LSH array is empty in volume " + NStr::NumericToString(volume);
891  return 1;
892  }
893  int subjectOid=0;
894  vector<uint32_t> hits;
895  mhfile.GetMinHits(3, subjectOid, hits);
896  if (subjectOid < 0)
897  {
898  error_msg = "Subject OID is less than zero: " + NStr::NumericToString(subjectOid) + " in volume " + NStr::NumericToString(volume);
899  return 1;
900  }
901  if (hits.size() != static_cast<size_t>(num_hashes))
902  {
903  error_msg = "Signature array only has only " + NStr::NumericToString((int) hits.size()) + " entries in volume " + NStr::NumericToString(volume);
904  return 1;
905  }
906  if (mhfile.GetVersion() == 3)
907  {
908  uint32_t last = 0;
909  for(vector<uint32_t>::iterator iter=hits.begin(); iter != hits.end(); ++iter)
910  {
911  if (last > *iter)
912  {
913  error_msg = "Signature array for version 3 index not in ascending order in volume " + NStr::NumericToString(volume);
914  return 1;
915  }
916  last = *iter;
917  }
918  }
919  return 0;
920 }
921 
922 int
923 BlastKmerVerifyIndex(CRef<CSeqDB> seqdb, string &error_msg)
924 {
925  int status=0;
926  vector<string> kmerFiles;
927  seqdb->FindVolumePaths(kmerFiles, false);
928  int numFiles = static_cast<int>(kmerFiles.size());
929  for (int i=0; i<numFiles; i++)
930  {
931  try {
932  CMinHashFile mhfile(kmerFiles[i]);
933  status = s_BlastKmerVerifyVolume(mhfile, error_msg, i);
934  }
935  catch (const blast::CMinHashException& e) {
936  error_msg = e.GetMsg();
937  status = 1;
938  }
939  catch (const CFileException& e) {
940  error_msg = e.GetMsg();
941  status = 1;
942  }
943  catch (...) {
944  error_msg = "Unknown error";
945  status = 1;
946  }
947  if (status != 0)
948  break;
949  }
950  return status;
951 }
952 
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
SEG filtering functions.
Int2 SeqBufferSeg(Uint1 *sequence, Int4 length, Int4 offset, SegParameters *sparamsp, BlastSeqLoc **seg_locs)
Runs seg on a protein sequence in ncbistdaa.
Definition: blast_seg.c:2281
SegParameters * SegParametersNewAa(void)
Allocated SeqParameter struct for proteins and fills with default values.
Definition: blast_seg.c:2225
void SegParametersFree(SegParameters *sparamsp)
Free SegParameters structure.
Definition: blast_seg.c:2272
USING_SCOPE(objects)
static int s_BlastKmerVerifyVolume(CMinHashFile &mhfile, string &error_msg, int volume)
uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)
void BlastKmerGetCompressedTranslationTable(vector< Uint1 > &trans_table, int alphabetChoice)
Creates translation table for compressed alphabets.
int BlastKmerVerifyIndex(CRef< CSeqDB > seqdb, string &error_msg)
double estimate_jaccard(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int ChunkSize)
Breaks a sequences up into chunks if the sequences is above a certain length.
int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
Calculates the number of differences between two minhash arrays.
void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
Gets the LSH hash for one hash function.
set< uint32_t > BlastKmerGetKmerSetStats(const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)
Simplified version of BlastKmerGetKmerSet.
DEFINE_STATIC_MUTEX(randMutex)
set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
Get KMERs for a given sequence using a compressed alphabet.
bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
Hash the query for the minimum values;.
static uint32_t FNV_hash(uint32_t num)
FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html.
void s_HashHashQuery(const vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &query_hash_hash, int compress, int version)
void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
Get the random numbers for the hash function.
set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
Get KMERs for a given sequence using a compressed alphabet.
void neighbor_query(const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVer)
double estimate_jaccard2(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
void get_LSH_match_from_hash(const vector< vector< uint32_t > > &query_LSH_hash, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
Function to get the k sites to compare for Buhler LSH.
void get_LSH_hashes2(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kvector)
vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
Vector of pairs of database OIDs and scores.
#define PKMER_PRIME
CFileException –.
Definition: ncbifile.hpp:136
Access data in Minhash files.
Definition: mhfile.hpp:108
int GetLSHSize(void) const
Definition: mhfile.hpp:133
uint32_t * GetMinHits(int oid) const
Definition: mhfile.hpp:155
int GetNumHashes(void) const
Returns the number of values in an array of hashes (probably 32)
Definition: mhfile.hpp:118
uint64_t * GetLSHArray(void) const
Definition: mhfile.hpp:151
int GetVersion(void) const
Definition: mhfile.hpp:113
int GetNumSeqs(void) const
Definition: mhfile.hpp:115
int GetDataWidth(void) const
Definition: mhfile.hpp:127
int GetKmerSize(void) const
Returns the length of the KMER.
Definition: mhfile.hpp:123
int * GetHits(uint64_t offset) const
Definition: mhfile.hpp:153
CRandom::
Definition: random_gen.hpp:66
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
parent_type::iterator iterator
Definition: set.hpp:80
bool empty() const
Definition: set.hpp:133
const_iterator end() const
Definition: set.hpp:136
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
int offset
Definition: replacements.h:160
Uint8 uint64_t
Uint4 uint32_t
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)
int i
yy_size_t n
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const string version
version string
Definition: variables.hpp:66
const struct ncbi::grid::netcache::search::fields::KEY key
unsigned int a
Definition: ncbi_localip.c:102
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
T max(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
uint32_t do_pearson_hash(unsigned char *key, int length)
Definition: pearson.cpp:51
uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)
Pearson hash an integer into two bytes.
Definition: pearson.cpp:81
unsigned char pearson_hash_int2byte(uint32_t input, int seed1)
Pearson hash an integer into one byte.
Definition: pearson.cpp:64
static size_t read_size(CNcbiIstream &stream, const char *name)
Definition: reader_snp.cpp:404
Defines BLAST database access classes.
Structure for ancillary data on KMER search.
int jd_count
How often was the Jaccard distance calculated.
int total_matches
How many matches returned.
int oids_considered
How many OIDs were considered as candidates.
int hit_count
How many hits to the hash array were there?
int jd_oid_count
How many OIDs was the Jaccard distance calculated for.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Structure to hold parameters for seg search.
Definition: blast_seg.h:49
static string subject
static string query
Definition: _hash_fun.h:40
#define _ASSERT
#define compress
Definition: zconf_cf.h:39
void free(voidpf ptr)
voidp malloc(uInt size)
Modified on Fri Sep 20 14:58:07 2024 by modify_doxy.py rev. 669887