59 return ( (
a*x +
b) % p );
66 const Uint4 fnv_prime = 16777619u;
67 const Uint4 fnv_offset_basis = 2166136261u;
73 key[1] = (num >> 8) & 0xff;
74 key[2] = (num >> 16) & 0xff;
75 key[3] = (num >> 24) & 0xff;
78 hash = fnv_offset_basis;
79 for (
i = 0;
i < 4;
i++) {
97 for(
int h=0;h<num_hashes;h++)
103 return (
double) score / num_hashes;
117 for(
int h=0;h<num_hashes;h++)
119 while (bindex < num_hashes &&
a[h] >
b[bindex])
123 if (bindex == num_hashes)
125 if (
a[h] ==
b[bindex])
129 return (
double) score / num_hashes;
137 vector<Uint1> trans_table;
140 int seq_length =
static_cast<int>(query_sequence.length());
141 const char*
query = query_sequence.c_str();
144 if (seq_length < kmerNum)
150 for(
int i=0;
i<seq_length-kmerNum+1;
i++)
155 for (
int kindex=0; kindex<kmerNum; kindex++)
157 if (
query[
i+kindex] == 21)
162 index = (index << 4);
163 index += trans_table[
query[
i+kindex]];
175 kmerCountPriv[kmer_str]++;
177 if (index != 0 &&
i<seq_length-kmerNum &&
query[
i+kmerNum] != 21)
179 int index_short = index;
180 index = (index << 4);
181 index += trans_table[
query[
i+kmerNum]];
184 kmerCountPlusPriv[kmer_str]++;
191 kmerCount[(*i).first]++;
193 kmerCountPlus[(*i).first]++;
198 kmerCount[(*i).first] += (*i).second;
200 kmerCountPlus[(*i).first] += (*i).second;
211 vector<Uint1> trans_table;
214 int seq_length =
static_cast<int>(query_sequence.length());
215 const char*
query = query_sequence.c_str();
218 if (seq_length < kmerNum)
221 int chunk_length=
range.GetLength();
222 char *query_private=(
char*)
malloc(chunk_length);
224 for (
unsigned int i=
range.GetFrom();
i<=
range.GetTo();
i++, private_index++)
225 query_private[private_index] =
query[
i];
232 SeqBufferSeg((
unsigned char *)query_private, chunk_length, 0, sp, &seq_locs);
238 for(
int i=itr->ssr->left; i <= itr->ssr->right;
i++)
245 for(
int i=0;
i<chunk_length-kmerNum+1;
i++)
250 for (
int kindex=0; kindex<kmerNum; kindex++)
252 if (query_private[
i+kindex] == 21)
257 index = (index << 4);
258 index += trans_table[query_private[
i+kindex]];
275 vector<Uint1> trans_table;
278 int seq_length =
static_cast<int>(query_sequence.length());
279 const char*
query = query_sequence.c_str();
282 if (seq_length < kmerNum)
285 int chunk_length=
range.GetLength();
286 char *query_private=(
char*)
malloc(chunk_length);
288 for (
unsigned int i=
range.GetFrom();
i<=
range.GetTo();
i++, private_index++)
289 query_private[private_index] =
query[
i];
291 for(
int i=0;
i<chunk_length-kmerNum+1;
i++)
296 for (
int kindex=0; kindex<kmerNum; kindex++)
298 if (query_private[
i+kindex] == 21)
303 index = (index << 4);
304 index += trans_table[query_private[
i+kindex]];
308 if (
i < chunk_length-kmerNum && !badMers.empty())
310 std::vector<int>::iterator it;
311 it = std::find(badMers.begin(), badMers.end(), index);
312 if (it != badMers.end() &&
i < chunk_length-1)
314 index = (index << 4);
315 index += trans_table[query_private[
i+kmerNum]];
332 const unsigned int kAlphabetLen = 28;
335 const char* kCompAlphabets[] = {
337 "ST IJV LM KR EQZ A G BD P N F Y H C W",
339 "IJLMV AST BDENZ KQR G FY P H C W"
343 const char* trans_string = kCompAlphabets[alphabetChoice];
345 Uint4 compressed_letter = 1;
347 trans_table.resize(kAlphabetLen + 1, 0);
348 for (
Uint4 i = 0;
i < strlen(trans_string);
i++) {
352 else if (
isalpha(trans_string[
i])) {
355 _ASSERT(aa_letter < trans_table.size());
357 trans_table[aa_letter] = compressed_letter;
366 const int kOverlap=50;
376 numChunks =
MAX(1, (length - kOverlap)/(
ChunkSize - kOverlap));
377 newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
381 newChunkSize = (length + (numChunks-1)*kOverlap)/numChunks;
386 for (
int i=0;
i<numChunks;
i++)
393 range_v.push_back(
range);
403 int num_hashes =
static_cast<int>(minhash1.size());
405 for (
int index=0; index<num_hashes; index++)
407 if (minhash1[index] != minhash2[index])
416 vector < vector <uint32_t> >& seq_hash,
425 bool kmersFound=
false;
426 int seq_length =
static_cast<int>(
query.length());
427 vector<TSeqRange> range_v;
429 seq_hash.resize(chunk_num);
430 bool seg = (do_seg > 0) ?
true :
false;
432 vector<uint32_t> idx_tmp(num_hashes);
433 vector<uint32_t> hash_tmp(num_hashes);
435 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
438 seq_hash[chunk_iter].resize(num_hashes);
442 if (seq_kmer.
empty())
447 for(
int h=0;h<num_hashes;h++)
449 hash_tmp[h]=0xffffffff;
450 idx_tmp[h]=0xffffffff;
457 for(
int h=0;h<num_hashes;h++)
461 if (hashval < hash_tmp[h])
463 hash_tmp[h] = hashval;
470 for(
int h=0;h<num_hashes;h++)
472 seq_hash[chunk_iter][h] = idx_tmp[h];
480 vector < vector <uint32_t> >& seq_hash,
487 bool kmersFound=
false;
488 int seq_length =
static_cast<int>(
query.length());
489 vector<TSeqRange> range_v;
491 seq_hash.resize(chunk_num);
493 vector<uint32_t> hash_values;
495 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
498 seq_hash[chunk_iter].resize(numHashes);
502 if (seq_kmer.
empty())
511 hash_values.push_back(hashval);
515 if (hash_values.size() <
static_cast<size_t>(numHashes))
517 int rem = 1 + numHashes -
static_cast<int>(hash_values.size());
519 for (
int i=0;
i<rem;
i++)
520 hash_values.push_back(hashval);
522 std::sort(hash_values.begin(), hash_values.end());
525 for(
int h=0;h<numHashes;h++)
526 seq_hash[chunk_iter][h] = hash_values[h];
533 vector < vector <uint32_t> >&lsh_hash_vec,
537 int num_chunks=
static_cast<int>(query_hash.size());
539 for (
int n=0;
n<num_chunks;
n++)
541 vector <uint32_t> lsh_chunk_vec;
542 for(
int b=0;
b<num_bands;
b++)
546 unsigned char key[9];
547 for(
int r=0;
r<rows_per_band;
r++)
549 temp_hash = query_hash[
n][
b*rows_per_band+
r];
550 key[
r*4] = (temp_hash) & 0xff;
551 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
552 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
553 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
555 key[8] = (
unsigned char)
b;
557 lsh_chunk_vec.push_back(foo);
559 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
560 lsh_hash_vec.push_back(lsh_chunk_vec);
567 vector < vector <uint32_t> >&lsh_hash_vec,
571 int num_chunks=
static_cast<int>(query_hash.size());
573 int numHashMax = numHashes - numRows + 1;
574 for (
int n=0;
n<num_chunks;
n++)
576 vector <uint32_t> lsh_chunk_vec;
577 for(
int b=0;
b<numHashMax;
b++)
580 unsigned char key[8];
581 for(
int r=0;
r<numRows;
r++)
583 temp_hash = query_hash[
n][
b+
r];
584 key[
r*4] = (temp_hash) & 0xff;
585 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
586 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
587 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
590 lsh_chunk_vec.push_back(foo);
592 for(
int b=0;
b<numHashMax-1;
b++)
594 unsigned char key[8];
595 for(
int r=0;
r<numRows;
r++)
597 temp_hash = query_hash[
n][
b+2*
r];
598 key[
r*4] = (temp_hash) & 0xff;
599 key[1+
r*4] = ((temp_hash) >> 8) & 0xff;
600 key[2+
r*4] = ((temp_hash) >> 16) & 0xff;
601 key[3+
r*4] = ((temp_hash) >> 24) & 0xff;
604 lsh_chunk_vec.push_back(foo);
606 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
607 lsh_hash_vec.push_back(lsh_chunk_vec);
617 for(
int i=0;
i<numHashes;
i++)
629 void GetKValues(vector< vector <int> >& kvector,
int k_value,
int l_value,
int array_size)
634 for (
int i=0;
i<l_value;
i++)
637 for (
int k=0; k<k_value; k++)
640 ktemp.push_back(temp);
642 kvector.push_back(ktemp);
652 vector < vector <uint32_t> >& lsh_hash_vec,
655 vector< vector<int> >& kvector)
658 vector<unsigned char>
key(
max);
659 int num_chunks=
static_cast<int>(query_hash.size());
662 for (
int n=0;
n<num_chunks;
n++)
664 vector <uint32_t> lsh_chunk_vec;
665 for (
int r=0;
r<num_l;
r++)
667 for (
int i=0;
i<num_k;
i++)
669 temp_index=kvector[
r][
i];
670 temp_hash = query_hash[
n][temp_index];
671 key[
i*4] = (temp_hash) & 0xff;
672 key[1+
i*4] = ((temp_hash) >> 8) & 0xff;
673 key[2+
i*4] = ((temp_hash) >> 16) & 0xff;
674 key[3+
i*4] = ((temp_hash) >> 24) & 0xff;
678 lsh_chunk_vec.push_back(foo);
680 std::sort(lsh_chunk_vec.begin(), lsh_chunk_vec.end());
681 lsh_hash_vec.push_back(lsh_chunk_vec);
691 int num=
static_cast<int>(query_LSH_hash.size());
692 for (
int i=0;
i<num;
i++)
694 for(vector<uint32_t>::const_iterator iter=query_LSH_hash[
i].begin(); iter != query_LSH_hash[
i].end(); ++iter)
696 if (lsh_array[*iter] != 0)
697 candidates[
i].insert(*iter);
709 int num_chunks=
static_cast<int>(query_hash.size());
711 for (
int n=0;
n<num_chunks;
n++)
713 vector<uint32_t> tmp_hash;
714 for(vector<uint32_t>::const_iterator
i = query_hash[
n].begin();
i != query_hash[
n].end(); ++
i)
726 tmp_hash.push_back(0xffff);
728 tmp_hash.push_back(0xff);
730 tmp_hash.push_back(kBig);
733 tmp_hash.push_back(hash_value);
736 std::sort(tmp_hash.begin(), tmp_hash.end());
737 query_hash_hash.push_back(tmp_hash);
755 int num_chunks=
static_cast<int>(query_hash.size());
757 vector< vector<uint32_t> > candidate_oids;
758 candidate_oids.resize(num_chunks);
760 for (
int num=0; num<num_chunks; num++)
768 while (lsh[index] == 0)
774 while (index < remaining)
776 int oid = *(oid_offset+index);
777 candidate_oids[num].push_back(oid);
784 for (
int index=0; index<num_chunks; index++)
786 kmer_stats.
hit_count += candidates[index].size();
790 vector < vector <uint32_t> > query_hash_hash;
793 vector <uint32_t> subject_hash;
794 subject_hash.resize(num_hashes);
799 for (
int n=0;
n<num_chunks;
n++)
801 std::sort(candidate_oids[
n].begin(), candidate_oids[
n].end());
804 for(vector<uint32_t>::iterator
i = candidate_oids[
n].begin();
i != candidate_oids[
n].end(); ++
i)
809 if (min_hits != oid_count)
824 mhfile.
GetMinHits(oid, subject_oid, subject_hash);
825 double current_score=0;
831 if (current_score < thresh)
835 if (!score_map.count(subject_oid))
836 score_map.
insert(score_mapValType(subject_oid, current_score));
837 else if (current_score > score_map[subject_oid])
838 score_map[subject_oid] = current_score;
845 double total_score = (*i).second;
846 if (total_score > thresh)
848 score_vector.push_back((*
i));
883 for (
i=0;
i<lshSize;
i++)
894 vector<uint32_t> hits;
901 if (hits.size() !=
static_cast<size_t>(num_hashes))
909 for(vector<uint32_t>::iterator iter=hits.begin(); iter != hits.end(); ++iter)
913 error_msg =
"Signature array for version 3 index not in ascending order in volume " +
NStr::NumericToString(volume);
926 vector<string> kmerFiles;
928 int numFiles =
static_cast<int>(kmerFiles.size());
929 for (
int i=0;
i<numFiles;
i++)
935 catch (
const blast::CMinHashException& e) {
936 error_msg = e.GetMsg();
940 error_msg = e.GetMsg();
944 error_msg =
"Unknown error";
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Int2 SeqBufferSeg(Uint1 *sequence, Int4 length, Int4 offset, SegParameters *sparamsp, BlastSeqLoc **seg_locs)
Runs seg on a protein sequence in ncbistdaa.
SegParameters * SegParametersNewAa(void)
Allocated SeqParameter struct for proteins and fills with default values.
void SegParametersFree(SegParameters *sparamsp)
Free SegParameters structure.
static int s_BlastKmerVerifyVolume(CMinHashFile &mhfile, string &error_msg, int volume)
uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)
void BlastKmerGetCompressedTranslationTable(vector< Uint1 > &trans_table, int alphabetChoice)
Creates translation table for compressed alphabets.
int BlastKmerVerifyIndex(CRef< CSeqDB > seqdb, string &error_msg)
double estimate_jaccard(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int ChunkSize)
Breaks a sequences up into chunks if the sequences is above a certain length.
int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
Calculates the number of differences between two minhash arrays.
void get_LSH_hashes(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
void get_LSH_hashes5(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
Gets the LSH hash for one hash function.
set< uint32_t > BlastKmerGetKmerSetStats(const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)
Simplified version of BlastKmerGetKmerSet.
DEFINE_STATIC_MUTEX(randMutex)
set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
Get KMERs for a given sequence using a compressed alphabet.
bool minhash_query2(const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
Hash the query for the minimum values;.
static uint32_t FNV_hash(uint32_t num)
FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html.
void s_HashHashQuery(const vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &query_hash_hash, int compress, int version)
void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
Get the random numbers for the hash function.
set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
Get KMERs for a given sequence using a compressed alphabet.
void neighbor_query(const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVer)
double estimate_jaccard2(vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
bool minhash_query(const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
void get_LSH_match_from_hash(const vector< vector< uint32_t > > &query_LSH_hash, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
Function to get the k sites to compare for Buhler LSH.
void get_LSH_hashes2(vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kvector)
vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
Vector of pairs of database OIDs and scores.
Access data in Minhash files.
int GetLSHSize(void) const
uint32_t * GetMinHits(int oid) const
int GetNumHashes(void) const
Returns the number of values in an array of hashes (probably 32)
uint64_t * GetLSHArray(void) const
int GetVersion(void) const
int GetNumSeqs(void) const
int GetDataWidth(void) const
int GetKmerSize(void) const
Returns the length of the KMER.
int * GetHits(uint64_t offset) const
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
iterator_bool insert(const value_type &val)
const_iterator begin() const
parent_type::iterator iterator
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
unsigned int
A callback function used to compare two keys in a database.
pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const string version
version string
const struct ncbi::grid::netcache::search::fields::KEY key
#define MIN(a, b)
returns smaller of a and b.
#define MAX(a, b)
returns larger of a and b.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
uint32_t do_pearson_hash(unsigned char *key, int length)
uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)
Pearson hash an integer into two bytes.
unsigned char pearson_hash_int2byte(uint32_t input, int seed1)
Pearson hash an integer into one byte.
static size_t read_size(CNcbiIstream &stream, const char *name)
Defines BLAST database access classes.
Structure for ancillary data on KMER search.
int jd_count
How often was the Jaccard distance calculated.
int total_matches
How many matches returned.
int oids_considered
How many OIDs were considered as candidates.
int hit_count
How many hits to the hash array were there?
int jd_oid_count
How many OIDs was the Jaccard distance calculated for.
Used to hold a set of positions, mostly used for filtering.
struct BlastSeqLoc * next
next in linked list
Structure to hold parameters for seg search.