116 : m_NumHashFct(numHashFct),
117 m_KmerSize(kmerSize),
122 m_Alphabet(alphabet),
124 m_ChunkSize(chunkSize)
131 m_NumBands = m_NumHashFct/m_RowsPerBand;
141 return ( (
a*x +
b) % p );
148 const Uint4 fnv_prime = 16777619u;
149 const Uint4 fnv_offset_basis = 2166136261u;
155 key[1] = (num >> 8) & 0xff;
156 key[2] = (num >> 16) & 0xff;
157 key[3] = (num >> 24) & 0xff;
160 hash = fnv_offset_basis;
161 for (
i = 0;
i < 4;
i++) {
172 vector < vector < vector < uint32_t > > > & seq_hash,
184 int fullOID=q_oid+oidOffset;
186 vector<TSeqRange> range_v;
188 seq_hash[q_oid].resize(chunk_num);
193 bool first_time=
true;
195 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
200 if (seq_kmer.
empty())
206 vector<uint32_t> idx_tmp(num_hashes);
207 vector<uint32_t> hash_tmp(num_hashes);
209 for(
int h=0;h<num_hashes;h++)
211 hash_tmp[h]=0xffffffff;
212 idx_tmp[h]=0xffffffff;
219 for(
int h=0;h<num_hashes;h++)
223 if (hashval < hash_tmp[h])
225 hash_tmp[h] = hashval;
231 if (first_time ==
false)
244 seq_hash[q_oid][chunk_counter].resize(num_hashes+1);
245 for(h=0;h<num_hashes;h++)
246 seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];
250 seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;
252 seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;
256 if (chunk_num > chunk_counter+1)
257 seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());
258 if (first_time ==
true)
259 seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());
266 vector < vector < vector < uint32_t > > > & seq_hash,
276 int fullOID=q_oid+oidOffset;
278 vector<TSeqRange> range_v;
280 seq_hash[q_oid].resize(chunk_num);
285 bool first_time=
true;
287 for(vector<TSeqRange>::iterator iter=range_v.begin(); iter != range_v.end(); ++iter, chunk_iter++)
292 if (seq_kmer.
empty())
298 vector<uint32_t> hash_values;
299 vector<uint32_t> idx_tmp(num_hashes);
305 hash_values.push_back(hashval);
308 if (hash_values.size() <
static_cast<size_t>(num_hashes))
310 int rem = 1 + num_hashes -
static_cast<int>(hash_values.size());
312 for (
int i=0;
i<rem;
i++)
313 hash_values.push_back(hashval);
315 std::sort(hash_values.begin(), hash_values.end());
317 for(
int i=0;
i<num_hashes;
i++)
318 idx_tmp[
i] = hash_values[
i];
320 if (first_time ==
false)
333 seq_hash[q_oid][chunk_counter].resize(num_hashes+1);
334 for(h=0;h<num_hashes;h++)
335 seq_hash[q_oid][chunk_counter][h] = idx_tmp[h];
339 seq_hash[q_oid][chunk_counter][num_hashes] = q_oid;
341 seq_hash[q_oid][chunk_counter][num_hashes] = fullOID;
345 if (chunk_num > chunk_counter+1)
346 seq_hash[q_oid].erase(seq_hash[q_oid].begin()+chunk_counter+1, seq_hash[q_oid].end());
347 if (first_time ==
true)
348 seq_hash[q_oid].erase(seq_hash[q_oid].begin(), seq_hash[q_oid].end());
354 int q_oid,
int numBands,
int numRows,
int& total_chunks,
uint8_t* uniqueHash)
357 int num_chunks =
static_cast<int>(seq_hash[q_oid].size());
358 for (
int n=0;
n<num_chunks;
n++)
360 for(
int b=0;
b<numBands;
b++)
362 unsigned char key[9];
363 for(
int r=0;
r<numRows;
r++)
365 key[
r*4] = (seq_hash[q_oid][
n][
b*numRows+
r]) & 0xff;
366 key[1+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 8) & 0xff;
367 key[2+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 16) & 0xff;
368 key[3+
r*4] = ((seq_hash[q_oid][
n][
b*numRows+
r]) >> 24) & 0xff;
370 key[8] = (
unsigned char)
b;
374 lsh[foo].push_back(total_chunks);
382 int q_oid,
int num_k,
int num_l,
int array_size,
int& total_chunks,
uint8_t* uniqueHash,
383 vector < vector <int> >& kvector)
387 vector<unsigned char>
key(
max);
388 int num_chunks=
static_cast<int>(seq_hash[q_oid].size());
391 for (
int n=0;
n<num_chunks;
n++)
393 for (
int r=0;
r<num_l;
r++)
395 for (
int i=0;
i<num_k;
i++)
397 temp_index = kvector[
r][
i];
398 temp_hash = seq_hash[q_oid][
n][temp_index];
399 key[
i*4] = (temp_hash) & 0xff;
400 key[1+
i*4] = ((temp_hash) >> 8) & 0xff;
401 key[2+
i*4] = ((temp_hash) >> 16) & 0xff;
402 key[3+
i*4] = ((temp_hash) >> 24) & 0xff;
408 lsh[foo].push_back(total_chunks);
416 int q_oid,
int numHashes,
int numRows,
int& total_chunks,
uint8_t* uniqueHash)
419 int num_chunks =
static_cast<int>(seq_hash[q_oid].size());
420 int numHashMax = numHashes - numRows + 1;
422 for (
int n=0;
n<num_chunks;
n++)
424 for(
int b=0;
b<numHashMax;
b++)
426 unsigned char key[12];
427 for(
int r=0;
r<numRows;
r++)
429 key[
r*4] = (seq_hash[q_oid][
n][
b+
r]) & 0xff;
430 key[1+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 8) & 0xff;
431 key[2+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 16) & 0xff;
432 key[3+
r*4] = ((seq_hash[q_oid][
n][
b+
r]) >> 24) & 0xff;
437 lsh[foo].push_back(total_chunks);
439 for(
int b=0;
b<numHashMax-1;
b++)
441 unsigned char key[8];
442 for (
int r=0;
r<numRows;
r++)
444 temp_hash = seq_hash[q_oid][
n][
b+2*
r];
445 key[
r*4] = (temp_hash) & 0xff;
446 key[1+
r*4] = (temp_hash >> 8) & 0xff;
447 key[2+
r*4] = (temp_hash >> 16) & 0xff;
448 key[3+
r*4] = (temp_hash >> 24) & 0xff;
453 lsh[foo].push_back(total_chunks);
465 vector<string> paths;
468 vector<TSeqRange> range_vec;
469 vector<string> volname_vec;
471 for (vector<string>::iterator iter=paths.begin(); iter != paths.end(); ++iter)
476 string volName = base + ext;
477 volname_vec.push_back(volName);
482 range.SetTo(oid_offset);
483 range_vec.push_back(
range);
487 int numVols =
static_cast<int>(paths.size());
488 #pragma omp parallel for num_threads(numThreads)
489 for(
int index=0; index<numVols; index++)
491 x_BuildIndex(volname_vec[index], range_vec[index].GetFrom(), range_vec[index].GetTo());
499 char* loadBadMers = getenv(
"LOADBADMERS");
510 badMers.push_back(badKmer);
511 cerr << badKmer <<
'\n';
515 char* noBadMers = getenv(
"NOBADMERS");
517 return vector<int>();
521 const int kLength=10;
522 int array[] = {139810, 69905, 70161, 70177, 74257,
523 69921, 69906, 74001, 135441, 69922};
528 return vector<int>();
537 int vectorRandNums=0;
548 string indexFile = name +
".pki";
549 string dataFile = name +
".pkd";
561 num_seqs = stop - start;
566 index_file.write((
char *) &(
m_Version), 4);
567 index_file.write((
char *) &(num_seqs), 4);
569 index_file.write((
char *) &(
m_Samples), 4);
574 index_file.write((
char *) &(StartLSH), 4);
575 index_file.write((
char *) &(kSizeLSH), 4);
585 for(
int q_oid=0;q_oid<3*num_seqs;q_oid++)
592 vector < vector < vector < uint32_t > > > seq_hash(num_seqs);
595 for(
int q_oid=0;q_oid<num_seqs;q_oid++)
625 const uint32_t kUniqueHash = 0x1000000;
627 for (
uint32_t index=0; index<kUniqueHash; index++)
628 uniqueHash[index] = 0;
630 vector< vector<uint32_t> > lsh(kSizeLSH);
633 vector < vector <int> > kvector;
638 for(
int q_oid=0;q_oid<num_seqs;q_oid++)
645 total_chunks, uniqueHash, kvector);
653 for(
uint32_t index=0; index<kUniqueHash; index++)
655 if (uniqueHash[index] > 0)
658 delete [] uniqueHash;
662 for(
int index=0; index<kSizeLSH-1; index++)
664 LSHMatchSize += lsh[index].size();
668 const uint64_t kLSHMatchEnd = 4*LSHMatchSize + StartLSH + 8*kSizeLSH;
669 index_file.write((
char *) &(kLSHMatchEnd), 8);
676 const int kFutureUse=0;
677 index_file.write((
char *) &(kFutureUse), 4);
679 index_file.write((
char *) &(kFutureUse), 4);
686 index_file.write((
char *) &(
a[
i]), 4);
688 index_file.write((
char *) &(
b[
i]), 4);
693 int num=
static_cast<int>(badMers.size());
696 index_file.write((
char *) &(num), 4);
697 for(vector<int>::iterator iter=badMers.begin(); iter != badMers.end() &&
i<2*
m_NumHashFct; ++iter, ++
i)
698 index_file.write((
char *) &(*iter), 4);
703 index_file.write((
char *) &(kZero), 4);
713 vector<int> temp = kvector[
i];
715 index_file.write((
char *) &(temp[j]), 1);
720 for (
int i=0;
i<extra;
i++)
721 index_file.write((
char *) &(temp), 1);
725 uint64_t lsh_offset = StartLSH + 8*kSizeLSH;
727 for(
int index=0; index<kSizeLSH-1; index++)
729 if (lsh[index].
size() == 0)
730 index_file.write((
char *) &(kNoValue), 8);
732 index_file.write((
char *) &(lsh_offset), 8);
733 lsh_offset += 4*(lsh[index].size());
735 index_file.write((
char *) &(lsh_offset), 8);
738 for(
int index=0; index<kSizeLSH-1; index++)
740 for(vector<uint32_t>::iterator
i=lsh[index].begin();
i != lsh[index].end(); ++
i)
742 index_file.write((
char *) &(*i), 4);
767 for(
int q_oid=0;q_oid<num_seqs;q_oid++)
769 int num_chunks =
static_cast<int>(seq_hash[q_oid].size());
770 for (
int n=0;
n<num_chunks;
n++)
772 vector<uint32_t> tmp_hash;
778 tmp_hash.push_back(hash_val);
784 tmp_hash.push_back(hash_val);
790 tmp_hash.push_back(seq_hash[q_oid][
n][
b]);
794 std::sort(tmp_hash.begin(), tmp_hash.end());
797 data_file.write((
char *) &(tmp_hash[
b]), width);
799 data_file.write((
char *) &(seq_hash[q_oid][
n][
m_NumHashFct]), 4);
static void s_Get_LSH_index_hashes5(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numHashes, int numRows, int &total_chunks, uint8_t *uniqueHash)
uint32_t uhash(uint64_t x, uint64_t a, uint64_t b)
static Uint4 FNV_hash(uint32_t num)
FNV Hash. See http://www.isthe.com/chongo/tech/comp/fnv/index.html.
void s_MinhashSequences(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, const uint32_t *a, const uint32_t *b, bool do_seg, int kmerNum, int oidOffset, int alphabetChoice, int version, int chunkSize)
static void s_Get_LSH_index_hashes2(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int num_k, int num_l, int array_size, int &total_chunks, uint8_t *uniqueHash, vector< vector< int > > &kvector)
static void s_Get_LSH_index_hashes(vector< vector< vector< uint32_t > > > &seq_hash, vector< vector< uint32_t > > &lsh, int q_oid, int numBands, int numRows, int &total_chunks, uint8_t *uniqueHash)
void s_MinhashSequences2(uint32_t q_oid, CSeqDB &db, vector< vector< vector< uint32_t > > > &seq_hash, uint32_t *dead, int num_hashes, int kmerNum, int oidOffset, int alphabetChoice, int version, vector< int > badMers, int chunkSize)
vector< int > s_BlastKmerLoadBadMers(int alphabet)
int BlastKmerGetDistance(const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
Calculates the number of differences between two minhash arrays.
set< uint32_t > BlastKmerGetKmerSet2(const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
Get KMERs for a given sequence using a compressed alphabet.
void GetRandomNumbers(uint32_t *a, uint32_t *b, int numHashes)
Get the random numbers for the hash function.
set< uint32_t > BlastKmerGetKmerSet(const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
Get KMERs for a given sequence using a compressed alphabet.
int BlastKmerBreakUpSequence(int length, vector< TSeqRange > &range_v, int chunkSize)
Breaks a sequences up into chunks if the sequences is above a certain length.
void GetKValues(vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
Function to get the k sites to compare for Buhler LSH.
CRef< CSeqDB > m_SeqDB
Residues in kmer.
int m_ChunkSize
version of index file
int m_KmerSize
Number of rows per band.
int m_Compress
Number of samples (Buhler only)
bool m_DoSeg
BLAST database.
int m_Version
0 for 15 letters, 1 for 10 letters.
int m_Samples
Should Seg be run on sequences.
void x_BuildIndex(string &name, int start=0, int number=0)
BUild index for an individual BLAST volume.
void Build(int numThreads=1)
Build the index.
int m_RowsPerBand
Number of LSH bands.
int m_NumBands
Number of hash functions.
int m_Alphabet
Compress the arrays for Jaccard matches.
void x_WriteDataFile(vector< vector< vector< uint32_t > > > &seq_hash, int num_seqs, CNcbiOfstream &data_file)
Writes out the data file.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const
Get a sequence in a given encoding.
const string & GetDBNameList() const
Get list of database names.
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
int GetNumSeqs() const
Returns the number of sequences available.
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Setting the number of threads.
const_iterator begin() const
parent_type::iterator iterator
const_iterator end() const
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
#define KMER_LSH_ARRAY_SIZE
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
uint32_t do_pearson_hash(unsigned char *key, int length)
uint16_t pearson_hash_int2short(uint32_t input, int seed1, int seed2)
Pearson hash an integer into two bytes.
unsigned char pearson_hash_int2byte(uint32_t input, int seed1)
Pearson hash an integer into one byte.