NCBI C++ ToolKit
|
#include <corelib/ncbistd.hpp>
#include <util/range.hpp>
#include <objtools/blast/seqdb_reader/seqdb.hpp>
#include <algo/blast/core/blast_message.h>
#include "mhfile.hpp"
Go to the source code of this file.
Go to the SVN repository for this file.
Classes | |
struct | BlastKmerStats |
Structure for ancillary data on KMER search. More... | |
struct | SBlastKmerParameters |
struct | SOneBlastKmerSearch |
Macros | |
#define | PKMER_PRIME 1048583 |
Typedefs | |
typedef vector< pair< uint32_t, double > > | TBlastKmerPrelimScoreVector |
Vector of pairs of database OIDs and scores. More... | |
Functions | |
USING_SCOPE (blast) | |
set< uint32_t > | BlastKmerGetKmerSet (const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice) |
Get KMERs for a given sequence using a compressed alphabet. More... | |
set< uint32_t > | BlastKmerGetKmerSet2 (const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers) |
Get KMERs for a given sequence using a compressed alphabet. More... | |
set< uint32_t > | BlastKmerGetKmerSetStats (const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery) |
Simplified version of BlastKmerGetKmerSet. More... | |
int | BlastKmerBreakUpSequence (int length, vector< TSeqRange > &range_v, int chunkSize) |
Breaks a sequence up into chunks if the sequence is above a certain length. More... | |
void | BlastKmerGetCompressedTranslationTable (vector< Uint1 > &trans_table, int alphabetChoice) |
Creates translation table for compressed alphabets. More... | |
int | BlastKmerGetDistance (const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2) |
Calculates the number of differences between two minhash arrays. More... | |
bool | minhash_query (const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize) |
bool | minhash_query2 (const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize) |
Hash the query for the minimum values. More... | |
void | get_LSH_match_from_hash (const vector< vector< uint32_t > > &lsh_hash_vec, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates) |
void | get_LSH_hashes (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band) |
void | get_LSH_hashes5 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows) |
Gets the LSH hash for one hash function. More... | |
void | neighbor_query (const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVersion) |
void | GetRandomNumbers (uint32_t *a, uint32_t *b, int numHashes) |
Get the random numbers for the hash function. More... | |
void | GetKValues (vector< vector< int > > &kvector, int k_value, int l_value, int array_size) |
Function to get the k sites to compare for Buhler LSH. More... | |
void | get_LSH_hashes2 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kValues) |
int | BlastKmerVerifyIndex (CRef< CSeqDB > seqdb, string &error_msg) |
#define PKMER_PRIME 1048583 |
Definition at line 48 of file blastkmerutils.hpp.
typedef vector< pair<uint32_t, double> > TBlastKmerPrelimScoreVector |
Vector of pairs of database OIDs and scores.
ONLY for use during KMER search, not presentation of results or communication with other modules (BLAST or not).
Definition at line 122 of file blastkmerutils.hpp.
Breaks a sequence up into chunks if the sequence is above a certain length.
Each chunk has an overlap with the adjoining chunks. Breaking sequences up into chunks makes the minhash procedure much more effective when two sequences of different lengths are being compared.
length | Total length of sequence being broken up [in] |
range_v | Vector of ranges to be filled in [out] |
chunkSize | number of residues in sequence chunk [in] |
Definition at line 362 of file blastkmerutils.cpp.
References ChunkSize(), i, last(), MAX, MIN, and compile_time_bits::range().
Referenced by minhash_query(), minhash_query2(), s_MinhashSequences(), and s_MinhashSequences2().
Creates translation table for compressed alphabets.
trans_table | Translation table [out] |
alphabetChoice | 0 is 15 letter, 1 is 10 letter alphabet [in] |
Definition at line 330 of file blastkmerutils.cpp.
References _ASSERT, AMINOACID_TO_NCBISTDAA, i, int, isalpha(), and isspace().
Referenced by BlastKmerGetKmerSet(), BlastKmerGetKmerSet2(), and BlastKmerGetKmerSetStats().
int BlastKmerGetDistance | ( | const vector< uint32_t > & | minhash1, |
const vector< uint32_t > & | minhash2 | ||
) |
Calculates the number of differences between two minhash arrays.
Used to decide whether two arrays are similar enough. The assumption is made that both arrays are of the same size.
minhash1 | First array [in] |
minhash2 | Second array [in] |
Definition at line 399 of file blastkmerutils.cpp.
Referenced by s_MinhashSequences(), and s_MinhashSequences2().
set<uint32_t> BlastKmerGetKmerSet | ( | const string & | query_sequence, |
bool | do_seg, | ||
TSeqRange & | range, | ||
int | kmerNum, | ||
int | alphabetChoice | ||
) |
Get KMERs for a given sequence using a compressed alphabet.
query_sequence | string with one sequence [in] |
do_seg | Should the sequence be segged (not recommended) [in] |
range | portion of sequence to be processed [in] |
kmerNum | size of kmer [in] |
alphabetChoice | 0 is 15 letter, 1 is 10 letter alphabet [in] |
Definition at line 206 of file blastkmerutils.cpp.
References BlastKmerGetCompressedTranslationTable(), BlastSeqLocFree(), free(), i, set< Key, Compare >::insert(), malloc(), BlastSeqLoc::next, NULL, query, compile_time_bits::range(), SegParametersFree(), SegParametersNewAa(), and SeqBufferSeg().
Referenced by minhash_query(), and s_MinhashSequences().
set<uint32_t> BlastKmerGetKmerSet2 | ( | const string & | query_sequence, |
TSeqRange & | range, | ||
int | kmerNum, | ||
int | alphabetChoice, | ||
vector< int > | badMers | ||
) |
Get KMERs for a given sequence using a compressed alphabet.
This version can read in overrepresented KMERs and extend them by one.
query_sequence | string with one sequence [in] |
range | portion of sequence to be processed [in] |
kmerNum | size of kmer [in] |
alphabetChoice | 0 is 15 letter, 1 is 10 letter alphabet [in] |
badMers | Overrepresented KMERs [in] |
Definition at line 270 of file blastkmerutils.cpp.
References BlastKmerGetCompressedTranslationTable(), free(), i, set< Key, Compare >::insert(), malloc(), query, and compile_time_bits::range().
Referenced by minhash_query2(), and s_MinhashSequences2().
set<uint32_t> BlastKmerGetKmerSetStats | ( | const string & | query_sequence, |
int | kmerNum, | ||
map< string, int > & | kmerCount, | ||
map< string, int > & | kmerCountPlus, | ||
int | alphabetChoice, | ||
bool | perQuery | ||
) |
Simplified version of BlastKmerGetKmerSet.
Intended for gathering statistics on KMERS in the database.
query_sequence | string with one sequence [in] |
kmerNum | size of kmer [in] |
alphabetChoice | 0 is 15 letter, 1 is 10 letter alphabet [in] |
perQuery | Count kmers per query or total in database [in] |
Definition at line 132 of file blastkmerutils.cpp.
References map_checker< Container >::begin(), BlastKmerGetCompressedTranslationTable(), map_checker< Container >::end(), i, set< Key, Compare >::insert(), NStr::NumericToString(), and query.
Definition at line 923 of file blastkmerutils.cpp.
References CSeqDB::FindVolumePaths(), i, and s_BlastKmerVerifyVolume().
Referenced by CCkblastindexApplication::Run().
void get_LSH_hashes | ( | vector< vector< uint32_t > > & | query_hash, |
vector< vector< uint32_t > > & | lsh_hash_vec, | ||
int | num_bands, | ||
int | rows_per_band | ||
) |
Definition at line 532 of file blastkmerutils.cpp.
References b, do_pearson_hash(), ncbi::grid::netcache::search::fields::key, n, r(), and ct::sort().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().
void get_LSH_hashes2 | ( | vector< vector< uint32_t > > & | query_hash, |
vector< vector< uint32_t > > & | lsh_hash_vec, | ||
int | num_k, | ||
int | num_l, | ||
vector< vector< int > > & | kValues | ||
) |
Definition at line 651 of file blastkmerutils.cpp.
References do_pearson_hash(), i, ncbi::grid::netcache::search::fields::key, max(), n, r(), and ct::sort().
Referenced by CBlastKmer::x_ProcessQuery().
void get_LSH_hashes5 | ( | vector< vector< uint32_t > > & | query_hash, |
vector< vector< uint32_t > > & | lsh_hash_vec, | ||
int | numHashes, | ||
int | numRows | ||
) |
Gets the LSH hash for one hash function.
query_hash | Hash values for query [in] |
lsh_hash_vec | LSH query hash [out] |
numHashes | number of hashes in signature [in] |
numRows | number of rows (2?) in LSH [in] |
Definition at line 566 of file blastkmerutils.cpp.
References b, do_pearson_hash(), ncbi::grid::netcache::search::fields::key, n, r(), and ct::sort().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().
void get_LSH_match_from_hash | ( | const vector< vector< uint32_t > > & | lsh_hash_vec, |
const uint64_t * | lsh_array, | ||
vector< set< uint32_t > > & | candidates | ||
) |
Definition at line 686 of file blastkmerutils.cpp.
References i.
Referenced by CBlastKmer::x_RunKmerFile().
Function to get the k sites to compare for Buhler LSH.
Definition at line 629 of file blastkmerutils.cpp.
References CRandom::GetRand(), and i.
Referenced by CBlastKmerBuildIndex::x_BuildIndex().
Get the random numbers for the hash function.
Definition at line 613 of file blastkmerutils.cpp.
References a, b, CRandom::GetRand(), i, and PKMER_PRIME.
Referenced by CBlastKmerBuildIndex::x_BuildIndex().
bool minhash_query | ( | const string & | query, |
vector< vector< uint32_t > > & | seq_hash, | ||
int | num_hashes, | ||
uint32_t * | a, | ||
uint32_t * | b, | ||
int | do_seg, | ||
int | kmerNum, | ||
int | alphabetChoice, | ||
int | chunkSize | ||
) |
Definition at line 415 of file blastkmerutils.cpp.
References a, b, set< Key, Compare >::begin(), BlastKmerBreakUpSequence(), BlastKmerGetKmerSet(), set< Key, Compare >::empty(), set< Key, Compare >::end(), i, query, and uhash().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().
bool minhash_query2 | ( | const string & | query, |
vector< vector< uint32_t > > & | seq_hash, | ||
int | kmerNum, | ||
int | numHashes, | ||
int | alphabetChoice, | ||
vector< int > | badMers, | ||
int | chunkSize | ||
) |
Hash the query for the minimum values.
query | as an ASCII string [in] |
seq_hash | hash values for all kmers [out] |
kmerNum | number of letters in a KMER [in] |
numHashes | number of hashes in a signature [in] |
alphabetChoice | 15 or 10 letters [in] |
badMers | Overrepresented KMERS [in] |
Definition at line 479 of file blastkmerutils.cpp.
References set< Key, Compare >::begin(), BlastKmerBreakUpSequence(), BlastKmerGetKmerSet2(), set< Key, Compare >::empty(), set< Key, Compare >::end(), FNV_hash(), i, query, and ct::sort().
Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().
void neighbor_query | ( | const vector< vector< uint32_t > > & | query_hash, |
const uint64_t * | lsh, | ||
vector< set< uint32_t > > & | candidates, | ||
CMinHashFile & | mhfile, | ||
int | num_hashes, | ||
int | min_hits, | ||
double | thresh, | ||
TBlastKmerPrelimScoreVector & | score_vector, | ||
BlastKmerStats & | kmer_stats, | ||
int | kmerVersion | ||
) |
Definition at line 744 of file blastkmerutils.cpp.
References map_checker< Container >::begin(), map_checker< Container >::end(), estimate_jaccard(), estimate_jaccard2(), CMinHashFile::GetDataWidth(), CMinHashFile::GetHits(), CMinHashFile::GetMinHits(), CMinHashFile::GetVersion(), BlastKmerStats::hit_count, i, map_checker< Container >::insert(), int, BlastKmerStats::jd_count, BlastKmerStats::jd_oid_count, n, offset, BlastKmerStats::oids_considered, read_size(), s_HashHashQuery(), ct::sort(), and BlastKmerStats::total_matches.
Referenced by CBlastKmer::x_RunKmerFile().
USING_SCOPE | ( | blast | ) |