NCBI C++ ToolKit
Classes | Macros | Typedefs | Functions
blastkmerutils.hpp File Reference
#include <corelib/ncbistd.hpp>
#include <util/range.hpp>
#include <objtools/blast/seqdb_reader/seqdb.hpp>
#include <algo/blast/core/blast_message.h>
#include "mhfile.hpp"
+ Include dependency graph for blastkmerutils.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Go to the SVN repository for this file.

Classes

struct  BlastKmerStats
 Structure for ancillary data on KMER search. More...
 
struct  SBlastKmerParameters
 
struct  SOneBlastKmerSearch
 

Macros

#define PKMER_PRIME   1048583
 

Typedefs

typedef vector< pair< uint32_t, double > > TBlastKmerPrelimScoreVector
 Vector of pairs of database OIDs and scores. More...
 

Functions

 USING_SCOPE (blast)
 
set< uint32_t > BlastKmerGetKmerSet (const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
 Get KMERs for a given sequence using a compressed alphabet. More...
 
set< uint32_t > BlastKmerGetKmerSet2 (const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
 Get KMERs for a given sequence using a compressed alphabet. More...
 
set< uint32_t > BlastKmerGetKmerSetStats (const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)
 Simplified version of BlastKmerGetKmerSet. More...
 
int BlastKmerBreakUpSequence (int length, vector< TSeqRange > &range_v, int chunkSize)
 Breaks a sequence up into chunks if the sequence is above a certain length. More...
 
void BlastKmerGetCompressedTranslationTable (vector< Uint1 > &trans_table, int alphabetChoice)
 Creates translation table for compressed alphabets. More...
 
int BlastKmerGetDistance (const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
 Calculates the number of differences between two minhash arrays. More...
 
bool minhash_query (const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
 
bool minhash_query2 (const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
 Hash the query for the minimum values. More...
 
void get_LSH_match_from_hash (const vector< vector< uint32_t > > &lsh_hash_vec, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
 
void get_LSH_hashes (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
 
void get_LSH_hashes5 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
 Gets the LSH hash for one hash function. More...
 
void neighbor_query (const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVersion)
 
void GetRandomNumbers (uint32_t *a, uint32_t *b, int numHashes)
 Get the random numbers for the hash function. More...
 
void GetKValues (vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
 Function to get the k sites to compare for Buhler LSH. More...
 
void get_LSH_hashes2 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kValues)
 
int BlastKmerVerifyIndex (CRef< CSeqDB > seqdb, string &error_msg)
 

Macro Definition Documentation

◆ PKMER_PRIME

#define PKMER_PRIME   1048583

Definition at line 48 of file blastkmerutils.hpp.

Typedef Documentation

◆ TBlastKmerPrelimScoreVector

typedef vector< pair<uint32_t, double> > TBlastKmerPrelimScoreVector

Vector of pairs of database OIDs and scores.

ONLY for use during KMER search, not presentation of results or communication with other modules (BLAST or not).

Definition at line 122 of file blastkmerutils.hpp.

Function Documentation

◆ BlastKmerBreakUpSequence()

int BlastKmerBreakUpSequence ( int  length,
vector< TSeqRange > &  range_v,
int  chunkSize 
)

Breaks a sequence up into chunks if the sequence is above a certain length.

Each chunk has an overlap with the adjoining chunks. Breaking sequences up into chunks makes the minhash procedure much more effective when two sequences of different lengths are being compared.

Parameters
length: Total length of sequence being broken up [in]
range_v: Vector of ranges to be filled in [out]
chunkSize: Number of residues in sequence chunk [in]
Returns
number of chunks. Should be an integer greater than zero. Zero or less indicates an error.

Definition at line 362 of file blastkmerutils.cpp.

References ChunkSize(), i, last(), MAX, MIN, and compile_time_bits::range().

Referenced by minhash_query(), minhash_query2(), s_MinhashSequences(), and s_MinhashSequences2().

◆ BlastKmerGetCompressedTranslationTable()

void BlastKmerGetCompressedTranslationTable ( vector< Uint1 > &  trans_table,
int  alphabetChoice 
)

Creates translation table for compressed alphabets.

Parameters
trans_table: Translation table [out]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]

Definition at line 330 of file blastkmerutils.cpp.

References _ASSERT, AMINOACID_TO_NCBISTDAA, i, int, isalpha(), and isspace().

Referenced by BlastKmerGetKmerSet(), BlastKmerGetKmerSet2(), and BlastKmerGetKmerSetStats().

◆ BlastKmerGetDistance()

int BlastKmerGetDistance ( const vector< uint32_t > &  minhash1,
const vector< uint32_t > &  minhash2 
)

Calculates the number of differences between two minhash arrays.

Used to decide whether two arrays are similar enough. The assumption is made that both arrays are of the same size.

Parameters
minhash1: First array [in]
minhash2: Second array [in]
Returns
distance.

Definition at line 399 of file blastkmerutils.cpp.

Referenced by s_MinhashSequences(), and s_MinhashSequences2().

◆ BlastKmerGetKmerSet()

set<uint32_t> BlastKmerGetKmerSet ( const string &  query_sequence,
bool  do_seg,
TSeqRange &  range,
int  kmerNum,
int  alphabetChoice 
)

Get KMERs for a given sequence using a compressed alphabet.

Parameters
query_sequence: string with one sequence [in]
do_seg: Should the sequence be segged (not recommended) [in]
range: portion of sequence to be processed [in]
kmerNum: size of kmer [in]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
Returns
set of unsigned ints for the kmers.

Definition at line 206 of file blastkmerutils.cpp.

References BlastKmerGetCompressedTranslationTable(), BlastSeqLocFree(), free(), i, set< Key, Compare >::insert(), malloc(), BlastSeqLoc::next, NULL, query, compile_time_bits::range(), SegParametersFree(), SegParametersNewAa(), and SeqBufferSeg().

Referenced by minhash_query(), and s_MinhashSequences().

◆ BlastKmerGetKmerSet2()

set<uint32_t> BlastKmerGetKmerSet2 ( const string &  query_sequence,
TSeqRange &  range,
int  kmerNum,
int  alphabetChoice,
vector< int >  badMers 
)

Get KMERs for a given sequence using a compressed alphabet.

This version can read in overrepresented KMERs and extend them by one.

Parameters
query_sequence: string with one sequence [in]
range: portion of sequence to be processed [in]
kmerNum: size of kmer [in]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
badMers: Overrepresented KMERs [in]
Returns
set of unsigned ints for the kmers.

Definition at line 270 of file blastkmerutils.cpp.

References BlastKmerGetCompressedTranslationTable(), free(), i, set< Key, Compare >::insert(), malloc(), query, and compile_time_bits::range().

Referenced by minhash_query2(), and s_MinhashSequences2().

◆ BlastKmerGetKmerSetStats()

set<uint32_t> BlastKmerGetKmerSetStats ( const string &  query_sequence,
int  kmerNum,
map< string, int > &  kmerCount,
map< string, int > &  kmerCountPlus,
int  alphabetChoice,
bool  perQuery 
)

Simplified version of BlastKmerGetKmerSet.

Intended for gathering statistics on KMERS in the database.

Parameters
query_sequence: string with one sequence [in]
kmerNum: size of kmer [in]
Returns
param kmerCount Count population of different KMERS
param kmerCountPlus Count population of different KMERS one longer than kmerNum
Parameters
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
perQuery: Count kmers per query or total in database.
Returns
set of unsigned ints for the kmers.

Definition at line 132 of file blastkmerutils.cpp.

References map_checker< Container >::begin(), BlastKmerGetCompressedTranslationTable(), map_checker< Container >::end(), i, set< Key, Compare >::insert(), NStr::NumericToString(), and query.

◆ BlastKmerVerifyIndex()

int BlastKmerVerifyIndex ( CRef< CSeqDB >  seqdb,
string &  error_msg 
)

◆ get_LSH_hashes()

void get_LSH_hashes ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  num_bands,
int  rows_per_band 
)

◆ get_LSH_hashes2()

void get_LSH_hashes2 ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  num_k,
int  num_l,
vector< vector< int > > &  kValues 
)

◆ get_LSH_hashes5()

void get_LSH_hashes5 ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  numHashes,
int  numRows 
)

Gets the LSH hash for one hash function.

Parameters
query_hash: Hash values for query [in]
lsh_hash_vec: LSH query hash [out]
numHashes: number of hashes in signature [in]
numRows: number of rows (2?) in LSH [in]

Definition at line 566 of file blastkmerutils.cpp.

References b, do_pearson_hash(), ncbi::grid::netcache::search::fields::key, n, r(), and ct::sort().

Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().

◆ get_LSH_match_from_hash()

void get_LSH_match_from_hash ( const vector< vector< uint32_t > > &  lsh_hash_vec,
const uint64_t *  lsh_array,
vector< set< uint32_t > > &  candidates 
)

Definition at line 686 of file blastkmerutils.cpp.

References i.

Referenced by CBlastKmer::x_RunKmerFile().

◆ GetKValues()

void GetKValues ( vector< vector< int > > &  kvector,
int  k_value,
int  l_value,
int  array_size 
)

Function to get the k sites to compare for Buhler LSH.

Definition at line 629 of file blastkmerutils.cpp.

References CRandom::GetRand(), and i.

Referenced by CBlastKmerBuildIndex::x_BuildIndex().

◆ GetRandomNumbers()

void GetRandomNumbers ( uint32_t *  a,
uint32_t *  b,
int  numHashes 
)

Get the random numbers for the hash function.

Definition at line 613 of file blastkmerutils.cpp.

References a, b, CRandom::GetRand(), i, and PKMER_PRIME.

Referenced by CBlastKmerBuildIndex::x_BuildIndex().

◆ minhash_query()

bool minhash_query ( const string &  query,
vector< vector< uint32_t > > &  seq_hash,
int  num_hashes,
uint32_t *  a,
uint32_t *  b,
int  do_seg,
int  kmerNum,
int  alphabetChoice,
int  chunkSize 
)

◆ minhash_query2()

bool minhash_query2 ( const string &  query,
vector< vector< uint32_t > > &  seq_hash,
int  kmerNum,
int  numHashes,
int  alphabetChoice,
vector< int >  badMers,
int  chunkSize 
)

Hash the query for the minimum values.

Parameters
query: Query as an ASCII string [in]
seq_hash: hash values for all kmers [out]
kmerNum: number of letters in a KMER [in]
numHashes: number of hashes in a signature [in]
alphabetChoice: 15 or 10 letters [in]
badMers: Overrepresented KMERS [in]

Definition at line 479 of file blastkmerutils.cpp.

References set< Key, Compare >::begin(), BlastKmerBreakUpSequence(), BlastKmerGetKmerSet2(), set< Key, Compare >::empty(), set< Key, Compare >::end(), FNV_hash(), i, query, and ct::sort().

Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().

◆ neighbor_query()

void neighbor_query ( const vector< vector< uint32_t > > &  query_hash,
const uint64_t *  lsh,
vector< set< uint32_t > > &  candidates,
CMinHashFile &  mhfile,
int  num_hashes,
int  min_hits,
double  thresh,
TBlastKmerPrelimScoreVector &  score_vector,
BlastKmerStats &  kmer_stats,
int  kmerVersion 
)

◆ USING_SCOPE()

USING_SCOPE ( blast  )
Modified on Thu Jul 11 17:52:10 2024 by modify_doxy.py rev. 669887