NCBI C++ ToolKit
Functions
blastkmerutils.cpp File Reference
#include <ncbi_pch.hpp>
#include <corelib/ncbiapp.hpp>
#include <objtools/blast/seqdb_reader/seqdb.hpp>
#include <algo/blast/core/blast_filter.h>
#include <algo/blast/core/blast_seg.h>
#include <algo/blast/core/blast_encoding.h>
#include <math.h>
#include <algo/blast/proteinkmer/blastkmerutils.hpp>
#include <algo/blast/proteinkmer/mhfile.hpp>
#include <util/random_gen.hpp>
#include "pearson.hpp"
+ Include dependency graph for blastkmerutils.cpp:

Go to the source code of this file.

Go to the SVN repository for this file.

Functions

 USING_SCOPE (objects)
 
 USING_SCOPE (blast)
 
 DEFINE_STATIC_MUTEX (randMutex)
 
uint32_t uhash (uint64_t x, uint64_t a, uint64_t b)
 
static uint32_t FNV_hash (uint32_t num)
 FNV hash, see http://www.isthe.com/chongo/tech/comp/fnv/index.html. More...
 
double estimate_jaccard (vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
 
double estimate_jaccard2 (vector< uint32_t > &query_hash, vector< uint32_t > &subject, int num_hashes)
 
set< uint32_t > BlastKmerGetKmerSetStats (const string &query_sequence, int kmerNum, map< string, int > &kmerCount, map< string, int > &kmerCountPlus, int alphabetChoice, bool perQuery)
 Simplified version of BlastKmerGetKmerSet. More...
 
set< uint32_t > BlastKmerGetKmerSet (const string &query_sequence, bool do_seg, TSeqRange &range, int kmerNum, int alphabetChoice)
 Get KMERs for a given sequence using a compressed alphabet. More...
 
set< uint32_t > BlastKmerGetKmerSet2 (const string &query_sequence, TSeqRange &range, int kmerNum, int alphabetChoice, vector< int > badMers)
 Get KMERs for a given sequence using a compressed alphabet. More...
 
void BlastKmerGetCompressedTranslationTable (vector< Uint1 > &trans_table, int alphabetChoice)
 Creates translation table for compressed alphabets. More...
 
int BlastKmerBreakUpSequence (int length, vector< TSeqRange > &range_v, int ChunkSize)
 Breaks a sequence up into chunks if the sequence is above a certain length. More...
 
int BlastKmerGetDistance (const vector< uint32_t > &minhash1, const vector< uint32_t > &minhash2)
 Calculates the number of differences between two minhash arrays. More...
 
bool minhash_query (const string &query, vector< vector< uint32_t > > &seq_hash, int num_hashes, uint32_t *a, uint32_t *b, int do_seg, int kmerNum, int alphabetChoice, int chunkSize)
 
bool minhash_query2 (const string &query, vector< vector< uint32_t > > &seq_hash, int kmerNum, int numHashes, int alphabetChoice, vector< int > badMers, int chunkSize)
 Hash the query for the minimum values. More...
 
void get_LSH_hashes (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_bands, int rows_per_band)
 
void get_LSH_hashes5 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int numHashes, int numRows)
 Gets the LSH hash for one hash function. More...
 
void GetRandomNumbers (uint32_t *a, uint32_t *b, int numHashes)
 Get the random numbers for the hash function. More...
 
void GetKValues (vector< vector< int > > &kvector, int k_value, int l_value, int array_size)
 Function to get the k sites to compare for Buhler LSH. More...
 
void get_LSH_hashes2 (vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &lsh_hash_vec, int num_k, int num_l, vector< vector< int > > &kvector)
 
void get_LSH_match_from_hash (const vector< vector< uint32_t > > &query_LSH_hash, const uint64_t *lsh_array, vector< set< uint32_t > > &candidates)
 
void s_HashHashQuery (const vector< vector< uint32_t > > &query_hash, vector< vector< uint32_t > > &query_hash_hash, int compress, int version)
 
void neighbor_query (const vector< vector< uint32_t > > &query_hash, const uint64_t *lsh, vector< set< uint32_t > > &candidates, CMinHashFile &mhfile, int num_hashes, int min_hits, double thresh, TBlastKmerPrelimScoreVector &score_vector, BlastKmerStats &kmer_stats, int kmerVer)
 
static int s_BlastKmerVerifyVolume (CMinHashFile &mhfile, string &error_msg, int volume)
 
int BlastKmerVerifyIndex (CRef< CSeqDB > seqdb, string &error_msg)
 

Function Documentation

◆ BlastKmerBreakUpSequence()

int BlastKmerBreakUpSequence ( int  length,
vector< TSeqRange > &  range_v,
int  chunkSize 
)

Breaks a sequence up into chunks if the sequence is above a certain length.

Each chunk has an overlap with the adjoining chunks. Breaking sequences up into chunks makes the minhash procedure much more effective when two sequences of different lengths are being compared.

Parameters
length: Total length of sequence being broken up [in]
range_v: Vector of ranges to be filled in [out]
chunkSize: number of residues in sequence chunk [in]
Returns
number of chunks. Should be integer more than zero. Zero or less indicates an error.

Definition at line 362 of file blastkmerutils.cpp.

References ChunkSize(), i, last(), MAX, MIN, and compile_time_bits::range().

Referenced by minhash_query(), minhash_query2(), s_MinhashSequences(), and s_MinhashSequences2().

◆ BlastKmerGetCompressedTranslationTable()

void BlastKmerGetCompressedTranslationTable ( vector< Uint1 > &  trans_table,
int  alphabetChoice 
)

Creates translation table for compressed alphabets.

Parameters
trans_table: Translation table [out]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]

Definition at line 330 of file blastkmerutils.cpp.

References _ASSERT, AMINOACID_TO_NCBISTDAA, i, int, isalpha(), and isspace().

Referenced by BlastKmerGetKmerSet(), BlastKmerGetKmerSet2(), and BlastKmerGetKmerSetStats().

◆ BlastKmerGetDistance()

int BlastKmerGetDistance ( const vector< uint32_t > &  minhash1,
const vector< uint32_t > &  minhash2 
)

Calculates the number of differences between two minhash arrays.

Used to decide whether two arrays are similar enough. The assumption is made that both arrays are of the same size.

Parameters
minhash1: First array [in]
minhash2: Second array [in]
Returns
distance.

Definition at line 399 of file blastkmerutils.cpp.

Referenced by s_MinhashSequences(), and s_MinhashSequences2().

◆ BlastKmerGetKmerSet()

set<uint32_t> BlastKmerGetKmerSet ( const string &  query_sequence,
bool  do_seg,
TSeqRange &  range,
int  kmerNum,
int  alphabetChoice 
)

Get KMERs for a given sequence using a compressed alphabet.

Parameters
query_sequence: string with one sequence [in]
do_seg: Should the sequence be segged (not recommended) [in]
range: portion of sequence to be processed [in]
kmerNum: size of kmer [in]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
Returns
set of unsigned ints for the kmers.

Definition at line 206 of file blastkmerutils.cpp.

References BlastKmerGetCompressedTranslationTable(), BlastSeqLocFree(), free(), i, set< Key, Compare >::insert(), malloc(), BlastSeqLoc::next, NULL, query, compile_time_bits::range(), SegParametersFree(), SegParametersNewAa(), and SeqBufferSeg().

Referenced by minhash_query(), and s_MinhashSequences().

◆ BlastKmerGetKmerSet2()

set<uint32_t> BlastKmerGetKmerSet2 ( const string &  query_sequence,
TSeqRange &  range,
int  kmerNum,
int  alphabetChoice,
vector< int >  badMers 
)

Get KMERs for a given sequence using a compressed alphabet.

This version can read in overrepresented KMERs and extend them by one.

Parameters
query_sequence: string with one sequence [in]
range: portion of sequence to be processed [in]
kmerNum: size of kmer [in]
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
badMers: Overrepresented KMERs [in]
Returns
set of unsigned ints for the kmers.

Definition at line 270 of file blastkmerutils.cpp.

References BlastKmerGetCompressedTranslationTable(), free(), i, set< Key, Compare >::insert(), malloc(), query, and compile_time_bits::range().

Referenced by minhash_query2(), and s_MinhashSequences2().

◆ BlastKmerGetKmerSetStats()

set<uint32_t> BlastKmerGetKmerSetStats ( const string &  query_sequence,
int  kmerNum,
map< string, int > &  kmerCount,
map< string, int > &  kmerCountPlus,
int  alphabetChoice,
bool  perQuery 
)

Simplified version of BlastKmerGetKmerSet.

Intended for gathering statistics on KMERS in the database.

Parameters
query_sequence: string with one sequence [in]
kmerNum: size of kmer [in]
Returns
param kmerCount Count population of different KMERS
param kmerCountPlus Count population of different KMERS one longer than kmerNum
Parameters
alphabetChoice: 0 is 15 letter, 1 is 10 letter alphabet [in]
perQuery: Count kmers per query or total in database.
Returns
set of unsigned ints for the kmers.

Definition at line 132 of file blastkmerutils.cpp.

References map_checker< Container >::begin(), BlastKmerGetCompressedTranslationTable(), map_checker< Container >::end(), i, set< Key, Compare >::insert(), NStr::NumericToString(), and query.

◆ BlastKmerVerifyIndex()

int BlastKmerVerifyIndex ( CRef< CSeqDB >  seqdb,
string &  error_msg 
)

◆ DEFINE_STATIC_MUTEX()

DEFINE_STATIC_MUTEX ( randMutex  )

◆ estimate_jaccard()

double estimate_jaccard ( vector< uint32_t > &  query_hash,
vector< uint32_t > &  subject,
int  num_hashes 
)
inline

Definition at line 89 of file blastkmerutils.cpp.

References a, b, and subject.

Referenced by neighbor_query().

◆ estimate_jaccard2()

double estimate_jaccard2 ( vector< uint32_t > &  query_hash,
vector< uint32_t > &  subject,
int  num_hashes 
)
inline

Definition at line 108 of file blastkmerutils.cpp.

References a, b, and subject.

Referenced by neighbor_query().

◆ FNV_hash()

static uint32_t FNV_hash ( uint32_t  num)
static

◆ get_LSH_hashes()

void get_LSH_hashes ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  num_bands,
int  rows_per_band 
)

◆ get_LSH_hashes2()

void get_LSH_hashes2 ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  num_k,
int  num_l,
vector< vector< int > > &  kvector 
)

◆ get_LSH_hashes5()

void get_LSH_hashes5 ( vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  lsh_hash_vec,
int  numHashes,
int  numRows 
)

Gets the LSH hash for one hash function.

Parameters
query_hash: Hash values for query [in]
lsh_hash_vec: LSH query hash [out]
numHashes: number of hashes in signature [in]
numRows: number of rows (2?) in LSH [in]

Definition at line 566 of file blastkmerutils.cpp.

References b, do_pearson_hash(), ncbi::grid::netcache::search::fields::key, n, r(), and ct::sort().

Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().

◆ get_LSH_match_from_hash()

void get_LSH_match_from_hash ( const vector< vector< uint32_t > > &  query_LSH_hash,
const uint64_t *  lsh_array,
vector< set< uint32_t > > &  candidates 
)

Definition at line 686 of file blastkmerutils.cpp.

References i.

Referenced by CBlastKmer::x_RunKmerFile().

◆ GetKValues()

void GetKValues ( vector< vector< int > > &  kvector,
int  k_value,
int  l_value,
int  array_size 
)

Function to get the k sites to compare for Buhler LSH.

Definition at line 629 of file blastkmerutils.cpp.

References CRandom::GetRand(), and i.

Referenced by CBlastKmerBuildIndex::x_BuildIndex().

◆ GetRandomNumbers()

void GetRandomNumbers ( uint32_t *  a,
uint32_t *  b,
int  numHashes 
)

Get the random numbers for the hash function.

Definition at line 613 of file blastkmerutils.cpp.

References a, b, CRandom::GetRand(), i, and PKMER_PRIME.

Referenced by CBlastKmerBuildIndex::x_BuildIndex().

◆ minhash_query()

bool minhash_query ( const string &  query,
vector< vector< uint32_t > > &  seq_hash,
int  num_hashes,
uint32_t *  a,
uint32_t *  b,
int  do_seg,
int  kmerNum,
int  alphabetChoice,
int  chunkSize 
)

◆ minhash_query2()

bool minhash_query2 ( const string &  query,
vector< vector< uint32_t > > &  seq_hash,
int  kmerNum,
int  numHashes,
int  alphabetChoice,
vector< int >  badMers,
int  chunkSize 
)

Hash the query for the minimum values.

Parameters
query: as an ASCII string [in]
seq_hash: hash values for all kmers [out]
kmerNum: number of letters in a KMER [in]
numHashes: number of hashes in a signature [in]
alphabetChoice: 15 or 10 letters [in]
badMers: Overrepresented KMERS [in]

Definition at line 479 of file blastkmerutils.cpp.

References set< Key, Compare >::begin(), BlastKmerBreakUpSequence(), BlastKmerGetKmerSet2(), set< Key, Compare >::empty(), set< Key, Compare >::end(), FNV_hash(), i, query, and ct::sort().

Referenced by BOOST_AUTO_TEST_CASE(), and CBlastKmer::x_ProcessQuery().

◆ neighbor_query()

void neighbor_query ( const vector< vector< uint32_t > > &  query_hash,
const uint64_t *  lsh,
vector< set< uint32_t > > &  candidates,
CMinHashFile &  mhfile,
int  num_hashes,
int  min_hits,
double  thresh,
TBlastKmerPrelimScoreVector &  score_vector,
BlastKmerStats &  kmer_stats,
int  kmerVer 
)

◆ s_BlastKmerVerifyVolume()

static int s_BlastKmerVerifyVolume ( CMinHashFile &  mhfile,
string &  error_msg,
int  volume 
)
static

◆ s_HashHashQuery()

void s_HashHashQuery ( const vector< vector< uint32_t > > &  query_hash,
vector< vector< uint32_t > > &  query_hash_hash,
int  compress,
int  version 
)

Definition at line 706 of file blastkmerutils.cpp.

References compress, i, int, n, pearson_hash_int2byte(), pearson_hash_int2short(), ct::sort(), and version.

Referenced by neighbor_query().

◆ uhash()

uint32_t uhash ( uint64_t  x,
uint64_t  a,
uint64_t  b 
)
inline

Definition at line 56 of file blastkmerutils.cpp.

References a, b, and PKMER_PRIME.

Referenced by minhash_query().

◆ USING_SCOPE() [1/2]

USING_SCOPE ( blast  )

◆ USING_SCOPE() [2/2]

USING_SCOPE ( objects  )
Modified on Thu Feb 29 12:19:45 2024 by modify_doxy.py rev. 669887