33 #ifndef ALGO_BLAST_CORE__BLAST_AALOOKUP__H
34 #define ALGO_BLAST_CORE__BLAST_AALOOKUP__H
49 #define AA_HITS_PER_CELL 3
189 #define COMPRESSED_HITS_PER_BACKBONE_CELL 4
190 #define COMPRESSED_HITS_CELL_MASK 0x03
193 #define COMPRESSED_HITS_PER_OVERFLOW_CELL 4
196 #define COMPRESSED_OVERFLOW_CELLS_IN_BANK 209710
200 #define COMPRESSED_OVERFLOW_MAX_BANKS 1024
306 Int4 compressed_alphabet_size,
312 Int4 *scaled_compress_table =
lookup->scaled_compress_table;
315 for(
i = 0;
i < wordsize;
i++) {
316 Int4 ch = scaled_compress_table[word[
i]];
322 index = index / compressed_alphabet_size + ch;
330 #define RPS_HITS_PER_CELL 3
351 #define RPS_BUCKET_SIZE 2048
Int2 RPSLookupTableNew(const BlastRPSInfo *rps_info, BlastRPSLookupTable **lut)
Create a new RPS blast lookup table.
struct BlastCompressedAaLookupTable BlastCompressedAaLookupTable
The lookup table structure for protein searches using a compressed alphabet.
#define COMPRESSED_HITS_PER_OVERFLOW_CELL
number of query offsets to store in an overflow cell
struct AaLookupSmallboneCell AaLookupSmallboneCell
structure defining one cell of the small (i.e., use short) lookup table
Int4 BlastCompressedAaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastCompressedAaLookupTable **lut, const LookupTableOptions *opt, BlastScoreBlk *sbp)
Create a new compressed protein lookup table.
struct CompressedLookupBackboneCell CompressedLookupBackboneCell
structure for hashtable of indexed query offsets
BlastCompressedAaLookupTable * BlastCompressedAaLookupTableDestruct(BlastCompressedAaLookupTable *lookup)
Free the compressed lookup table.
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
#define COMPRESSED_HITS_PER_BACKBONE_CELL
number of query offsets to store in a backbone cell
BlastRPSLookupTable * RPSLookupTableDestruct(BlastRPSLookupTable *lookup)
Free the lookup table.
struct CompressedMixedOffsets CompressedMixedOffsets
"alternative" structure of CompressedLookupBackboneCell storage
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *unmasked_regions, Int4 query_bias)
Index a protein query.
#define AA_HITS_PER_CELL
maximum number of hits in one lookup table cell
struct RPSBucket RPSBucket
structure used for bucket sorting offsets retrieved from the RPS blast lookup table.
struct CompressedOverflowCell CompressedOverflowCell
cell in list for holding query offsets
struct RPSBackboneCell RPSBackboneCell
structure defining one cell of the RPS lookup table
struct BlastAaLookupTable BlastAaLookupTable
The basic lookup table structure for blastp searches.
struct AaLookupBackboneCell AaLookupBackboneCell
structure defining one cell of the compacted lookup table
struct BlastRPSLookupTable BlastRPSLookupTable
The basic lookup table structure for RPS blast searches.
#define RPS_HITS_PER_CELL
maximum number of hits in an RPS backbone cell; this may be redundant (have the same value as AA_HITS...
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
static NCBI_INLINE Int4 s_ComputeCompressedIndex(Int4 wordsize, const Uint1 *word, Int4 compressed_alphabet_size, Int4 *skip, BlastCompressedAaLookupTable *lookup)
Convert a word to use a compressed alphabet.
Definitions used throughout BLAST.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Common definitions for protein and nucleotide lookup tables.
#define PV_ARRAY_TYPE
The pv_array 'native' type.
The structures and functions in blast_options.
RPS BLAST structure definitions.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
static int lookup(const char *name, const struct lookup_int *table)
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint16_t Uint2
2-byte (16-bit) unsigned integer
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
#define NCBI_INLINE
"inline" seems to work on our remaining in-house compilers (WorkShop, Compaq, ICC,...
Uint1 Boolean
bool replacement for C
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
structure defining one cell of the small (i.e., use short) lookup table
union AaLookupSmallboneCell::@4 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Uint2 num_used
number of hits stored for this cell
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Uint2 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Structure to hold a sequence.
The basic lookup table structure for blastp searches.
void * thick_backbone
may point to BackboneCell, SmallboneCell, or TinyboneCell.
Int4 mask
part of index to mask off, that is, top (wordsize*charsize) bits should be discarded.
Int4 ** thin_backbone
the "thin" backbone.
Boolean use_pssm
if TRUE, lookup table construction will assume that the underlying score matrix is position-specific
Int4 lut_word_length
Length in bases of a word indexed by the lookup table.
Int4 charsize
number of bits for a base/residue
void * scansub_callback
function for scanning subject sequences
EBoneType bone_type
type of bone used: 0: normal backbone (using Int4) 1: small backbone (using Uint2) will be determined...
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
Int4 neighbor_matches
the number of neighboring words found while indexing the queries, used for informational/debugging p...
Int4 threshold
the score threshold for neighboring words
Int4 overflow_size
Number of elements in the overflow array.
Int4 alphabet_size
number of letters in the alphabet
Int4 word_length
Length in letters of the full word match required to trigger extension.
Int4 longest_chain
length of the longest chain on the backbone
void * overflow
may point to Int4 or Uint2, the overflow array for the compacted lookup table
Int4 backbone_size
number of cells in the backbone
Int4 exact_matches
the number of exact matches found while indexing the queries, used for informational/debugging purpo...
The lookup table structure for protein searches using a compressed alphabet.
Int4 pv_array_bts
bit-to-shift value for PV array indices
Int4 longest_chain
length of the longest chain on the backbone
void * scansub_callback
function for scanning subject sequences
Int4 threshold
the score threshold for neighboring words
Int4 neighbor_matches
the number of neighboring words found while indexing the queries, used for informational/debugging p...
Int4 exact_matches
the number of exact matches found while indexing the queries, used for informational/debugging purpo...
Int4 alphabet_size
number of letters in the alphabet
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
Int4 reciprocal_alphabet_size
2^32 / compressed_alphabet_size
Uint1 * compress_table
translation table (protein->compressed)
Int4 * scaled_compress_table
scaled version of compress_table
CompressedLookupBackboneCell * backbone
hashtable for storing indexed query offsets
Int4 compressed_alphabet_size
letters in the compressed alphabet
Int4 word_length
Length in letters of the full word match required to trigger extension.
Int4 curr_overflow_bank
current bank to fill-up
Int4 backbone_size
number of cells in the backbone
CompressedOverflowCell ** overflow_banks
array of batches of query offsets that are too numerous to fit in backbone cells
Int4 curr_overflow_cell
occupied cells in the current bank
The RPS engine uses this structure to access all of the RPS blast related data (assumed to be collect...
The basic lookup table structure for RPS blast searches.
Int4 * overflow
the overflow array for the compacted lookup table
RPSBucket * bucket_array
list of buckets
Int4 num_buckets
number of buckets used to sort offsets retrieved from the lookup table
RPSBackboneCell * rps_backbone
the lookup table used for RPS blast
Int4 overflow_size
Number of elements in the overflow array.
Int4 mask
part of index to mask off, that is, top (wordsize*charsize) bits should be discarded.
Int4 charsize
number of bits for a base/residue
Int4 alphabet_size
number of letters in the alphabet
Int4 num_profiles
Number of profiles in RPS database.
Int4 wordsize
number of full bytes in a full word
PV_ARRAY_TYPE * pv
Presence vector bitfield; bit positions that are set indicate that the corresponding thick backbone c...
Int4 backbone_size
number of cells in the backbone
Int4 * rps_seq_offsets
array of start offsets for each RPS DB seq.
Int4 ** rps_pssm
Pointer to memory-mapped RPS Blast profile file.
Structure used for scoring calculations.
Used to hold a set of positions, mostly used for filtering.
structure for hashtable of indexed query offsets
Int4 query_offsets[4]
storage for query offsets local to the backbone cell
CompressedMixedOffsets overflow_list
storage for remote query offsets
union CompressedLookupBackboneCell::@5 payload
structure for holding the list of query offsets
"alternative" structure of CompressedLookupBackboneCell storage
Int4 query_offsets[4 -2]
the query offsets stored locally
CompressedOverflowCell * head
head of linked list of cells of query offsets stored off the backbone
cell in list for holding query offsets
struct CompressedOverflowCell * next
pointer to next cell
Int4 query_offsets[4]
the query offsets stored in the cell
Options needed to construct a lookup table. Also needed: query sequence and query length.
structure defining one cell of the RPS lookup table
Int4 num_used
number of hits in this cell
Int4 entries[3]
if the number of hits in this cell is RPS_HITS_PER_CELL or less, all hits go into this array.
structure used for bucket sorting offsets retrieved from the RPS blast lookup table.
Int4 num_filled
number of offset pairs currently in bucket
Int4 num_alloc
max number of offset pairs bucket can hold
BlastOffsetPair * offset_pairs
list of offset pairs
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...