63 const int kCheckingMatches = s &
mask;
70 if ((kCheckingMatches >> right_index) % 2 == 1)
72 if ((
mask >> right_index) %2 == 1)
73 left_index = right_index;
80 *rightOne = right_index;
81 *rightMaskOnly = left_index;
100 return (rightOne - rightMaskOnly);
108 Int4 prefixMatchedBitPattern = 0;
119 maskShiftPlus1 = (
mask << 1) + 1;
120 for (
i = 0;
i < len1;
i++) {
124 prefixMatchedBitPattern =
125 ((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
127 if (prefixMatchedBitPattern &
mask) {
130 hitArray[numMatches++] =
i;
131 hitArray[numMatches++] =
i -
s_LenOf(prefixMatchedBitPattern,
mask)+1;
157 Uint4 prefixMatchedBitPattern;
164 Int4 twiceNumHits = 0;
169 const Int4 kMaskShiftPlus1 = (kMatchMask << 1)+1;
173 prefixMatchedBitPattern =
174 ((kMatchMask * ((1 << (pos+1))-1)*2) + (1 << (pos+1))-1) &
178 remain = (
len-pos) % 4;
181 prefixMatchedBitPattern = kMaskShiftPlus1;
185 for (
i = 0;
i < end;
i++) {
186 if ( (
tmp = (prefixMatchedBitPattern &
188 for (j = 0; j < 4; j++) {
189 if (
tmp & kMatchMask) {
190 hitArray[twiceNumHits++] =
i*4 + j + pos;
191 hitArray[twiceNumHits++] =
i*4 + j + pos -
197 prefixMatchedBitPattern =
198 (((prefixMatchedBitPattern << 4) | kMask2) &
202 if ( (
tmp = (prefixMatchedBitPattern &
204 for (j = 0; j < remain; j++) {
205 if (
tmp & kMatchMask) {
206 hitArray[twiceNumHits++] =
i*4+j + pos;
207 hitArray[twiceNumHits++] =
i*4+j + pos -
243 for (
i = 0;
i < numWords;
i++) {
245 if (x >= kOverflowThreshold) {
246 a[
i] = x - kOverflowThreshold;
260 for (
i = 0;
i < numWords;
i++)
268 Int4 returnValue = 0;
270 for (
i = 0;
i < numWords;
i++) {
293 for (wordIndex = 0; wordIndex < numWords; wordIndex++) {
295 if ((s[wordIndex] >> bitIndex) % 2 == 1)
297 if ((
mask[wordIndex] >> bitIndex) %2 == 1)
320 Int4 *prefixMatchedBitPattern;
322 Int4 twiceNumHits = 0;
332 prefixMatchedBitPattern = (
Int4 *)
calloc(num_words,
sizeof(
Int4));
333 for (wordIndex = 0; wordIndex < num_words; wordIndex++) {
335 prefixMatchedBitPattern[wordIndex] = 0;
339 for (
i = 0;
i < len1;
i++) {
347 hitArray[twiceNumHits++] =
i;
348 hitArray[twiceNumHits++] =
i -
352 sfree(prefixMatchedBitPattern);
372 Int4 twiceHitsOneCall;
377 Int4 nextPosInHitArray;
378 Int4 hitIndex1, hitIndex2;
395 if (twiceNumHits < 2)
398 for (wordIndex = most_specific_word+1;
399 wordIndex < multiword_items->
numWords; wordIndex++) {
408 nextPosInHitArray = 0;
409 for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
412 hitArray[hitIndex2]+1,
413 MIN(
len-hitArray[hitIndex2]-1,
414 extra_items->
spacing[wordIndex-1] +
416 is_dna, pattern_blk);
417 for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
418 hitArray1[nextPosInHitArray+hitIndex1] =
419 hitArray[hitIndex2]+hitArray1[nextPosInHitArray+hitIndex1]+1;
420 hitArray1[nextPosInHitArray+hitIndex1+1] = hitArray[hitIndex2+1];
422 nextPosInHitArray += twiceHitsOneCall;
424 twiceNumHits = nextPosInHitArray;
425 if (twiceNumHits < 2)
428 for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
429 hitArray[hitIndex2] = hitArray1[hitIndex2];
432 for (wordIndex = most_specific_word-1; wordIndex >=0;
442 nextPosInHitArray = 0;
443 for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
444 start = hitArray[hitIndex2+1] - extra_items->
spacing[wordIndex] -
450 hitArray[hitIndex2+1]-start, is_dna, pattern_blk);
451 for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
452 hitArray1[nextPosInHitArray+hitIndex1] = hitArray[hitIndex2];
453 hitArray1[nextPosInHitArray+hitIndex1+1] = start +
454 hitArray1[nextPosInHitArray+hitIndex1+1];
456 nextPosInHitArray += twiceHitsOneCall;
458 twiceNumHits = nextPosInHitArray;
459 if (twiceNumHits < 2)
462 for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
463 hitArray[hitIndex2] = hitArray1[hitIndex2];
481 const Int4 kMinPhiLookupSize = 8;
569 Int4 i, twiceNumHits;
576 loc_length = to - from + 1;
577 sequence =
query->sequence + from;
582 for (
i = 0;
i < twiceNumHits;
i += 2) {
583 if (hitArray[
i+1]+from == 0 &&
589 hitArray[
i]-hitArray[
i+1]+1);
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo *qinfo, EBlastProgramType program, Int4 query_index)
Obtains the sequence length for a given query in the query, without taking into consideration any app...
static const char location[]
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define MIN(a, b)
returns smaller of a and b.
#define INT4_MAX
largest nubmer represented by signed int
void * BlastMemDup(const void *orig, size_t size)
Copies memory using memcpy and malloc.
Uint1 Boolean
bool replacment for C
#define ASSERT
macro for assert.
static Int4 s_FindHitsShortDNA(Int4 *hitArray, const Uint1 *seq, Int4 pos, Int4 len, const SPHIPatternSearchBlk *pattern_blk)
Find hits when sequence is DNA and pattern is short returns twice the number of hits.
static Int4 s_LenOfL(Int4 *s, Int4 *mask, Int4 numWords)
Returns the difference between the offset F of a first 1-bit in a word sequence and the first offset ...
static Int4 s_FindHitsVeryLong(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *pattern_blk)
Find matches when pattern is very long,.
static Int4 s_LenOf(Int4 s, Int4 mask)
Looks for 1 bits in the same position of s and mask Let R be the rightmost position where s and mask ...
Int4 PHIGetPatternOccurrences(const SPHIPatternSearchBlk *pattern_blk, const BLAST_SequenceBlk *query, const BlastSeqLoc *location, Boolean is_dna, BlastQueryInfo *query_info)
Finds all pattern hits in a given query and saves them in the previously allocated SPHIQueryInfo stru...
Int4 _PHIBlastFindHitsShort(Int4 *hitArray, const Uint1 *seq, Int4 len1, const SPHIPatternSearchBlk *pattern_blk)
Routine to find hits of pattern to sequence when sequence is proteins.
SPHIQueryInfo * SPHIQueryInfoFree(SPHIQueryInfo *pat_info)
Frees the pattern information structure.
Int4 FindPatternHits(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *pattern_blk)
Find the places where the pattern matches seq; 3 different methods are used depending on the length o...
void _PHIPatternWordsLeftShift(Int4 *a, Uint1 b, Int4 numWords)
Shift each word in the array left by 1 bit and add bit b.
static Int2 s_PHIBlastAddPatternHit(SPHIQueryInfo *pattern_info, Int4 offset, Int4 length)
Adds a new pattern hit to the PHI BLAST pseudo lookup table.
Int4 _PHIPatternWordsBitwiseAnd(Int4 *result, Int4 *a, Int4 *b, Int4 numWords)
Do a word-by-word bit-wise and of two integer arrays and put the result in a new array.
static Int4 s_FindHitsShortHead(Int4 *hitArray, const Uint1 *seq, Int4 start, Int4 len, Uint1 is_dna, const SPHIPatternSearchBlk *pattern_blk)
Top level routine to find hits when pattern has a short description.
void _PHIGetRightOneBits(Int4 s, Int4 mask, Int4 *rightOne, Int4 *rightMaskOnly)
Looks for 1 bits in the same position of s and mask Let R be the rightmost position where s and mask ...
static Int4 s_FindHitsLong(Int4 *hitArray, const Uint1 *seq, Int4 len1, const SPHIPatternSearchBlk *pattern_blk)
Finds places where pattern matches seq and returns them as pairs of positions in consecutive entries ...
void _PHIPatternWordsBitwiseOr(Int4 *a, Int4 *b, Int4 numWords)
Do a word-by-word bit-wise or of two integer arrays and put the result back in the first array.
SPHIQueryInfo * SPHIQueryInfoCopy(const SPHIQueryInfo *pat_info)
Copies the SPHIQueryInfo structure.
SPHIQueryInfo * SPHIQueryInfoNew()
Allocates the pattern occurrences structure.
Functions for finding pattern matches in sequence (PHI-BLAST).
@ eMultiWord
Does pattern consist of a multiple words?
@ eOneWord
Does pattern consist of a single word?
#define PHI_BITS_PACKED_PER_WORD
Number of bits packed in a word.
#define PHI_MAX_HIT
Maximal size of an array of pattern hits.
Auxiliary functions for finding pattern matches in sequence (PHI-BLAST), that are used in multiple so...
Structure to hold a sequence.
The query related information.
struct SPHIQueryInfo * pattern_info
Counts of PHI BLAST pattern occurrences, used in PHI BLAST only.
Used to hold a set of positions, mostly used for filtering.
SSeqRange * ssr
location data on the sequence.
struct BlastSeqLoc * next
next in linked list
Uint4 DNAprefixSLL[100][256]
Where prefix of DNA 4-mer matches pattern, for multiple-word patterns.
Uint4 DNAsuffixSLL[100][256]
Where suffix of DNA 4-mer matches pattern, for multiple-word patterns.
Uint4 * DNAwhichPrefixPosPtr
Prefix position array for DNA patterns.
Uint4 * DNAwhichSuffixPosPtr
Suffix position array for DNA patterns.
Auxiliary items needed for a PHI BLAST search with pattern containing multiple words.
Int4 match_maskL[100]
Bit mask representation of input pattern for long patterns.
SExtraLongPatternItems * extra_long_items
Additional items necessary if pattern contains pieces longer than a word.
SDNALongPatternItems * dna_items
Additional items necessary for a DNA pattern.
Int4 SLL[100][256]
For each letter in the alphabet and each word in the masked pattern representation,...
Int4 bitPatternByLetter[256][11]
Which positions can a character occur in for long patterns.
Int4 numWords
Number of words need to hold bit representation of pattern.
Information about a single pattern occurence in the query.
Int4 length
Length of this pattern occurrence.
Int4 offset
Starting offset of this pattern occurrence.
Structure containing all auxiliary information needed in a pattern search.
SShortPatternItems * one_word_items
Items necessary when pattern fits in one word.
EPatternType flagPatternLength
Indicates if the whole pattern fits in 1 word, each of several parts of the pattern fit in a word,...
SLongPatternItems * multi_word_items
Additional items, when pattern requires multiple words.
In PHI BLAST, structure containing information about all pattern occurrences in query.
char * pattern
Pattern used, saved here for formatting purposes.
Int4 allocated_size
Allocated size of the occurrences array.
Int4 num_patterns
Number of pattern occurrences in query.
SPHIPatternInfo * occurrences
Array of pattern occurrence information structures.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
Auxiliary items needed for a PHI BLAST search with a pattern that fits in a single word.
Int4 * whichPositionPtr
Array of positions where pattern lettern should match, for a single word of the pattern.
SDNAShortPatternItems * dna_items
Additional items for a DNA search.
Int4 match_mask
Bit mask representation of input pattern for patterns that fit in a word.
voidp calloc(uInt items, uInt size)