NCBI C++ ToolKit
pattern.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: pattern.c 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  */
29 
30 /** @file pattern.c
31  * Functions for finding pattern matches in sequence.
32  * The following functions are defined here. @sa phi_lookup.h
33  *
34  * <pre>
35  * SPHIQueryInfoNew, SPHIQueryInfoFree, SPHIQueryInfoCopy - life cycle functions
36  * for the SPHIQueryInfo structure for saving pattern occurrences in query.
37  *
38  * Main API function to find and save pattern occurrences in query, and functions
39  * called from it:
40  *
41  * PHIGetPatternOccurrences
42  * FindPatternHits
43  * if ( pattern fits into a single word)
44  * s_FindHitsShortHead
45  * else if ( pattern fits into several words )
46  * s_FindHitsLong
47  * else if ( pattern contains parts longer than a word )
48  * s_FindHitsVeryLong
49  * calls s_FindHitsShortHead for every word and extends them
50  *
51  * For pattern occurrences in subject (database),
52  * FindPatternHits is called from PHIBlastScanSubject.
53  * </pre>
54  *
55  */
56 
58 #include "pattern_priv.h"
59 
60 void
61 _PHIGetRightOneBits(Int4 s, Int4 mask, Int4* rightOne, Int4* rightMaskOnly)
62 {
63  const int kCheckingMatches = s & mask; /*look for 1 bits in same position*/
64  Int4 right_index; /*loop index looking for 1 in kCheckingMatches*/
65  Int4 left_index; /*rightmost bit that is 1 in the mask only*/
66 
67  left_index = -1;
68 
69  for (right_index = 0; right_index < PHI_BITS_PACKED_PER_WORD; right_index++) {
70  if ((kCheckingMatches >> right_index) % 2 == 1)
71  break;
72  if ((mask >> right_index) %2 == 1)
73  left_index = right_index;
74  }
75  /* If there was no break from the loop, s and mask have no 1 bits in same
76  position, so rightOne should be 0. */
77  if (right_index == PHI_BITS_PACKED_PER_WORD)
78  right_index = 0;
79 
80  *rightOne = right_index;
81  *rightMaskOnly = left_index;
82 }
83 
84 /** Looks for 1 bits in the same position of s and mask
85  * Let R be the rightmost position where s and mask both have a 1.
86  * Let L < R be the rightmost position where mask has a 1, if any,
87  * or -1 otherwise.
88  * @param s Some number [in]
89  * @param mask Mask [in]
90  * @return (R - L).
91  */
92 static Int4
94 {
95  Int4 rightOne; /*loop index looking for 1 in kCheckingMatches*/
96  Int4 rightMaskOnly; /*rightmost bit that is 1 in the mask only*/
97 
98  _PHIGetRightOneBits(s, mask, &rightOne, &rightMaskOnly);
99 
100  return (rightOne - rightMaskOnly);
101 }
102 
103 Int4
104 _PHIBlastFindHitsShort(Int4 *hitArray, const Uint1* seq, Int4 len1,
105  const SPHIPatternSearchBlk *pattern_blk)
106 {
107  Int4 i; /*loop index on sequence*/
108  Int4 prefixMatchedBitPattern = 0; /*indicates where pattern aligns
109  with seq; e.g., if value is 9 = 0101 then
110  last 3 chars of seq match first 3 positions in pattern
111  and last 1 char of seq matches 1 position of pattern*/
112  Int4 numMatches = 0; /*number of matches found*/
113  Int4 mask; /*mask of input pattern positions after which
114  a match can be declared*/
115  Int4 maskShiftPlus1; /*mask shifted left 1 plus 1 */
116  const SShortPatternItems* pattern_items = pattern_blk->one_word_items;
117 
118  mask = pattern_items->match_mask;
119  maskShiftPlus1 = (mask << 1) + 1;
120  for (i = 0; i < len1; i++) {
121  /*shift the positions matched by 1 and try to match up against
122  the next character, also allow next character to match the
123  first position*/
124  prefixMatchedBitPattern =
125  ((prefixMatchedBitPattern << 1) | maskShiftPlus1) &
126  pattern_items->whichPositionPtr[seq[i]];
127  if (prefixMatchedBitPattern & mask) {
128  /*first part of pair is index of place in seq where match
129  ends; second part is where match starts*/
130  hitArray[numMatches++] = i;
131  hitArray[numMatches++] = i - s_LenOf(prefixMatchedBitPattern, mask)+1;
132  if (numMatches == PHI_MAX_HIT)
133  {
134  /** @todo FIXME: Pass back an error message saying that
135  numMatches matches are saved, others discarded. */
136  break;
137  }
138  }
139  }
140  return numMatches;
141 }
142 
143 /** Find hits when sequence is DNA and pattern is short returns twice the number
144  * of hits.
145  * @param hitArray Array of hits to pass back [out]
146  * @param seq The input sequence [in]
147  * @param pos Starting position [in]
148  * @param len Length of sequence seq [in]
149  * @param pattern_blk Carries variables that keep track of search
150  * parameters. [in]
151  * @return Number of hits found.
152  */
153 static Int4
154 s_FindHitsShortDNA(Int4* hitArray, const Uint1* seq, Int4 pos, Int4 len,
155  const SPHIPatternSearchBlk *pattern_blk)
156 {
157  Uint4 prefixMatchedBitPattern; /*indicates where pattern aligns
158  with sequence*/
159  Uint4 tmp; /*intermediate result of masked comparisons*/
160  Int4 i; /*index on seq*/
161  Int4 end; /*count of number of 4-mer iterations needed*/
162  Int4 remain; /*0,1,2,3 DNA letters left over*/
163  Int4 j; /*index on suffixRemnant*/
164  Int4 twiceNumHits = 0; /*twice the number of hits*/
165  const SShortPatternItems* pattern_items = pattern_blk->one_word_items;
166  const Int4 kMatchMask = pattern_items->match_mask;
167  /* Mask to match agaist */
168  const Uint4 kMask2 = kMatchMask*PHI_BITS_PACKED_PER_WORD+15;
169  const Int4 kMaskShiftPlus1 = (kMatchMask << 1)+1; /* kMask2 shifted plus 1*/
170 
171  if (pos != 0) {
172  pos = 4 - pos;
173  prefixMatchedBitPattern =
174  ((kMatchMask * ((1 << (pos+1))-1)*2) + (1 << (pos+1))-1) &
175  pattern_items->dna_items->DNAwhichSuffixPosPtr[seq[0]];
176  seq++;
177  end = (len-pos)/4;
178  remain = (len-pos) % 4;
179  }
180  else {
181  prefixMatchedBitPattern = kMaskShiftPlus1;
182  end = len/4;
183  remain = len % 4;
184  }
185  for (i = 0; i < end; i++) {
186  if ( (tmp = (prefixMatchedBitPattern &
187  pattern_items->dna_items->DNAwhichPrefixPosPtr[seq[i]]))) {
188  for (j = 0; j < 4; j++) {
189  if (tmp & kMatchMask) {
190  hitArray[twiceNumHits++] = i*4 + j + pos;
191  hitArray[twiceNumHits++] = i*4 + j + pos -
192  s_LenOf(tmp & kMatchMask, kMatchMask) + 1;
193  }
194  tmp = (tmp << 1);
195  }
196  }
197  prefixMatchedBitPattern =
198  (((prefixMatchedBitPattern << 4) | kMask2) &
199  pattern_items->dna_items->DNAwhichSuffixPosPtr[seq[i]]);
200  }
201  /* In the last byte check bits only up to 'remain' */
202  if ( (tmp = (prefixMatchedBitPattern &
203  pattern_items->dna_items->DNAwhichPrefixPosPtr[seq[i]]))) {
204  for (j = 0; j < remain; j++) {
205  if (tmp & kMatchMask) {
206  hitArray[twiceNumHits++] = i*4+j + pos;
207  hitArray[twiceNumHits++] = i*4+j + pos -
208  s_LenOf(tmp & kMatchMask, kMatchMask) + 1;
209  }
210  tmp = (tmp << 1);
211  }
212  }
213  return twiceNumHits;
214 }
215 
216 /** Top level routine to find hits when pattern has a short description.
217  * @param hitArray Array of hits to pass back [out]
218  * @param seq Input sequence [in]
219  * @param start Position to start at in seq [in]
220  * @param len Length of seq [in]
221  * @param is_dna 1 if and only if seq is a DNA sequence [in]
222  * @param pattern_blk Carries variables that keep track of search
223  * parameters. [in]
224  * @return Number of matches found.
225  */
226 static Int4
227 s_FindHitsShortHead(Int4* hitArray, const Uint1* seq, Int4 start, Int4 len,
228  Uint1 is_dna, const SPHIPatternSearchBlk *pattern_blk)
229 {
230  if (is_dna)
231  return s_FindHitsShortDNA(hitArray, &seq[start/4], start % 4, len, pattern_blk);
232  return _PHIBlastFindHitsShort(hitArray, &seq[start], len, pattern_blk);
233 }
234 
235 void
237 {
238  Int4 x;
239  Int4 i; /*index on words*/
240  /* Overflow threshold */
241  const Int4 kOverflowThreshold = (1 << PHI_BITS_PACKED_PER_WORD);
242 
243  for (i = 0; i < numWords; i++) {
244  x = (a[i] << 1) + b;
245  if (x >= kOverflowThreshold) {
246  a[i] = x - kOverflowThreshold;
247  b = 1;
248  }
249  else {
250  a[i] = x;
251  b = 0;
252  }
253  }
254 }
255 
256 void
258 {
259  Int4 i; /*index over words*/
260  for (i = 0; i < numWords; i++)
261  a[i] = (a[i] | b[i]);
262 }
263 
264 Int4
266 {
267  Int4 i; /*index over words*/
268  Int4 returnValue = 0;
269 
270  for (i = 0; i < numWords; i++) {
271  if ((result[i] = (a[i] & b[i])))
272  returnValue = 1;
273  }
274  return returnValue;
275 }
276 
277 /** Returns the difference between the offset F of a first 1-bit in a word
278  * sequence and the first offset G < F of a 1-bit in the pattern mask. If
279  * such G does not exist, it is set to -1.
280  * @param s Input sequence [in]
281  * @param mask Array of word masks [in]
282  * @param numWords Number of words in s. [in]
283  * @return F - G, see explanation above.
284  */
285 static Int4
286 s_LenOfL(Int4 *s, Int4 *mask, Int4 numWords)
287 {
288  Int4 bitIndex; /*loop index over bits in a word*/
289  Int4 wordIndex; /*loop index over words*/
290  Int4 firstOneInMask;
291 
292  firstOneInMask = -1;
293  for (wordIndex = 0; wordIndex < numWords; wordIndex++) {
294  for (bitIndex = 0; bitIndex < PHI_BITS_PACKED_PER_WORD; bitIndex++) {
295  if ((s[wordIndex] >> bitIndex) % 2 == 1)
296  return wordIndex*PHI_BITS_PACKED_PER_WORD+bitIndex-firstOneInMask;
297  if ((mask[wordIndex] >> bitIndex) %2 == 1)
298  firstOneInMask = wordIndex*PHI_BITS_PACKED_PER_WORD+bitIndex;
299  }
300  }
301  /* This point should never be reached. */
302  return -1;
303 }
304 
305 /** Finds places where pattern matches seq and returns them as
306  * pairs of positions in consecutive entries of hitArray;
307  * similar to _PHIBlastFindHitsShort
308  * @param hitArray Array of hits to return [out]
309  * @param seq Input sequence [in]
310  * @param len1 Length of seq [in]
311  * @param pattern_blk carries all the pattern variables
312  * @return twice the number of hits.
313  */
314 static Int4
315 s_FindHitsLong(Int4 *hitArray, const Uint1* seq, Int4 len1,
316  const SPHIPatternSearchBlk *pattern_blk)
317 {
318  Int4 wordIndex; /*index on words in mask*/
319  Int4 i; /*loop index on seq */
320  Int4 *prefixMatchedBitPattern; /*see similar variable in
321  _PHIBlastFindHitsShort*/
322  Int4 twiceNumHits = 0; /*counter for hitArray*/
323  Int4 *mask; /*local copy of match_maskL version of pattern
324  indicates after which positions a match can be declared*/
325  Int4 *matchResult; /*Array of words to hold the result of the
326  final test for a match*/
327  SLongPatternItems* pattern_items = pattern_blk->multi_word_items;
328  Int4 num_words = pattern_items->numWords;
329 
330  matchResult = (Int4 *) calloc(num_words, sizeof(Int4));
331  mask = (Int4 *) calloc(num_words, sizeof(Int4));
332  prefixMatchedBitPattern = (Int4 *) calloc(num_words, sizeof(Int4));
333  for (wordIndex = 0; wordIndex < num_words; wordIndex++) {
334  mask[wordIndex] = pattern_items->match_maskL[wordIndex];
335  prefixMatchedBitPattern[wordIndex] = 0;
336  }
337  /* This is a multiword version of the algorithm in _PHIBlastFindHitsShort */
338  _PHIPatternWordsLeftShift(mask, 1, num_words);
339  for (i = 0; i < len1; i++) {
340  _PHIPatternWordsLeftShift(prefixMatchedBitPattern, 0, num_words);
341  _PHIPatternWordsBitwiseOr(prefixMatchedBitPattern, mask, num_words);
342  _PHIPatternWordsBitwiseAnd(prefixMatchedBitPattern, prefixMatchedBitPattern,
343  pattern_items->bitPatternByLetter[seq[i]],
344  num_words);
345  if (_PHIPatternWordsBitwiseAnd(matchResult, prefixMatchedBitPattern,
346  pattern_items->match_maskL, num_words)) {
347  hitArray[twiceNumHits++] = i;
348  hitArray[twiceNumHits++] = i -
349  s_LenOfL(matchResult, pattern_items->match_maskL, num_words) + 1;
350  }
351  }
352  sfree(prefixMatchedBitPattern);
353  sfree(matchResult);
354  sfree(mask);
355  return twiceNumHits;
356 }
357 
358 /** Find matches when pattern is very long,
359  * @param hitArray Array to pass back pairs of start and end positions for
360  * hits [out]
361  * @param seq Input sequence [in]
362  * @param len Length of seq [in]
363  * @param is_dna Is sequence DNA or protein? [in]
364  * @param pattern_blk carries all the pattern variables [in]
365  * @return Twice the number of hits found.
366  */
367 static Int4
368 s_FindHitsVeryLong(Int4 *hitArray, const Uint1* seq, Int4 len, Boolean is_dna,
369  const SPHIPatternSearchBlk *pattern_blk)
370 {
371  Int4 twiceNumHits; /*twice the number of matches*/
372  Int4 twiceHitsOneCall; /*twice the number of hits in one call to
373  _PHIBlastFindHitsShort */
374  Int4 wordIndex; /*index over words in pattern*/
375  Int4 start; /*start position in sequence for calls to _PHIBlastFindHitsShort */
376  Int4 hitArray1[PHI_MAX_HIT]; /*used to get hits against different words*/
377  Int4 nextPosInHitArray; /*next available position in hitArray1 */
378  Int4 hitIndex1, hitIndex2; /*indices over hitArray1*/
379  SLongPatternItems* multiword_items = pattern_blk->multi_word_items;
380  SShortPatternItems* word_items = pattern_blk->one_word_items;
381  SExtraLongPatternItems* extra_items = multiword_items->extra_long_items;
382  Int4 most_specific_word = extra_items->whichMostSpecific;
383 
384  word_items->whichPositionPtr = multiword_items->SLL[most_specific_word];
385  word_items->match_mask = multiword_items->match_maskL[most_specific_word];
386  if (is_dna) {
387  word_items->dna_items->DNAwhichPrefixPosPtr =
388  multiword_items->dna_items->DNAprefixSLL[most_specific_word];
389  word_items->dna_items->DNAwhichSuffixPosPtr =
390  multiword_items->dna_items->DNAsuffixSLL[most_specific_word];
391  }
392  /*find matches to most specific word of pattern*/
393  twiceNumHits =
394  s_FindHitsShortHead(hitArray, seq, 0, len, is_dna, pattern_blk);
395  if (twiceNumHits < 2)
396  return 0;
397  /*extend matches word by word*/
398  for (wordIndex = most_specific_word+1;
399  wordIndex < multiword_items->numWords; wordIndex++) {
400  word_items->whichPositionPtr = multiword_items->SLL[wordIndex];
401  word_items->match_mask = multiword_items->match_maskL[wordIndex];
402  if (is_dna) {
403  word_items->dna_items->DNAwhichPrefixPosPtr =
404  multiword_items->dna_items->DNAprefixSLL[wordIndex];
405  word_items->dna_items->DNAwhichSuffixPosPtr =
406  multiword_items->dna_items->DNAsuffixSLL[wordIndex];
407  }
408  nextPosInHitArray = 0;
409  for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
410  twiceHitsOneCall =
411  s_FindHitsShortHead(&hitArray1[nextPosInHitArray], seq,
412  hitArray[hitIndex2]+1,
413  MIN(len-hitArray[hitIndex2]-1,
414  extra_items->spacing[wordIndex-1] +
415  extra_items->numPlacesInWord[wordIndex]),
416  is_dna, pattern_blk);
417  for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
418  hitArray1[nextPosInHitArray+hitIndex1] =
419  hitArray[hitIndex2]+hitArray1[nextPosInHitArray+hitIndex1]+1;
420  hitArray1[nextPosInHitArray+hitIndex1+1] = hitArray[hitIndex2+1];
421  }
422  nextPosInHitArray += twiceHitsOneCall;
423  }
424  twiceNumHits = nextPosInHitArray;
425  if (twiceNumHits < 2)
426  return 0;
427  /*copy back matches that extend */
428  for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
429  hitArray[hitIndex2] = hitArray1[hitIndex2];
430  }
431  /*extend each match back one word at a time*/
432  for (wordIndex = most_specific_word-1; wordIndex >=0;
433  wordIndex--) {
434  word_items->whichPositionPtr = multiword_items->SLL[wordIndex];
435  word_items->match_mask = multiword_items->match_maskL[wordIndex];
436  if (is_dna) {
437  word_items->dna_items->DNAwhichPrefixPosPtr =
438  multiword_items->dna_items->DNAprefixSLL[wordIndex];
439  word_items->dna_items->DNAwhichSuffixPosPtr =
440  multiword_items->dna_items->DNAsuffixSLL[wordIndex];
441  }
442  nextPosInHitArray = 0;
443  for (hitIndex2 = 0; hitIndex2 < twiceNumHits; hitIndex2 += 2) {
444  start = hitArray[hitIndex2+1] - extra_items->spacing[wordIndex] -
445  extra_items->numPlacesInWord[wordIndex];
446  if (start < 0)
447  start = 0;
448  twiceHitsOneCall =
449  s_FindHitsShortHead(&hitArray1[nextPosInHitArray], seq, start,
450  hitArray[hitIndex2+1]-start, is_dna, pattern_blk);
451  for (hitIndex1 = 0; hitIndex1 < twiceHitsOneCall; hitIndex1+= 2) {
452  hitArray1[nextPosInHitArray+hitIndex1] = hitArray[hitIndex2];
453  hitArray1[nextPosInHitArray+hitIndex1+1] = start +
454  hitArray1[nextPosInHitArray+hitIndex1+1];
455  }
456  nextPosInHitArray += twiceHitsOneCall;
457  }
458  twiceNumHits = nextPosInHitArray;
459  if (twiceNumHits < 2)
460  return 0;
461  /*copy back matches that extend*/
462  for (hitIndex2 = 0; hitIndex2 < nextPosInHitArray; hitIndex2++)
463  hitArray[hitIndex2] = hitArray1[hitIndex2];
464  }
465  return twiceNumHits;
466 }
467 
468 Int4 FindPatternHits(Int4 * hitArray, const Uint1* seq, Int4 len,
469  Boolean is_dna, const SPHIPatternSearchBlk * pattern_blk)
470 {
471  if (pattern_blk->flagPatternLength == eOneWord)
472  return s_FindHitsShortHead(hitArray, seq, 0, len, is_dna, pattern_blk);
473  if (pattern_blk->flagPatternLength == eMultiWord)
474  return s_FindHitsLong(hitArray, seq, len, pattern_blk);
475  return s_FindHitsVeryLong(hitArray, seq, len, is_dna, pattern_blk);
476 }
477 
479 {
480  SPHIQueryInfo* pattern_info;
481  const Int4 kMinPhiLookupSize = 8; /* Minimal allocation size for the array
482  of pattern occurrences. */
483 
484  if ((pattern_info =
485  (SPHIQueryInfo*) calloc(1, sizeof(SPHIQueryInfo))) == NULL)
486  return NULL;
487  pattern_info->allocated_size = kMinPhiLookupSize;
488  if ((pattern_info->occurrences =
489  (SPHIPatternInfo*) calloc(kMinPhiLookupSize, sizeof(SPHIPatternInfo)))
490  == NULL)
491  return NULL;
492  return pattern_info;
493 }
494 
497 {
498  if (pat_info) {
499  sfree(pat_info->occurrences);
500  sfree(pat_info->pattern);
501  sfree(pat_info);
502  }
503  return NULL;
504 }
505 
508 {
509  SPHIQueryInfo* retval = NULL;
510 
511  if (!pat_info)
512  return retval;
513 
514  retval =
515  (SPHIQueryInfo*) BlastMemDup(pat_info, sizeof(SPHIQueryInfo));
516  retval->pattern =
517  (char *) BlastMemDup(pat_info->pattern, 1+strlen(pat_info->pattern));
518  retval->occurrences = (SPHIPatternInfo*)
519  BlastMemDup(pat_info->occurrences,
520  pat_info->num_patterns*sizeof(SPHIPatternInfo));
521  return retval;
522 }
523 
524 /** Adds a new pattern hit to the PHI BLAST pseudo lookup table.
525  * @param pattern_info The query pattern information structure. [in] [out]
526  * @param offset Offset in query at which pattern was found. [in]
527  * @param length Length of the pattern at this offset. [in]
528  */
529 static Int2
531  Int4 offset, Int4 length)
532 {
533  SPHIPatternInfo* occurrence_array;
534  Int4 pat_index = pattern_info->num_patterns;
535 
536  if (pat_index >= pattern_info->allocated_size) {
537  if ((occurrence_array = (SPHIPatternInfo*)
538  realloc(pattern_info->occurrences,
539  2*pattern_info->allocated_size*sizeof(SPHIPatternInfo)))
540  == NULL)
541  return -1;
542  pattern_info->occurrences = occurrence_array;
543  pattern_info->allocated_size *= 2;
544  }
545 
546  pattern_info->occurrences[pat_index].offset = offset;
547  pattern_info->occurrences[pat_index].length = length;
548  ++pattern_info->num_patterns;
549 
550  return 0;
551 }
552 
554  const BLAST_SequenceBlk * query,
555  const BlastSeqLoc * location,
556  Boolean is_dna,
557  BlastQueryInfo * query_info)
558 {
559  const BlastSeqLoc* loc;
560  Int4* hitArray;
562  SPHIQueryInfo* pattern_info = query_info->pattern_info;
563 
564  ASSERT(pattern_info);
565 
566  hitArray = (Int4 *) calloc(2*query->length, sizeof(Int4));
567 
568  for(loc=location; loc; loc=loc->next) {
569  Int4 i, twiceNumHits;
570  Int4 from, to;
571  Int4 loc_length;
572  Uint1* sequence;
573 
574  from = loc->ssr->left;
575  to = loc->ssr->right;
576  loc_length = to - from + 1;
577  sequence = query->sequence + from;
578 
579  twiceNumHits = FindPatternHits(hitArray, sequence, loc_length, is_dna,
580  pattern_blk);
581 
582  for (i = 0; i < twiceNumHits; i += 2) {
583  if (hitArray[i+1]+from == 0 &&
584  hitArray[i]-hitArray[i+1]+1 == BlastQueryInfoGetQueryLength(query_info, program, 0))
585  {
586  return INT4_MAX;
587  }
588  s_PHIBlastAddPatternHit(pattern_info, hitArray[i+1]+from,
589  hitArray[i]-hitArray[i+1]+1);
590  }
591  }
592 
593  sfree(hitArray);
594 
595  return pattern_info->num_patterns;
596 }
597 
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypePhiBlastp
Definition: blast_program.h:86
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo *qinfo, EBlastProgramType program, Int4 query_index)
Obtains the sequence length for a given query in the query, without taking into consideration any app...
ncbi::TMaskedQueryRegions mask
static const char location[]
Definition: config.c:97
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int i
int len
unsigned int a
Definition: ncbi_localip.c:102
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
void * BlastMemDup(const void *orig, size_t size)
Copies memory using memcpy and malloc.
Definition: ncbi_std.c:35
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
static char tmp[2048]
Definition: utf8.c:42
static Int4 s_FindHitsShortDNA(Int4 *hitArray, const Uint1 *seq, Int4 pos, Int4 len, const SPHIPatternSearchBlk *pattern_blk)
Find hits when sequence is DNA and pattern is short returns twice the number of hits.
Definition: pattern.c:154
static Int4 s_LenOfL(Int4 *s, Int4 *mask, Int4 numWords)
Returns the difference between the offset F of a first 1-bit in a word sequence and the first offset ...
Definition: pattern.c:286
static Int4 s_FindHitsVeryLong(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *pattern_blk)
Find matches when pattern is very long,.
Definition: pattern.c:368
static Int4 s_LenOf(Int4 s, Int4 mask)
Looks for 1 bits in the same position of s and mask Let R be the rightmost position where s and mask ...
Definition: pattern.c:93
Int4 PHIGetPatternOccurrences(const SPHIPatternSearchBlk *pattern_blk, const BLAST_SequenceBlk *query, const BlastSeqLoc *location, Boolean is_dna, BlastQueryInfo *query_info)
Finds all pattern hits in a given query and saves them in the previously allocated SPHIQueryInfo stru...
Definition: pattern.c:553
Int4 _PHIBlastFindHitsShort(Int4 *hitArray, const Uint1 *seq, Int4 len1, const SPHIPatternSearchBlk *pattern_blk)
Routine to find hits of pattern to sequence when sequence is proteins.
Definition: pattern.c:104
SPHIQueryInfo * SPHIQueryInfoFree(SPHIQueryInfo *pat_info)
Frees the pattern information structure.
Definition: pattern.c:496
Int4 FindPatternHits(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *pattern_blk)
Find the places where the pattern matches seq; 3 different methods are used depending on the length o...
Definition: pattern.c:468
void _PHIPatternWordsLeftShift(Int4 *a, Uint1 b, Int4 numWords)
Shift each word in the array left by 1 bit and add bit b.
Definition: pattern.c:236
static Int2 s_PHIBlastAddPatternHit(SPHIQueryInfo *pattern_info, Int4 offset, Int4 length)
Adds a new pattern hit to the PHI BLAST pseudo lookup table.
Definition: pattern.c:530
Int4 _PHIPatternWordsBitwiseAnd(Int4 *result, Int4 *a, Int4 *b, Int4 numWords)
Do a word-by-word bit-wise and of two integer arrays and put the result in a new array.
Definition: pattern.c:265
static Int4 s_FindHitsShortHead(Int4 *hitArray, const Uint1 *seq, Int4 start, Int4 len, Uint1 is_dna, const SPHIPatternSearchBlk *pattern_blk)
Top level routine to find hits when pattern has a short description.
Definition: pattern.c:227
void _PHIGetRightOneBits(Int4 s, Int4 mask, Int4 *rightOne, Int4 *rightMaskOnly)
Looks for 1 bits in the same position of s and mask Let R be the rightmost position where s and mask ...
Definition: pattern.c:61
static Int4 s_FindHitsLong(Int4 *hitArray, const Uint1 *seq, Int4 len1, const SPHIPatternSearchBlk *pattern_blk)
Finds places where pattern matches seq and returns them as pairs of positions in consecutive entries ...
Definition: pattern.c:315
void _PHIPatternWordsBitwiseOr(Int4 *a, Int4 *b, Int4 numWords)
Do a word-by-word bit-wise or of two integer arrays and put the result back in the first array.
Definition: pattern.c:257
SPHIQueryInfo * SPHIQueryInfoCopy(const SPHIQueryInfo *pat_info)
Copies the SPHIQueryInfo structure.
Definition: pattern.c:507
SPHIQueryInfo * SPHIQueryInfoNew()
Allocates the pattern occurrences structure.
Definition: pattern.c:478
Functions for finding pattern matches in sequence (PHI-BLAST).
@ eMultiWord
Does pattern consist of a multiple words?
Definition: pattern.h:75
@ eOneWord
Does pattern consist of a single word?
Definition: pattern.h:74
#define PHI_BITS_PACKED_PER_WORD
Number of bits packed in a word.
Definition: pattern.h:52
#define PHI_MAX_HIT
Maximal size of an array of pattern hits.
Definition: pattern.h:57
Auxiliary functions for finding pattern matches in sequence (PHI-BLAST), that are used in multiple so...
int offset
Definition: replacements.h:160
Structure to hold a sequence.
Definition: blast_def.h:242
The query related information.
struct SPHIQueryInfo * pattern_info
Counts of PHI BLAST pattern occurrences, used in PHI BLAST only.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Uint4 DNAprefixSLL[100][256]
Where prefix of DNA 4-mer matches pattern, for multiple-word patterns.
Definition: pattern.h:122
Uint4 DNAsuffixSLL[100][256]
Where suffix of DNA 4-mer matches pattern, for multiple-word patterns.
Definition: pattern.h:124
Uint4 * DNAwhichPrefixPosPtr
Prefix position array for DNA patterns.
Definition: pattern.h:83
Uint4 * DNAwhichSuffixPosPtr
Suffix position array for DNA patterns.
Definition: pattern.h:84
Auxiliary items needed for a PHI BLAST search with pattern that contains pieces longer than a word.
Definition: pattern.h:105
Int4 numPlacesInWord[100]
When pattern has more than 7 words, keep track of how many places of pattern in each word of the repr...
Definition: pattern.h:108
Int4 spacing[100]
Spaces until next word due to wildcard.
Definition: pattern.h:110
Int4 whichMostSpecific
Which word in an extra long pattern has the lowest probability of a match.
Definition: pattern.h:113
Auxiliary items needed for a PHI BLAST search with pattern containing multiple words.
Definition: pattern.h:131
Int4 match_maskL[100]
Bit mask representation of input pattern for long patterns.
Definition: pattern.h:134
SExtraLongPatternItems * extra_long_items
Additional items necessary if pattern contains pieces longer than a word.
Definition: pattern.h:147
SDNALongPatternItems * dna_items
Additional items necessary for a DNA pattern.
Definition: pattern.h:145
Int4 SLL[100][256]
For each letter in the alphabet and each word in the masked pattern representation,...
Definition: pattern.h:142
Int4 bitPatternByLetter[256][11]
Which positions can a character occur in for long patterns.
Definition: pattern.h:137
Int4 numWords
Number of words need to hold bit representation of pattern.
Definition: pattern.h:132
Information about a single pattern occurence in the query.
Definition: blast_def.h:292
Int4 length
Length of this pattern occurrence.
Definition: blast_def.h:294
Int4 offset
Starting offset of this pattern occurrence.
Definition: blast_def.h:293
Structure containing all auxiliary information needed in a pattern search.
Definition: pattern.h:155
SShortPatternItems * one_word_items
Items necessary when pattern fits in one word.
Definition: pattern.h:163
EPatternType flagPatternLength
Indicates if the whole pattern fits in 1 word, each of several parts of the pattern fit in a word,...
Definition: pattern.h:159
SLongPatternItems * multi_word_items
Additional items, when pattern requires multiple words.
Definition: pattern.h:166
In PHI BLAST, structure containing information about all pattern occurrences in query.
Definition: blast_def.h:300
char * pattern
Pattern used, saved here for formatting purposes.
Definition: blast_def.h:306
Int4 allocated_size
Allocated size of the occurrences array.
Definition: blast_def.h:304
Int4 num_patterns
Number of pattern occurrences in query.
Definition: blast_def.h:301
SPHIPatternInfo * occurrences
Array of pattern occurrence information structures.
Definition: blast_def.h:302
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
Auxiliary items needed for a PHI BLAST search with a pattern that fits in a single word.
Definition: pattern.h:94
Int4 * whichPositionPtr
Array of positions where pattern lettern should match, for a single word of the pattern.
Definition: pattern.h:97
SDNAShortPatternItems * dna_items
Additional items for a DNA search.
Definition: pattern.h:99
Int4 match_mask
Bit mask representation of input pattern for patterns that fit in a word.
Definition: pattern.h:95
static string query
else result
Definition: token2.c:20
voidp calloc(uInt items, uInt size)
Modified on Sat Dec 02 09:21:30 2023 by modify_doxy.py rev. 669887