NCBI C++ ToolKit
pattern.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: pattern.h 37548 2008-04-15 15:27:44Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  */
29 
30 /** @file pattern.h
31  * Functions for finding pattern matches in sequence (PHI-BLAST).
32  * @todo FIXME The structures used for finding pattern contain a number of
33  * arrays with static fixed sizes. It remains to be determined
34  * whether some of these arrays may be dynamically allocated
35  * instead.
36  */
37 
38 #ifndef ALGO_BLAST_CORE__PATTERN_H
39 #define ALGO_BLAST_CORE__PATTERN_H
40 
45 
46 #ifdef __cplusplus
47 extern "C" {
48 #endif
49 
50 #define PHI_BUF_SIZE 100 /**< Default size for buffers */
51 #define PHI_ASCII_SIZE 256 /**< Size of ASCII alphabet */
52 #define PHI_BITS_PACKED_PER_WORD 30 /**< Number of bits packed in a word */
53 #define PHI_MAX_WORD_SIZE 11 /**< Maximal word size */
54 /** Threshold pattern length. */
55 #define PHI_MAX_PATTERN_LENGTH (PHI_BITS_PACKED_PER_WORD * PHI_MAX_WORD_SIZE)
56 #define PHI_MAX_WORDS_IN_PATTERN 100 /**< Maximal number of words in pattern */
57 #define PHI_MAX_HIT 20000 /**< Maximal size of an array of pattern hits.
58  @todo FIXME This limit can be avoided by using
59  dynamically allocated arrays! */
60 
61 /** Options for running the pattern search. */
62 typedef enum EPatternProgram {
63  eSeed = 1,/**< Use only those query occurrences that are specified in the input
64  pattern file. @todo Not implemented. */
65  ePattern, /**< Only find pattern occurrences in database, but do not perform
66  alignments. @todo Not implemented. */
67  ePatSeed, /**< Search a BLAST database using pattern occurrences as seeds */
68  ePatMatch /**< Only find pattern occurrences in query, but do not search the
69  database. @todo Not implemented. */
71 
72 /** Type of pattern: fits in single word, several words, or is very long. */
73 typedef enum EPatternType {
74  eOneWord = 0, /**< Does pattern consist of a single word? */
75  eMultiWord, /**< Does pattern consist of a multiple words? */
76  eVeryLong /**< Is pattern too long for a simple multi-word processing? */
78 
79 /** Structure containing auxiliary items needed for a DNA search with a pattern
80  * that fits in a single word.
81  */
82 typedef struct SDNAShortPatternItems {
83  Uint4 *DNAwhichPrefixPosPtr; /**< Prefix position array for DNA patterns */
84  Uint4 *DNAwhichSuffixPosPtr; /**< Suffix position array for DNA patterns*/
85  /** Where prefix of DNA 4-mer matches pattern */
87  /** Where suffix of DNA 4-mer matches pattern */
90 
91 /** Auxiliary items needed for a PHI BLAST search with a pattern that fits in a
92  * single word.
93  */
94 typedef struct SShortPatternItems {
95  Int4 match_mask;/**< Bit mask representation of input pattern
96  for patterns that fit in a word*/
97  Int4 *whichPositionPtr; /**< Array of positions where pattern lettern should
98  match, for a single word of the pattern. */
99  SDNAShortPatternItems* dna_items; /**< Additional items for a DNA search. */
101 
102 /** Auxiliary items needed for a PHI BLAST search with pattern that contains
103  * pieces longer than a word.
104  */
105 typedef struct SExtraLongPatternItems {
106  /** When pattern has more than 7 words, keep track of how many places of
107  pattern in each word of the representation; */
109  /** Spaces until next word due to wildcard*/
111  Int4 highestPlace; /**< Number of places in pattern representation
112  as computed in input_pattern */
113  Int4 whichMostSpecific; /**< Which word in an extra long pattern
114  has the lowest probability of a match*/
116 
117 /** Auxiliary items needed for a DNA pattern search with pattern containing
118  * multiple words.
119  */
120 typedef struct SDNALongPatternItems {
121  /** Where prefix of DNA 4-mer matches pattern, for multiple-word patterns */
123  /** Where suffix of DNA 4-mer matches pattern, for multiple-word patterns */
126 
127 
128 /** Auxiliary items needed for a PHI BLAST search with pattern containing
129  * multiple words.
130  */
131 typedef struct SLongPatternItems {
132  Int4 numWords; /**< Number of words need to hold bit representation
133  of pattern*/
134  Int4 match_maskL[PHI_BUF_SIZE]; /**< Bit mask representation of input pattern
135  for long patterns*/
136  /** Which positions can a character occur in for long patterns*/
138  /** For each letter in the alphabet and each word in the masked
139  pattern representation, holds a bit pattern saying for which
140  positions the letter will match. Similar to whichPositionsByCharacter
141  for many-word patterns. */
143  Int4 inputPatternMasked[PHI_MAX_PATTERN_LENGTH]; /**< Masked input pattern */
144 
145  SDNALongPatternItems* dna_items; /**< Additional items necessary for a DNA
146  pattern. */
147  SExtraLongPatternItems* extra_long_items; /**< Additional items necessary if
148  pattern contains pieces longer
149  than a word. */
151 
152 /** Structure containing all auxiliary information needed in a pattern
153  * search.
154  */
155 typedef struct SPHIPatternSearchBlk {
156  /** Indicates if the whole pattern fits in 1 word, each of several parts of
157  the pattern fit in a word, or some parts of the pattern are too long to
158  fit in a word. */
160  double patternProbability; /**< Probability of this letter combination */
161  Int4 minPatternMatchLength; /**< Minimum length of string to match this
162  pattern*/
163  SShortPatternItems* one_word_items; /**< Items necessary when pattern fits
164  in one word. */
165 
166  SLongPatternItems* multi_word_items; /**< Additional items, when pattern
167  requires multiple words. */
168  Int4 num_patterns_db; /**< Number of patterns actually found during the
169  database search. */
170  char* pattern; /**< Pattern used, saved here for error reporting. */
172 
173 
174 /** Find the places where the pattern matches seq;
175  * 3 different methods are used depending on the length of the pattern.
176  * @param hitArray Stores the results as pairs of positions in consecutive
177  * entries [out]
178  * @param seq Sequence [in]
179  * @param len Length of the sequence [in]
180  * @param is_dna Indicates whether seq is made of DNA or protein letters [in]
181  * @param patternSearch Pattern information [in]
182  * @return Twice the number of hits (length of hitArray filled in)
183 */
185 Int4 FindPatternHits(Int4 * hitArray, const Uint1 * seq, Int4 len,
186  Boolean is_dna,
187  const SPHIPatternSearchBlk * patternSearch);
188 
189 /** Allocates the pattern occurrences structure. */
192 
193 /** Frees the pattern information structure.
194  * @param pat_info Structure to free. [in]
195  * @return NULL.
196  */
200 
201 /** Copies the SPHIQueryInfo structure.
202  * @param pat_info Structure to copy [in]
203  * @return New structure.
204  */
207 SPHIQueryInfoCopy(const SPHIQueryInfo* pat_info);
208 
209 /** Finds all pattern hits in a given query and saves them in the
210  * previously allocated SPHIQueryInfo structure.
211  * @param pattern_blk Structure containing pattern structure. [in]
212  * @param query Query sequence(s) [in]
213  * @param location Segments in the query sequence where to look for
214  * pattern [in]
215  * @param is_dna Is this a nucleotide sequence? [in]
216  * @param query_info Used to store pattern occurrences and get length
217  * of query (for error checking) [out]
218  * @return a negative number is an unknown error, INT4_MAX indicates the
219  * pattern (illegally) covered the entire query, other non-negative numbers
220  * indicate the nubmer of pattern occurrences found.
221  */
224  const BLAST_SequenceBlk * query,
225  const BlastSeqLoc * location,
226  Boolean is_dna,
227  BlastQueryInfo* query_info);
228 
229 
230 #ifdef __cplusplus
231 }
232 #endif
233 
234 #endif /* !ALGO_BLAST_CORE__PATTERN_H */
Definitions used throughout BLAST.
Defines to provide correct exporting from BLAST DLL in Windows.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
Definitions and functions associated with the BlastQueryInfo structure.
static const char location[]
Definition: config.c:97
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int len
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
EPatternProgram
Options for running the pattern search.
Definition: pattern.h:62
@ eSeed
Use only those query occurrences that are specified in the input pattern file.
Definition: pattern.h:63
@ ePatMatch
Only find pattern occurrences in query, but do not search the database.
Definition: pattern.h:68
@ ePattern
Only find pattern occurrences in database, but do not perform alignments.
Definition: pattern.h:65
@ ePatSeed
Search a BLAST database using pattern occurrences as seeds.
Definition: pattern.h:67
EPatternType
Type of pattern: fits in single word, several words, or is very long.
Definition: pattern.h:73
@ eVeryLong
Is pattern too long for a simple multi-word processing?
Definition: pattern.h:76
@ eMultiWord
Does pattern consist of a multiple words?
Definition: pattern.h:75
@ eOneWord
Does pattern consist of a single word?
Definition: pattern.h:74
#define PHI_BUF_SIZE
Default size for buffers.
Definition: pattern.h:50
Int4 PHIGetPatternOccurrences(const SPHIPatternSearchBlk *pattern_blk, const BLAST_SequenceBlk *query, const BlastSeqLoc *location, Boolean is_dna, BlastQueryInfo *query_info)
Finds all pattern hits in a given query and saves them in the previously allocated SPHIQueryInfo stru...
Definition: pattern.c:553
SPHIQueryInfo * SPHIQueryInfoFree(SPHIQueryInfo *pat_info)
Frees the pattern information structure.
Definition: pattern.c:496
struct SShortPatternItems SShortPatternItems
Auxiliary items needed for a PHI BLAST search with a pattern that fits in a single word.
#define PHI_MAX_WORDS_IN_PATTERN
Maximal number of words in pattern.
Definition: pattern.h:56
#define PHI_ASCII_SIZE
Size of ASCII alphabet.
Definition: pattern.h:51
struct SDNALongPatternItems SDNALongPatternItems
Auxiliary items needed for a DNA pattern search with pattern containing multiple words.
SPHIQueryInfo * SPHIQueryInfoNew(void)
Allocates the pattern occurrences structure.
Definition: pattern.c:478
#define PHI_MAX_WORD_SIZE
Maximal word size.
Definition: pattern.h:53
Int4 FindPatternHits(Int4 *hitArray, const Uint1 *seq, Int4 len, Boolean is_dna, const SPHIPatternSearchBlk *patternSearch)
Find the places where the pattern matches seq; 3 different methods are used depending on the length o...
Definition: pattern.c:468
struct SPHIPatternSearchBlk SPHIPatternSearchBlk
Structure containing all auxiliary information needed in a pattern search.
#define PHI_MAX_PATTERN_LENGTH
Threshold pattern length.
Definition: pattern.h:55
SPHIQueryInfo * SPHIQueryInfoCopy(const SPHIQueryInfo *pat_info)
Copies the SPHIQueryInfo structure.
Definition: pattern.c:507
struct SLongPatternItems SLongPatternItems
Auxiliary items needed for a PHI BLAST search with pattern containing multiple words.
struct SExtraLongPatternItems SExtraLongPatternItems
Auxiliary items needed for a PHI BLAST search with pattern that contains pieces longer than a word.
struct SDNAShortPatternItems SDNAShortPatternItems
Structure containing auxiliary items needed for a DNA search with a pattern that fits in a single wor...
Structure to hold a sequence.
Definition: blast_def.h:242
The query related information.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Auxiliary items needed for a DNA pattern search with pattern containing multiple words.
Definition: pattern.h:120
Uint4 DNAprefixSLL[100][256]
Where prefix of DNA 4-mer matches pattern, for multiple-word patterns.
Definition: pattern.h:122
Uint4 DNAsuffixSLL[100][256]
Where suffix of DNA 4-mer matches pattern, for multiple-word patterns.
Definition: pattern.h:124
Structure containing auxiliary items needed for a DNA search with a pattern that fits in a single wor...
Definition: pattern.h:82
Uint4 * DNAwhichPrefixPosPtr
Prefix position array for DNA patterns.
Definition: pattern.h:83
Uint4 DNAwhichSuffixPositions[256]
Where suffix of DNA 4-mer matches pattern.
Definition: pattern.h:88
Uint4 * DNAwhichSuffixPosPtr
Suffix position array for DNA patterns.
Definition: pattern.h:84
Uint4 DNAwhichPrefixPositions[256]
Where prefix of DNA 4-mer matches pattern.
Definition: pattern.h:86
Auxiliary items needed for a PHI BLAST search with pattern that contains pieces longer than a word.
Definition: pattern.h:105
Int4 numPlacesInWord[100]
When pattern has more than 7 words, keep track of how many places of pattern in each word of the repr...
Definition: pattern.h:108
Int4 highestPlace
Number of places in pattern representation as computed in input_pattern.
Definition: pattern.h:111
Int4 spacing[100]
Spaces until next word due to wildcard.
Definition: pattern.h:110
Int4 whichMostSpecific
Which word in an extra long pattern has the lowest probability of a match.
Definition: pattern.h:113
Auxiliary items needed for a PHI BLAST search with pattern containing multiple words.
Definition: pattern.h:131
Int4 match_maskL[100]
Bit mask representation of input pattern for long patterns.
Definition: pattern.h:134
SExtraLongPatternItems * extra_long_items
Additional items necessary if pattern contains pieces longer than a word.
Definition: pattern.h:147
SDNALongPatternItems * dna_items
Additional items necessary for a DNA pattern.
Definition: pattern.h:145
Int4 SLL[100][256]
For each letter in the alphabet and each word in the masked pattern representation,...
Definition: pattern.h:142
Int4 inputPatternMasked[(30 *11)]
Masked input pattern.
Definition: pattern.h:143
Int4 bitPatternByLetter[256][11]
Which positions can a character occur in for long patterns.
Definition: pattern.h:137
Int4 numWords
Number of words need to hold bit representation of pattern.
Definition: pattern.h:132
Structure containing all auxiliary information needed in a pattern search.
Definition: pattern.h:155
SShortPatternItems * one_word_items
Items necessary when pattern fits in one word.
Definition: pattern.h:163
EPatternType flagPatternLength
Indicates if the whole pattern fits in 1 word, each of several parts of the pattern fit in a word,...
Definition: pattern.h:159
double patternProbability
Probability of this letter combination.
Definition: pattern.h:160
Int4 minPatternMatchLength
Minimum length of string to match this pattern.
Definition: pattern.h:161
char * pattern
Pattern used, saved here for error reporting.
Definition: pattern.h:170
Int4 num_patterns_db
Number of patterns actually found during the database search.
Definition: pattern.h:168
SLongPatternItems * multi_word_items
Additional items, when pattern requires multiple words.
Definition: pattern.h:166
In PHI BLAST, structure containing information about all pattern occurrences in query.
Definition: blast_def.h:300
Auxiliary items needed for a PHI BLAST search with a pattern that fits in a single word.
Definition: pattern.h:94
Int4 * whichPositionPtr
Array of positions where pattern lettern should match, for a single word of the pattern.
Definition: pattern.h:97
SDNAShortPatternItems * dna_items
Additional items for a DNA search.
Definition: pattern.h:99
Int4 match_mask
Bit mask representation of input pattern for patterns that fit in a word.
Definition: pattern.h:95
static string query
Modified on Sun Jul 21 04:19:05 2024 by modify_doxy.py rev. 669887