NCBI C++ ToolKit
blast_util.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_util.h 94402 2021-08-02 13:25:26Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  *
27  */
28 
29 /** @file blast_util.h
30  * Various auxiliary BLAST utility functions
31  */
32 
33 #ifndef ALGO_BLAST_CORE__BLAST_UTIL__H
34 #define ALGO_BLAST_CORE__BLAST_UTIL__H
35 
41 
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45 
46 #ifndef IS_residue
47 /** Does character encode a residue? */
48 #define IS_residue(x) (x <= 250)
49 #endif
50 
51 /** Bit mask for obtaining a single base from a byte in ncbi2na format */
52 #define NCBI2NA_MASK 0x03
53 
54 /** Macro to extract base N from a byte x (N >= 0, N < 4); N counts 2-bit fields starting from the least-significant end of the byte */
55 #define NCBI2NA_UNPACK_BASE(x, N) (((x)>>(2*(N))) & NCBI2NA_MASK)
56 
57 
58 /** Deallocate memory only for the sequence in the sequence block */
61 
62 /** Deallocate memory for a sequence block */
65 
66 /** Copies contents of the source sequence block without copying sequence
67  * buffers; sets all "field_allocated" booleans to FALSE, to make sure
68  * fields are not freed on the call to BlastSequenceBlkFree.
69  * @param copy New sequence block [out]
70  * @param src Input sequence block [in]
71  */
74  BLAST_SequenceBlk* src);
75 
76 /** Set number for a given program type. Return is zero on success.
77  * @param program string name of program [in]
78  * @param number Enumerated value of program [out]
79 */
81 Int2 BlastProgram2Number(const char *program, EBlastProgramType *number);
82 
83 /** Return string name for program given a number. Return is zero on success.
84  * @param number Enumerated value of program [in]
85  * @param program string name of program (memory should be deallocated by caller) [out]
86 */
89 
90 /** Allocates memory for *seq_blk and then populates it.
91  * @param buffer start of sequence [in]
92  * @param length query sequence length [in]
93  * @param seq_blk SequenceBlk to be allocated and filled in [out]
94  * @param buffer_allocated Is the buffer allocated? If yes, 'sequence_start' is
95  * the start of the sequence, otherwise it is 'sequence'. [in]
96  * @deprecated Use BlastSeqBlkNew and BlastSeqBlkSet* functions instead
97 */
99 Int2
100 BlastSetUp_SeqBlkNew (const Uint1* buffer, Int4 length,
101  BLAST_SequenceBlk* *seq_blk, Boolean buffer_allocated);
102 
103 /** Allocates a new sequence block structure.
104  * @param retval Pointer to where the sequence block structure will be
105  * allocated [out]
106  */
109 
110 /** Stores the sequence in the sequence block structure.
111  * @param seq_blk The sequence block structure to modify [in/out]
112  * @param sequence Actual sequence buffer. The first byte must be a sentinel
113  * byte [in]
114  * @param seqlen Length of the sequence buffer above [in]
115  */
118  const Uint1* sequence,
119  Int4 seqlen);
120 
121 /** Stores the compressed nucleotide sequence in the sequence block structure
122  * for the subject sequence when BLASTing 2 sequences. This sequence should be
123  * encoded in eBlastEncodingNcbi2na and NOT have sentinel bytes (as this
124  * encoding doesn't allow them).
125  * @param seq_blk The sequence block structure to modify [in/out]
126  * @param sequence Actual sequence buffer. [in]
127  */
130  const Uint1* sequence);
131 
132 /** Sets the seq_range and related fields appropriately in the
133  * BLAST_SequenceBlk structure
134  * @param seq_blk The sequence block structure to modify [in/out]
135  * @param seq_ranges sequence ranges to copy [in]
136  * @param num_seq_ranges number of elements in array above [in]
137  * @param copy_seq_ranges set to TRUE if seq_ranges should be copied to the
138  * BLAST_SequenceBlk and assume its ownership, set to FALSE if the pointer
139  * should be copied and the ownership of the seq_ranges remains in the caller's
140  * possession. [in]
141  * @param mask_type either kSoftDBMask or kHardDBMask [in]
142  * @note this function will free the memory previously allocated to
143  * BLAST_SequenceBlk::seq_ranges (if applicable) and overwrite it with the
144  * seq_ranges argument. This might invalidate BLAST_SequenceBlk structures that
145  * were copied off of this one.
146  */
149  SSeqRange* seq_ranges,
150  Uint4 num_seq_ranges,
151  Boolean copy_seq_ranges,
152  ESubjectMaskingType mask_type);
153 
154 /** Adds a specialized representation of sequence data to a sequence
155  * block. In the specialized representation, the byte at offset i
156  * packs together nucleotide bases i to i+3
157  * @param seq_blk structure containing sequence data. Data is assumed
158  * to be in blastna format [in][out]
159  */
162 
163 
164 /** GetTranslation to get the translation of the nucleotide sequence in the
165  * appropriate frame and with the appropriate GeneticCode.
166  * The translation is written into the preallocated buffer supplied by the caller.
167  * The first and last positions of the translated sequence contain NULLB's.
168  * @param query_seq Forward strand of the nucleotide sequence [in]
169  * @param query_seq_rev Reverse strand of the nucleotide sequence [in]
170  * @param nt_length Length of the nucleotide sequence [in]
171  * @param frame What frame to translate into? [in]
172  * @param buffer Preallocated buffer for the translated sequence [in][out]
173  * @param genetic_code Genetic code to use for translation,
174  * in ncbistdaa encoding [in]
175  * @return Length of the translated protein sequence.
176 */
178 Int4 BLAST_GetTranslation(const Uint1* query_seq,
179  const Uint1* query_seq_rev, Int4 nt_length, Int2 frame, Uint1* buffer,
180  const Uint1* genetic_code);
181 
182 
183 
184 /** Translate a nucleotide sequence without ambiguity codes.
185  * This is used for the first-pass translation of the database.
186  * The genetic code to be used is determined by the translation_table
187  * This function translates a packed (ncbi2na) nucl. alphabet. It
188  * views a basepair as being in one of four sets of 2-bits:
189  * |0|1|2|3||0|1|2|3||0|1|2|3||...
190  *
191  * 1st byte | 2nd byte | 3rd byte...
192  *
193  * A codon that starts at the beginning of the above sequence starts in
194  * state "0" and includes basepairs 0, 1, and 2. The next codon, in the
195  * same frame, after that starts in state "3" and includes 3, 0, and 1.
196  *
197  *** Optimization:
198  * changed the single main loop to
199  * - advance to state 0,
200  * - optimized inner loop does two (3 byte->4 codon) translations per iteration
201  * (loads are moved earlier so they can be done in advance.)
202  * - do remainder
203  *
204  * @param translation The translation table [in]
205  * @param length Length of the nucleotide sequence [in]
206  * @param nt_seq The original nucleotide sequence [in]
207  * @param frame What frame to translate to? [in]
208  * @param prot_seq Preallocated buffer for the (translated) protein sequence,
209  * with NULLB sentinels on either end. [out]
210 */
212 Int4 BLAST_TranslateCompressedSequence(Uint1* translation, Int4 length,
213  const Uint1* nt_seq, Int2 frame, Uint1* prot_seq);
214 
215 /** Reverse a nucleotide sequence in the blastna encoding, adding sentinel
216  * bytes on both ends.
217  * @param sequence Forward strand of the sequence [in]
218  * @param length Length of the sequence plus 1 for the sentinel byte [in]
219  * @param rev_sequence_ptr Reverse strand of the sequence [out]
220  */
222 Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length,
223  Uint1** rev_sequence_ptr);
224 
225 /** This function translates the context number of a context into the frame of
226  * the sequence.
227  * @param prog_number Integer corresponding to the BLAST program [in]
228  * @param context_number Context number [in]
229  * @return Sequence frame: -1,1 for nucleotides, -3,-2,-1,1,2,3 for
230  * translations, 0 for proteins and INT1_MAX in case of unsupported program
231 */
233 Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number);
234 
235 /** Convert a sequence in ncbi4na or blastna encoding into a packed sequence
236  * in ncbi2na encoding. Needed for 2 sequences BLASTn comparison.
237  * @param buffer original sequence data (one base per byte) [in]
238  * @param length length of the sequence data above [in]
239  * @param encoding source encoding of the sequence data above [in]
240  * @param packed_seq output buffer containing compressed sequence. Its length
241  * will be (length/COMPRESSION_RATIO + 1), caller is responsible for
242  * deallocating it [out]
243  * @return 0 in case of success, -1 in case of memory allocation failure
244  */
246 Int2 BLAST_PackDNA(const Uint1* buffer, Int4 length,
247  EBlastEncoding encoding, Uint1** packed_seq);
248 
249 /**
250  * @brief Calculates the length of frame for a translated protein
251  *
252  * @param nucleotide_length Length of the nucleotide sequence translated [in]
253  * @param context Index of the translated frame (values: 0 to 5, inclusive)
254  * [in]
255  *
256  * @return The requested length, or 0 if the nucleotide length is 0
257  */
259 size_t
260 BLAST_GetTranslatedProteinLength(size_t nucleotide_length,
261  unsigned int context);
262 
263 /** Initialize the mixed-frame sequence for out-of-frame gapped extension.
264  * @param query_blk Sequence block containing the concatenated frames of the
265  * query. The mixed-frame sequence is saved here. [in] [out]
266  * @param query_info Query information structure containing offsets into the
267  * concatenated sequence. [in]
268  */
271  const BlastQueryInfo* query_info);
272 
273 /** Translate nucleotide into 6 frames. All frames are put into a
274  * translation buffer, with sentinel NULLB bytes in between.
275  * Array of offsets into the translation buffer is also returned.
276  * For out-of-frame gapping option, a mixed frame sequence is created.
277  * @param nucl_seq The nucleotide sequence [in]
278  * @param encoding Sequence encoding: ncbi2na or ncbi4na [in]
279  * @param nucl_length Length of the nucleotide sequence [in]
280  * @param genetic_code The genetic code to be used for translations,
281  * in ncbistdaa encoding [in]
282  * @param translation_buffer_ptr Buffer to hold the frames of the translated
283  * sequence. [out]
284  * @param frame_offsets_ptr Offsets into the translation buffer for each
285  * frame. [out]
286  * @param mixed_seq_ptr Pointer to buffer for the mixed frame sequence [out]
287  */
289 Int2 BLAST_GetAllTranslations(const Uint1* nucl_seq, EBlastEncoding encoding,
290  Int4 nucl_length, const Uint1* genetic_code,
291  Uint1** translation_buffer_ptr, Uint4** frame_offsets_ptr,
292  Uint1** mixed_seq_ptr);
293 
294 /** Get one frame translation - needed when only parts of subject sequences
295  * are translated.
296  * @param nucl_seq Pointer to start of nucleotide sequence to be translated [in]
297  * @param nucl_length Length of nucleotide sequence to be translated [in]
298  * @param frame What frame to translate into [in]
299  * @param genetic_code What genetic code to use? [in]
300  * @param translation_buffer_ptr Pointer to buffer with translated
301  * sequence [out]
302  * @param protein_length Length of the translation buffer [out]
303  * @param mixed_seq_ptr Pointer to buffer with mixed frame sequence, in case
304  * of out-of-frame gapping; buffer filled only if argument
305  * not NULL. [out]
306  */
308 int Blast_GetPartialTranslation(const Uint1* nucl_seq,
309  Int4 nucl_length, Int2 frame, const Uint1* genetic_code,
310  Uint1** translation_buffer_ptr, Int4* protein_length,
311  Uint1** mixed_seq_ptr);
312 
313 
314 /** Convert translation frame or strand into a context number suitable for
315  * indexing into the BlastQueryInfo::contexts array
316  * @param frame Frame (allowed values: 1,2,3,-1,-2,-3, 0) [in]
317  * @param program Type of BLAST program [in]
318  * @return context number: 0 or 1 for nucleotide query/subjects,
319  * a value between 0 and 5 (inclusive) for translated query/subjects, and 0 for
320  * protein query/subjects.
321  */
324 
325 
326 /** The following binary search routine assumes that array A is filled. */
329 
330 /** Get the standard amino acid probabilities. This is basically a wrapper for
331  * BlastScoreBlkNew() and Blast_ResFreqStdComp() from blast_stat.c with a more
332  * intention-revealing name :)
333  * Caller is responsible for deallocating return value via sfree().
334  * @return NULL if there is not enough memory otherwise an array of length
335  * BLASTAA_SIZE, where each index corresponds to an amino acid as specified in
336  * the NCBIstdaa encoding.
337  */
339 double*
341 
342 /** Returns a copy of the input string with all its characters turned to
343  * uppercase. Useful for saving score matrix names. Caller is responsible for
344  * deallocating return value.
345  * @param string string to copy [in]
346  * @return newly allocated string in upper case or NULL if string is NULL or
347  * out of memory
348  */
350 char*
351 BLAST_StrToUpper(const char* string);
352 
353 /** Maximal unpacked subject sequence length for which full translation is
354  * performed up front.
355  */
356 #define MAX_FULL_TRANSLATION 2100
357 
358 /** This sentry value is used as a 'fence' around the valid portions
359  * of partially decoded sequences. If an alignment finds this value
360  * in a subject sequence, the fence_hit flag should be used to request
361  * a refetch of the whole sequence, and the alignment restarted.
362  * @note this value is repeated in seqdbgeneral.hpp
363  */
364 #define FENCE_SENTRY 201
365 
366 /** Get the number of contexts for a given program. This corresponds to the
367  * number of translation frames or strands whenever applicable.
368  * @return 0 on unsupported program, non-zero otherwise
369  */
371 unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program);
372 
373 /** Free SBlastTargetTranslation
374  * @param target_t object to be freed [in]
375  */
379 
380 /** Sets up structure for target translation.
381  * @param subject_blk Target sequence information [in]
382  * @param gen_code_string Genetic code translation information [in]
383  * @param program_number BLAST program [in]
384  * @param is_ooframe Out-of-frame translation if true [in]
385  * @param target Structure being set up. [out]
386  */
388 Int2
390  const Uint1* gen_code_string,
391  EBlastProgramType program_number,
392  Boolean is_ooframe,
393  SBlastTargetTranslation** target);
394 #ifdef __cplusplus
395 }
396 #endif
397 #endif /* !ALGO_BLAST_CORE__BLAST_UTIL__H */
Definitions used throughout BLAST.
ESubjectMaskingType
Define the possible subject masking types.
Definition: blast_def.h:235
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
Definitions for various programs supported by core BLAST.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
Definitions and functions associated with the BlastQueryInfo structure.
Int4 BLAST_FrameToContext(Int2 frame, EBlastProgramType program)
Convert translation frame or strand into a context number suitable for indexing into the BlastQueryIn...
Definition: blast_util.c:1211
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
Int2 BlastSeqBlkSetSeqRanges(BLAST_SequenceBlk *seq_blk, SSeqRange *seq_ranges, Uint4 num_seq_ranges, Boolean copy_seq_ranges, ESubjectMaskingType mask_type)
Sets the seq_range and related fields appropriately in the BLAST_SequenceBlk structure.
Definition: blast_util.c:182
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Definition: blast_util.c:147
size_t BLAST_GetTranslatedProteinLength(size_t nucleotide_length, unsigned int context)
Calculates the length of frame for a translated protein.
Definition: blast_util.c:923
Int2 BLAST_CreateMixedFrameDNATranslation(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info)
Initialize the mixed-frame sequence for out-of-frame gapped extension.
Definition: blast_util.c:931
Int2 BlastNumber2Program(EBlastProgramType number, char **program)
Return string name for program given a number.
Definition: blast_util.c:312
Int2 BlastSeqBlkSetCompressedSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence)
Stores the compressed nucleotide sequence in the sequence block structure for the subject sequence wh...
Definition: blast_util.c:167
int Blast_GetPartialTranslation(const Uint1 *nucl_seq, Int4 nucl_length, Int2 frame, const Uint1 *genetic_code, Uint1 **translation_buffer_ptr, Int4 *protein_length, Uint1 **mixed_seq_ptr)
Get one frame translation - needed when only parts of subject sequences are translated.
Definition: blast_util.c:1141
Int2 BlastTargetTranslationNew(BLAST_SequenceBlk *subject_blk, const Uint1 *gen_code_string, EBlastProgramType program_number, Boolean is_ooframe, SBlastTargetTranslation **target)
Sets up structure for target translation.
Definition: blast_util.c:1268
Int2 BlastSetUp_SeqBlkNew(const Uint1 *buffer, Int4 length, BLAST_SequenceBlk **seq_blk, Boolean buffer_allocated)
Allocates memory for *seq_blk and then populates it.
Definition: blast_util.c:101
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:839
Int2 BLAST_PackDNA(const Uint1 *buffer, Int4 length, EBlastEncoding encoding, Uint1 **packed_seq)
Convert a sequence in ncbi4na or blastna encoding into a packed sequence in ncbi2na encoding.
Definition: blast_util.c:870
SBlastTargetTranslation * BlastTargetTranslationFree(SBlastTargetTranslation *target_t)
Free SBlastTargetTranslation.
Definition: blast_util.c:1248
Int2 BlastCompressBlastnaSequence(BLAST_SequenceBlk *seq_blk)
Adds a specialized representation of sequence data to a sequence block.
Definition: blast_util.c:459
void BlastSequenceBlkClean(BLAST_SequenceBlk *seq_blk)
Deallocate memory only for the sequence in the sequence block.
Definition: blast_util.c:220
Int2 BLAST_GetAllTranslations(const Uint1 *nucl_seq, EBlastEncoding encoding, Int4 nucl_length, const Uint1 *genetic_code, Uint1 **translation_buffer_ptr, Uint4 **frame_offsets_ptr, Uint1 **mixed_seq_ptr)
Translate nucleotide into 6 frames.
Definition: blast_util.c:1045
Int4 BLAST_GetTranslation(const Uint1 *query_seq, const Uint1 *query_seq_rev, Int4 nt_length, Int2 frame, Uint1 *buffer, const Uint1 *genetic_code)
GetTranslation to get the translation of the nucleotide sequence in the appropriate frame and with the appropriate GeneticCode.
Definition: blast_util.c:428
Int2 BlastProgram2Number(const char *program, EBlastProgramType *number)
Set number for a given program type.
Definition: blast_util.c:278
Int2 GetReverseNuclSequence(const Uint1 *sequence, Int4 length, Uint1 **rev_sequence_ptr)
Reverse a nucleotide sequence in the blastna encoding, adding sentinel bytes on both ends.
Definition: blast_util.c:807
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Definition: blast_util.c:133
unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program)
Get the number of contexts for a given program.
Definition: blast_util.c:1373
Int4 BSearchInt4(Int4 n, Int4 *A, Int4 size)
The following binary search routine assumes that array A is filled.
Definition: blast_util.c:1231
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
Definition: blast_util.c:1323
char * BLAST_StrToUpper(const char *string)
Returns a copy of the input string with all its characters turned to uppercase.
Definition: blast_util.c:1352
void BlastSequenceBlkCopy(BLAST_SequenceBlk **copy, BLAST_SequenceBlk *src)
Copies contents of the source sequence block without copying sequence buffers; sets all "field_alloca...
Definition: blast_util.c:259
Int4 BLAST_TranslateCompressedSequence(Uint1 *translation, Int4 length, const Uint1 *nt_seq, Int2 frame, Uint1 *prot_seq)
Translate a nucleotide sequence without ambiguity codes.
Definition: blast_util.c:508
#define A(i)
Definition: ecp_curves.c:948
EBlastEncoding
Different types of sequence encodings for sequence retrieval from the BLAST database.
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int8_t Int1
1-byte (8-bit) signed integer
Definition: ncbitype.h:98
yy_size_t n
const struct ncbi::grid::netcache::search::fields::SIZE size
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static BOOL number
Definition: pcregrep.c:193
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Structure to hold a sequence.
Definition: blast_def.h:242
The query related information.
Information about target translations.
Definition: blast_def.h:311
A structure containing two integers, used e.g.
Definition: blast_def.h:155
Modified on Mon Feb 26 04:01:32 2024 by modify_doxy.py rev. 669887