NCBI C++ ToolKit
blast_util.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_util.c 100164 2023-06-28 13:36:01Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  */
29 
30 /** @file blast_util.c
31  * Various BLAST utilities
32  */
33 
34 
38 
/** Release the buffer that *x points to and reset the caller's pointer.
 * @param x Address of the pointer to release. A NULL address is ignored;
 *          *x itself may be NULL, since free(NULL) is a no-op. On return
 *          *x is NULL so the stale pointer cannot be reused or freed
 *          twice [in|out]
 */
void
__sfree(void **x)
{
    if (x == NULL) {
        return;
    }
    free(*x);
    *x = NULL;
}
46 
48 {
49  SSeqRange retval;
50  retval.left = start;
51  retval.right = stop;
52  return retval;
53 }
54 
55 Int4
57  target)
58 {
59  Int4 retval = -1; /* assume failure */
60  Int4 m = 0, b = 0, e = 0;
61 
62  if (ranges == NULL || num_ranges <= 0) {
63  return retval;
64  }
65 
66  b = 0;
67  e = num_ranges;
68  while (b < e - 1) {
69  m = (b + e) / 2;
70  if (ranges[m].left > target) {
71  e = m;
72  } else {
73  b = m;
74  }
75  }
76  /* if the target isn't in the range at index b and there is still more
77  * data, return the next element */
78  if ( (target > ranges[b].right) && (b < (num_ranges-1) ) ) {
79  return b + 1;
80  } else {
81  return b;
82  }
83 }
84 
85 /** Auxiliary function to free the BLAST_SequenceBlk::seq_ranges field if
86  * applicable
87  * @param seq_blk The sequence block structure to manipulate [in|out]
88  */
89 static void
91 {
92  ASSERT(seq_blk);
93  if (seq_blk->seq_ranges_allocated) {
94  sfree(seq_blk->seq_ranges);
95  seq_blk->num_seq_ranges = 0;
96  seq_blk->seq_ranges_allocated = FALSE;
97  }
98 }
99 
100 Int2
102  BLAST_SequenceBlk* *seq_blk, Boolean buffer_allocated)
103 {
104  /* Check if BLAST_SequenceBlk itself needs to be allocated here or not */
105  if (*seq_blk == NULL) {
106  if (BlastSeqBlkNew(seq_blk) != 0) {
107  return -1;
108  }
109  }
110  ASSERT(seq_blk && *seq_blk);
111 
112  if (buffer_allocated) {
113  (*seq_blk)->sequence_start_allocated = TRUE;
114  (*seq_blk)->sequence_start = (Uint1 *) buffer;
115  /* The first byte is a sentinel byte. */
116  (*seq_blk)->sequence = (*seq_blk)->sequence_start+1;
117 
118  } else {
119  (*seq_blk)->sequence = (Uint1 *) buffer;
120  (*seq_blk)->sequence_start = NULL;
121  }
122 
123  (*seq_blk)->sequence_start_nomask = (*seq_blk)->sequence_start;
124  (*seq_blk)->sequence_nomask = (*seq_blk)->sequence;
125  (*seq_blk)->nomask_allocated = FALSE;
126 
127  (*seq_blk)->length = length;
128  (*seq_blk)->bases_offset = 0;
129 
130  return 0;
131 }
132 
134 {
135  if ( !retval ) {
136  return -1;
137  } else {
138  *retval = (BLAST_SequenceBlk*) calloc(1, sizeof(BLAST_SequenceBlk));
139  if ( !*retval ) {
140  return -1;
141  }
142  }
143 
144  return 0;
145 }
146 
148  const Uint1* sequence,
149  Int4 seqlen)
150 {
151  if ( !seq_blk ) {
152  return -1;
153  }
154 
155  seq_blk->sequence_start_allocated = TRUE;
156  seq_blk->sequence_start = (Uint1*) sequence;
157  seq_blk->sequence = (Uint1*) sequence + 1;
158  seq_blk->sequence_start_nomask = seq_blk->sequence_start;
159  seq_blk->sequence_nomask = seq_blk->sequence_start_nomask + 1;
160  seq_blk->nomask_allocated = FALSE;
161  seq_blk->length = seqlen;
162  seq_blk->oof_sequence = NULL;
163 
164  return 0;
165 }
166 
168  const Uint1* sequence)
169 {
170  if ( !seq_blk ) {
171  return -1;
172  }
173 
174  seq_blk->sequence_allocated = TRUE;
175  seq_blk->sequence = (Uint1*) sequence;
176  seq_blk->oof_sequence = NULL;
177 
178  return 0;
179 }
180 
181 Int2
183  SSeqRange* seq_ranges,
184  Uint4 num_seq_ranges,
185  Boolean copy_seq_ranges,
186  ESubjectMaskingType mask_type)
187 {
188  SSeqRange* tmp;
189 
190  if ( !seq_blk || !seq_ranges ) {
191  return -1;
192  }
193 
194  ASSERT(num_seq_ranges >= 1);
195 
197  if (copy_seq_ranges) {
198  // allocate one more space for easy complimentary operations
199  seq_blk->seq_ranges_allocated = TRUE;
200  tmp = (SSeqRange *) calloc(num_seq_ranges, sizeof(SSeqRange));
201  if ( !tmp ) { return -1; }
202  memcpy((void*) tmp,
203  (void*) seq_ranges,
204  num_seq_ranges * sizeof(*seq_ranges));
205  } else {
206  // CSeqDB has allocated one more space before and after seq_range
207  seq_blk->seq_ranges_allocated = FALSE;
208  tmp = seq_ranges;
209  }
210 
211  // Fill out the boundary of the sequence to compliment the masks
212  tmp[0].left = 0;
213  tmp[num_seq_ranges - 1].right = seq_blk->length;
214  seq_blk->seq_ranges = tmp;
215  seq_blk->num_seq_ranges = num_seq_ranges;
216  seq_blk->mask_type = mask_type;
217  return 0;
218 }
219 
221 {
222  if (!seq_blk)
223  return;
224 
225  if (seq_blk->sequence_allocated) {
226  sfree(seq_blk->sequence);
227  seq_blk->sequence_allocated = FALSE;
228  }
229  if (seq_blk->sequence_start_allocated) {
230  sfree(seq_blk->sequence_start);
231  seq_blk->sequence_start_allocated = FALSE;
232  }
233  if (seq_blk->oof_sequence_allocated) {
234  sfree(seq_blk->oof_sequence);
235  seq_blk->oof_sequence_allocated = FALSE;
236  }
237  if (seq_blk->nomask_allocated) {
238  sfree(seq_blk->sequence_start_nomask);
239  seq_blk->nomask_allocated = FALSE;
240  }
242  return;
243 }
244 
246 {
247  if (!seq_blk)
248  return NULL;
249 
250  BlastSequenceBlkClean(seq_blk);
251  if (seq_blk->lcase_mask_allocated)
252  BlastMaskLocFree(seq_blk->lcase_mask);
253  if (seq_blk->compressed_nuc_seq_start)
255  sfree(seq_blk);
256  return NULL;
257 }
258 
260  BLAST_SequenceBlk* src)
261 {
262  ASSERT(copy);
263  ASSERT(src);
264 
265  if (*copy) {
266  memcpy(*copy, src, sizeof(BLAST_SequenceBlk));
267  } else {
268  *copy = BlastMemDup(src, sizeof(BLAST_SequenceBlk));
269  }
270 
271  (*copy)->sequence_allocated = FALSE;
272  (*copy)->sequence_start_allocated = FALSE;
273  (*copy)->oof_sequence_allocated = FALSE;
274  (*copy)->lcase_mask_allocated = FALSE;
275  (*copy)->seq_ranges_allocated = FALSE;
276 }
277 
279 {
281  if (program == NULL)
282  return 1;
283 
284  if (strcasecmp("blastn", program) == 0)
286  else if (strcasecmp("blastp", program) == 0)
288  else if (strcasecmp("blastx", program) == 0)
290  else if (strcasecmp("tblastn", program) == 0)
292  else if (strcasecmp("tblastx", program) == 0)
294  else if (strcasecmp("rpsblast", program) == 0)
296  else if (strcasecmp("rpstblastn", program) == 0)
298  else if (strcasecmp("psiblast", program) == 0)
300  else if (strcasecmp("psitblastn", program) == 0)
302  else if (strcasecmp("phiblastn", program) == 0)
304  else if (strcasecmp("phiblastp", program) == 0)
306  else if (strcasecmp("mapper", program) == 0)
308 
309  return 0;
310 }
311 
313 {
314 
315  if (program == NULL)
316  return 1;
317 
318  switch (number) {
319  case eBlastTypeBlastn:
320  *program = strdup("blastn");
321  break;
322  case eBlastTypeBlastp:
323  *program = strdup("blastp");
324  break;
325  case eBlastTypeBlastx:
326  *program = strdup("blastx");
327  break;
328  case eBlastTypeTblastn:
329  *program = strdup("tblastn");
330  break;
331  case eBlastTypeTblastx:
332  *program = strdup("tblastx");
333  break;
334  case eBlastTypeRpsBlast:
335  *program = strdup("rpsblast");
336  break;
338  *program = strdup("rpstblastn");
339  break;
340  case eBlastTypePsiBlast:
341  *program = strdup("psiblast");
342  break;
344  *program = strdup("psitblastn");
345  break;
346  case eBlastTypePhiBlastp:
347  *program = strdup("phiblastp");
348  break;
349  case eBlastTypePhiBlastn:
350  *program = strdup("phiblastn");
351  break;
352  case eBlastTypeMapping:
353  *program = strdup("mapper");
354  break;
355  default:
356  *program = strdup("unknown");
357  break;
358  }
359 
360  return 0;
361 }
362 
363 /** Translate 3 nucleotides into an amino acid
364  * MUST have 'X' as unknown amino acid
365  * @param codon 3 values in ncbi4na code
366  * @param codes Geneic code string to use (must be in ncbistdaa encoding!)
367  * @return Amino acid in ncbistdaa
368  */
369 static Uint1
370 s_CodonToAA (Uint1* codon, const Uint1* codes)
371 {
372  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
373  register Uint1 aa = 0, taa;
374  register int i, j, k, index0, index1, index2;
375  static Uint1 mapping[4] = { 8, /* T in ncbi4na */
376  2, /* C */
377  1, /* A */
378  4 }; /* G */
379 
380  /* Arithmetic should be faster than conditionals (i.e. with &&s.)
381  The OR cannot result in anything larger than 15 unless it is a
382  FENCE_SENTRY byte or an error, but I poll the individual bytes
383  just in case. */
384 
385  if ((codon[0] | codon[1] | codon[2]) > 15) {
386  if ((codon[0] == FENCE_SENTRY) ||
387  (codon[1] == FENCE_SENTRY) ||
388  (codon[2] == FENCE_SENTRY)) {
389 
390  return FENCE_SENTRY;
391  }
392  }
393 
394  for (i = 0; i < 4; i++) {
395  if (codon[0] & mapping[i]) {
396  index0 = i * 16;
397  for (j = 0; j < 4; j++) {
398  if (codon[1] & mapping[j]) {
399  index1 = index0 + (j * 4);
400  for (k = 0; k < 4; k++) {
401  if (codon[2] & mapping[k]) {
402  index2 = index1 + k;
403  taa = codes[index2];
404  if (! aa)
405  aa = taa;
406  else {
407  if (taa != aa) {
408  aa = kXResidue;
409  break;
410  }
411  }
412  }
413  if (aa == kXResidue)
414  break;
415  }
416  }
417  if (aa == kXResidue)
418  break;
419  }
420  }
421  if (aa == kXResidue)
422  break;
423  }
424  return aa;
425 }
426 
427 Int4
428 BLAST_GetTranslation(const Uint1* query_seq, const Uint1* query_seq_rev,
429  Int4 nt_length, Int2 frame, Uint1* prot_seq, const Uint1* genetic_code)
430 {
431  Uint1 codon[CODON_LENGTH];
432  Int4 index, index_prot;
433  Uint1 residue;
434  Uint1* nucl_seq;
435 
436  nucl_seq = (frame >= 0 ? (Uint1 *)query_seq : (Uint1 *)(query_seq_rev+1));
437 
438  /* The first character in the protein is the NULLB sentinel. */
439  prot_seq[0] = NULLB;
440  index_prot = 1;
441  for (index=ABS(frame)-1; index<nt_length-2; index += CODON_LENGTH)
442  {
443  codon[0] = nucl_seq[index];
444  codon[1] = nucl_seq[index+1];
445  codon[2] = nucl_seq[index+2];
446  residue = s_CodonToAA(codon, genetic_code);
447  if (IS_residue(residue) || residue == FENCE_SENTRY)
448  {
449  prot_seq[index_prot] = residue;
450  index_prot++;
451  }
452  }
453  prot_seq[index_prot] = NULLB;
454 
455  return index_prot - 1;
456 }
457 
458 Int2
460 {
461  Int4 i;
462  Int4 curr_letter;
463  Int4 max_start;
464  Int4 len = seq_blk->length;
465  Uint1* old_seq = seq_blk->sequence;
466  Uint1* new_seq;
467 
468  seq_blk->compressed_nuc_seq_start =
469  (Uint1 *)malloc((len + 3) * sizeof(Uint1));
470  new_seq = seq_blk->compressed_nuc_seq =
471  seq_blk->compressed_nuc_seq_start + 3;
472 
473  new_seq[-1] = new_seq[-2] = new_seq[-3] = 0;
474  new_seq[len-3] = new_seq[len-2] = new_seq[len-1] = 0;
475 
476  /* the first 3 bytes behind new_seq contain right-justified
477  versions of the first 3 (or less) bases */
478  max_start = MIN(3, len);
479  curr_letter = 0;
480  for (i = 0; i < max_start; i++) {
481  curr_letter = curr_letter << 2 | (old_seq[i] & 3);
482  new_seq[i - max_start] = curr_letter;
483  }
484 
485  /* offset i into new_seq points to bases i to i+3
486  packed together into one byte */
487 
488  for (; i < len; i++) {
489  curr_letter = curr_letter << 2 | (old_seq[i] & 3);
490  new_seq[i - max_start] = curr_letter;
491  }
492 
493  /* the last 3 bytes contain left-justified versions of
494  the last 3 (or less) bases */
495  max_start = MIN(3, len);
496  for (i = 0; i < max_start; i++) {
497  curr_letter = curr_letter << 2;
498  new_seq[len - (max_start - i)] = curr_letter;
499  }
500 
501  return 0;
502 }
503 
504 /*
505  Translate a compressed nucleotide sequence without ambiguity codes.
506 */
507 Int4
509  const Uint1* nt_seq, Int2 frame, Uint1* prot_seq)
510 {
511  int state;
512  Int2 total_remainder;
513  Int4 prot_length;
514  int byte_value, codon=-1;
515  Uint1 last_remainder, last_byte, remainder;
516  Uint1* nt_seq_end,* nt_seq_start;
517  Uint1* prot_seq_start;
518  int byte_value1,byte_value2,byte_value3,byte_value4,byte_value5;
519 
520  prot_length=0;
521  if (nt_seq == NULL || prot_seq == NULL ||
522  (length-ABS(frame)+1) < CODON_LENGTH)
523  return prot_length;
524 
525  *prot_seq = NULLB;
526  prot_seq++;
527 
528  /* record to determine protein length. */
529  prot_seq_start = prot_seq;
530 
531  remainder = length%4;
532 
533  if (frame > 0) {
534  nt_seq_end = (Uint1 *) (nt_seq + (length)/4 - 1);
535  last_remainder = (4*(length/4) - frame + 1)%CODON_LENGTH;
536  total_remainder = last_remainder+remainder;
537 
538  state = frame-1;
539  byte_value = *nt_seq;
540 
541  /* If there's lots to do, advance to state 0, then enter fast loop */
542  while (nt_seq < nt_seq_end) {
543  switch (state) {
544  case 0:
545  codon = (byte_value >> 2);
546  *prot_seq = translation[codon];
547  prot_seq++;
548  /* do state = 3 now, break is NOT missing. */
549  case 3:
550  codon = ((byte_value & 3) << 4);
551  nt_seq++;
552  byte_value = *nt_seq;
553  codon += (byte_value >> 4);
554  *prot_seq = translation[codon];
555  prot_seq++;
556  if (nt_seq >= nt_seq_end) {
557  state = 2;
558  break;
559  }
560  /* Go on to state = 2 if not at end. */
561  case 2:
562  codon = ((byte_value & 15) << 2);
563  nt_seq++;
564  byte_value = *nt_seq;
565  codon += (byte_value >> 6);
566  *prot_seq = translation[codon];
567  prot_seq++;
568  if (nt_seq >= nt_seq_end) {
569  state = 1;
570  break;
571  }
572  /* Go on to state = 1 if not at end. */
573  case 1:
574  codon = byte_value & 63;
575  *prot_seq = translation[codon];
576  prot_seq++;
577  nt_seq++;
578  byte_value = *nt_seq;
579  state = 0;
580  break;
581  } /* end switch */
582  /* switch ends at state 0, except when at end */
583 
584  /********************************************/
585  /* optimized loop: start in state 0. continue til near end */
586  while (nt_seq < (nt_seq_end-10)) {
587  byte_value1 = *(++nt_seq);
588  byte_value2 = *(++nt_seq);
589  byte_value3 = *(++nt_seq);
590  /* case 0: */
591  codon = (byte_value >> 2);
592  *prot_seq = translation[codon];
593  prot_seq++;
594 
595  /* case 3: */
596  codon = ((byte_value & 3) << 4);
597  codon += (byte_value1 >> 4);
598  *prot_seq = translation[codon];
599  prot_seq++;
600 
601  byte_value4 = *(++nt_seq);
602  /* case 2: */
603  codon = ((byte_value1 & 15) << 2);
604 
605  codon += (byte_value2 >> 6);
606  *prot_seq = translation[codon];
607  prot_seq++;
608  /* case 1: */
609  codon = byte_value2 & 63;
610  byte_value5 = *(++nt_seq);
611  *prot_seq = translation[codon];
612  prot_seq++;
613 
614  /* case 0: */
615  codon = (byte_value3 >> 2);
616  *prot_seq = translation[codon];
617  prot_seq++;
618  /* case 3: */
619  byte_value = *(++nt_seq);
620  codon = ((byte_value3 & 3) << 4);
621  codon += (byte_value4 >> 4);
622  *prot_seq = translation[codon];
623  prot_seq++;
624  /* case 2: */
625  codon = ((byte_value4 & 15) << 2);
626  codon += (byte_value5 >> 6);
627  *prot_seq = translation[codon];
628  prot_seq++;
629  /* case 1: */
630  codon = byte_value5 & 63;
631  *prot_seq = translation[codon];
632  prot_seq++;
633  state=0;
634  } /* end optimized while */
635  /********************************************/
636  } /* end while */
637 
638  if (state == 1) {
639  /* This doesn't get done above, DON't do the state = 0
640  below if this is done. */
641  byte_value = *nt_seq;
642  codon = byte_value & 63;
643  state = 0;
644  *prot_seq = translation[codon];
645  prot_seq++;
646  } else if (state == 0) { /* This one doesn't get done above. */
647  byte_value = *nt_seq;
648  codon = ((byte_value) >> 2);
649  state = 3;
650  *prot_seq = translation[codon];
651  prot_seq++;
652  }
653 
654  if (total_remainder >= CODON_LENGTH) {
655  byte_value = *(nt_seq_end);
656  last_byte = *(nt_seq_end+1);
657  if (state == 0) {
658  codon = (last_byte >> 2);
659  } else if (state == 2) {
660  codon = ((byte_value & 15) << 2);
661  codon += (last_byte >> 6);
662  } else if (state == 3) {
663  codon = ((byte_value & 3) << 4);
664  codon += (last_byte >> 4);
665  }
666  *prot_seq = translation[codon];
667  prot_seq++;
668  }
669  } else {
670  nt_seq_start = (Uint1 *) nt_seq;
671  nt_seq += length/4;
672  state = remainder+frame;
673  /* Do we start in the last byte? This one has the lowest order
674  bits set to represent the remainder, hence the odd coding here. */
675  if (state >= 0) {
676  last_byte = *nt_seq;
677  nt_seq--;
678  if (state == 0) {
679  codon = (last_byte >> 6);
680  byte_value = *nt_seq;
681  codon += ((byte_value & 15) << 2);
682  state = 1;
683  } else if (state == 1) {
684  codon = (last_byte >> 4);
685  byte_value = *nt_seq;
686  codon += ((byte_value & 3) << 4);
687  state = 2;
688  } else if (state == 2) {
689  codon = (last_byte >> 2);
690  state = 3;
691  }
692  *prot_seq = translation[codon];
693  prot_seq++;
694  } else {
695  state = 3 + (remainder + frame + 1);
696  nt_seq--;
697  }
698 
699  byte_value = *nt_seq;
700 
701  /* If there's lots to do, advance to state 3, then enter fast loop */
702  while (nt_seq > nt_seq_start) {
703  switch (state) {
704  case 3:
705  codon = (byte_value & 63);
706  *prot_seq = translation[codon];
707  prot_seq++;
708  /* do state = 0 now, break is NOT missing. */
709  case 0:
710  codon = (byte_value >> 6);
711  nt_seq--;
712  byte_value = *nt_seq;
713  codon += ((byte_value & 15) << 2);
714  *prot_seq = translation[codon];
715  prot_seq++;
716  if (nt_seq <= nt_seq_start) {
717  state = 1;
718  break;
719  }
720  /* Go on to state = 2 if not at end. */
721  case 1:
722  codon = (byte_value >> 4);
723  nt_seq--;
724  byte_value = *nt_seq;
725  codon += ((byte_value & 3) << 4);
726  *prot_seq = translation[codon];
727  prot_seq++;
728  if (nt_seq <= nt_seq_start) {
729  state = 2;
730  break;
731  }
732  /* Go on to state = 2 if not at end. */
733  case 2:
734  codon = (byte_value >> 2);
735  *prot_seq = translation[codon];
736  prot_seq++;
737  nt_seq--;
738  byte_value = *nt_seq;
739  state = 3;
740  break;
741  } /* end switch */
742  /* switch ends at state 3, except when at end */
743 
744  /********************************************/
745  /* optimized area: start in state 0. continue til near end */
746  while (nt_seq > (nt_seq_start+10)) {
747  byte_value1 = *(--nt_seq);
748  byte_value2 = *(--nt_seq);
749  byte_value3 = *(--nt_seq);
750 
751  codon = (byte_value & 63);
752  *prot_seq = translation[codon];
753  prot_seq++;
754  codon = (byte_value >> 6);
755  codon += ((byte_value1 & 15) << 2);
756  *prot_seq = translation[codon];
757  prot_seq++;
758  byte_value4 = *(--nt_seq);
759  codon = (byte_value1 >> 4);
760  codon += ((byte_value2 & 3) << 4);
761  *prot_seq = translation[codon];
762  prot_seq++;
763  codon = (byte_value2 >> 2);
764  *prot_seq = translation[codon];
765  prot_seq++;
766  byte_value5 = *(--nt_seq);
767 
768  codon = (byte_value3 & 63);
769  *prot_seq = translation[codon];
770  prot_seq++;
771  byte_value = *(--nt_seq);
772  codon = (byte_value3 >> 6);
773  codon += ((byte_value4 & 15) << 2);
774  *prot_seq = translation[codon];
775  prot_seq++;
776  codon = (byte_value4 >> 4);
777  codon += ((byte_value5 & 3) << 4);
778  *prot_seq = translation[codon];
779  prot_seq++;
780  codon = (byte_value5 >> 2);
781  *prot_seq = translation[codon];
782  prot_seq++;
783  } /* end optimized while */
784  /********************************************/
785 
786  } /* end while */
787 
788  byte_value = *nt_seq;
789  if (state == 3) {
790  codon = (byte_value & 63);
791  *prot_seq = translation[codon];
792  prot_seq++;
793  } else if (state == 2) {
794  codon = (byte_value >> 2);
795  *prot_seq = translation[codon];
796  prot_seq++;
797  }
798  }
799 
800  *prot_seq = NULLB;
801 
802  return (Int4)(prot_seq - prot_seq_start);
803 } /* BlastTranslateUnambiguousSequence */
804 
805 
806 /* Reverse a nucleotide sequence in the ncbi4na encoding */
807 Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length,
808  Uint1** rev_sequence_ptr)
809 {
810  Uint1* rev_sequence;
811  Int4 index;
812  /* Conversion table from forward to reverse strand residue in the blastna
813  encoding */
814  Uint1 conversion_table[16] = {
815  0, 8, 4, 12,
816  2, 10, 6, 14,
817  1, 9, 5, 13,
818  3, 11, 7, 15
819  };
820 
821  if (!rev_sequence_ptr)
822  return -1;
823 
824  rev_sequence = (Uint1*) malloc(length + 2);
825 
826  rev_sequence[0] = rev_sequence[length+1] = NULLB;
827 
828  for (index = 0; index < length; ++index) {
829  if (sequence[index] == FENCE_SENTRY)
830  rev_sequence[length-index] = FENCE_SENTRY;
831  else
832  rev_sequence[length-index] = conversion_table[sequence[index]];
833  }
834 
835  *rev_sequence_ptr = rev_sequence;
836  return 0;
837 }
838 
839 Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
840 {
841  Int1 frame = INT1_MAX; /* INT1_MAX is used to indicate error */
842 
843  if (prog_number == eBlastTypeBlastn || prog_number == eBlastTypeMapping) {
844  if (context_number % NUM_STRANDS == 0)
845  frame = 1;
846  else
847  frame = -1;
848  } else if (Blast_QueryIsProtein(prog_number) ||
849  prog_number == eBlastTypePhiBlastn) {
850  /* Query is an untranslated protein, a pattern, or a PSSM, no frame. */
851  frame = 0;
852  } else if (prog_number == eBlastTypeBlastx ||
853  prog_number == eBlastTypeTblastx ||
854  prog_number == eBlastTypeRpsTblastn) {
855  context_number = context_number % NUM_FRAMES;
856  switch (context_number) {
857  case 0: frame = 1; break;
858  case 1: frame = 2; break;
859  case 2: frame = 3; break;
860  case 3: frame = -1; break;
861  case 4: frame = -2; break;
862  case 5: frame = -3; break;
863  default: abort(); break; /* should never happen */
864  }
865  }
866 
867  return frame;
868 }
869 
870 Int2 BLAST_PackDNA(const Uint1* buffer, Int4 length, EBlastEncoding encoding,
871  Uint1** packed_seq)
872 {
873  Int4 new_length = length/COMPRESSION_RATIO + 1;
874  Uint1* new_buffer = (Uint1*) malloc(new_length);
875  Int4 index, new_index;
876  Uint1 shift; /* bit shift to pack bases */
877 
878  if ( !new_buffer ) {
879  return -1;
880  }
881 
882  for (index=0, new_index=0; new_index < new_length-1;
883  ++new_index, index += COMPRESSION_RATIO) {
884  if (encoding == eBlastEncodingNucleotide)
885  new_buffer[new_index] =
886  ((buffer[index]&NCBI2NA_MASK)<<6) |
887  ((buffer[index+1]&NCBI2NA_MASK)<<4) |
888  ((buffer[index+2]&NCBI2NA_MASK)<<2) |
889  (buffer[index+3]&NCBI2NA_MASK);
890  else
891  new_buffer[new_index] =
892  ((NCBI4NA_TO_BLASTNA[buffer[index]]&NCBI2NA_MASK)<<6) |
893  ((NCBI4NA_TO_BLASTNA[buffer[index+1]]&NCBI2NA_MASK)<<4) |
894  ((NCBI4NA_TO_BLASTNA[buffer[index+2]]&NCBI2NA_MASK)<<2) |
896  }
897 
898  /* Handle the last byte of the compressed sequence.
899  Last 2 bits of the last byte tell the number of valid
900  packed sequence bases in it. */
901  new_buffer[new_index] = length % COMPRESSION_RATIO;
902 
903  for (; index < length; index++) {
904  switch (index%COMPRESSION_RATIO) {
905  case 0: shift = 6; break;
906  case 1: shift = 4; break;
907  case 2: shift = 2; break;
908  default: abort(); /* should never happen */
909  }
910  if (encoding == eBlastEncodingNucleotide)
911  new_buffer[new_index] |= ((buffer[index]&NCBI2NA_MASK)<<shift);
912  else
913  new_buffer[new_index] |=
914  ((NCBI4NA_TO_BLASTNA[buffer[index]]&NCBI2NA_MASK)<<shift);
915  }
916 
917  *packed_seq = new_buffer;
918 
919  return 0;
920 }
921 
922 size_t
923 BLAST_GetTranslatedProteinLength(size_t nucleotide_length, unsigned int context)
924 {
925  if (nucleotide_length == 0 || nucleotide_length <= context % CODON_LENGTH) {
926  return 0;
927  }
928  return (nucleotide_length - context % CODON_LENGTH) / CODON_LENGTH;
929 }
930 
932  const BlastQueryInfo* query_info)
933 {
934  Uint1* buffer,* seq = NULL;
935  Int4 index;
936  Int4 length[CODON_LENGTH];
937  Int4 total_length = QueryInfo_GetSeqBufLen(query_info);
938 
939  /* Allocate 1 extra byte for a final sentinel. */
940  buffer = (Uint1*) malloc(total_length+1);
941  if (!buffer)
942  return -1;
943 
944  for (index = 0; index <= query_info->last_context; index += CODON_LENGTH) {
945  int i;
946 
947  if (query_info->contexts[index].query_length == 0) /* Indicates this context is not searched. */
948  continue;
949 
950  seq = &buffer[query_info->contexts[index].query_offset];
951 
952  for (i = 0; i < CODON_LENGTH; ++i) {
953  *seq++ = NULLB;
954  length[i] = query_info->contexts[index + i].query_length;
955  }
956 
957  for (i = 0; ; ++i) {
958  Uint1 *tmp_seq;
959  Int4 context = i % 3;
960  Int4 offset = i / 3;
961  if (offset >= length[context]) {
962  /* Once one frame is past its end, we are done */
963  break;
964  }
965  tmp_seq =
966  &query_blk->sequence[query_info->contexts[index+context].query_offset];
967  *seq++ = tmp_seq[offset];
968  }
969  }
970  /* Add a sentinel null byte at the end. */
971  if (seq)
972  *seq = NULLB;
973 
974  /* The mixed-frame protein sequence buffer will be saved in
975  'oof_sequence' */
976  query_blk->oof_sequence = buffer;
977  query_blk->oof_sequence_allocated = TRUE;
978 
979  return 0;
980 }
981 
982 /** Gets the translation array for a given genetic code.
983  * This array is optimized for the NCBI2na alphabet.
984  * The reverse complement can also be specified.
985  * @param genetic_code Genetic code string in ncbistdaa encoding [in]
986  * @param reverse_complement Get translation table for the reverse strand? [in]
987  * @return The translation table.
988 */
989 static Uint1*
991 
992 {
993  Int2 index1, index2, index3, bp1, bp2, bp3;
994  Int2 codon;
995  Uint1* translation;
996  /* The next array translate between the ncbi2na rep's and
997  the rep's used by the genetic_code tables. The rep used by the
998  genetic code arrays is in mapping: T=0, C=1, A=2, G=3 */
999  static Uint1 mapping[4] = {2, /* A in ncbi2na */
1000  1, /* C in ncbi2na. */
1001  3, /* G in ncbi2na. */
1002  0 /* T in ncbi2na. */ };
1003 
1004  if (genetic_code == NULL)
1005  return NULL;
1006 
1007  translation = calloc(64, sizeof(Uint1));
1008  if (translation == NULL)
1009  return NULL;
1010 
1011  for (index1=0; index1<4; index1++)
1012  {
1013  for (index2=0; index2<4; index2++)
1014  {
1015  for (index3=0; index3<4; index3++)
1016  {
1017  /* The reverse complement codon is saved in its original
1018  (non-complement) form AND with the high-order bits reversed
1019  from the non-complement form, as this is how they appear in
1020  the sequence.
1021  */
1022  if (reverse_complement)
1023  {
1024  bp1 = 3 - index1;
1025  bp2 = 3 - index2;
1026  bp3 = 3 - index3;
1027  codon = (mapping[bp1]<<4) + (mapping[bp2]<<2) + (mapping[bp3]);
1028  translation[(index3<<4) + (index2<<2) + index1] =
1029  genetic_code[codon];
1030  }
1031  else
1032  {
1033  codon = (mapping[index1]<<4) + (mapping[index2]<<2) +
1034  (mapping[index3]);
1035  translation[(index1<<4) + (index2<<2) + index3] =
1036  genetic_code[codon];
1037  }
1038  }
1039  }
1040  }
1041  return translation;
1042 }
1043 
1044 
1046  Int4 nucl_length, const Uint1* genetic_code,
1047  Uint1** translation_buffer_ptr, Uint4** frame_offsets_ptr,
1048  Uint1** mixed_seq_ptr)
1049 {
1050  Uint1* translation_buffer,* mixed_seq;
1051  Uint1* translation_table = NULL,* translation_table_rc = NULL;
1052  Uint1* nucl_seq_rev;
1053  Uint4 offset = 0, length;
1054  Int4 context;
1055  Uint4* frame_offsets;
1056  Int2 frame;
1057 
1058  Uint4 buffer_length =2*(nucl_length+1)+2;
1059 
1060  if (encoding != eBlastEncodingNcbi2na && encoding != eBlastEncodingNcbi4na)
1061  return -1;
1062 
1063  if ((translation_buffer =
1064  (Uint1*) malloc(buffer_length)) == NULL)
1065  return -1;
1066 
1067  if (encoding == eBlastEncodingNcbi4na) {
1068  /* First produce the reverse strand of the nucleotide sequence */
1069  GetReverseNuclSequence(nucl_seq, nucl_length,
1070  &nucl_seq_rev);
1071  } else {
1072  translation_table = s_BlastGetTranslationTable(genetic_code, FALSE);
1073  translation_table_rc = s_BlastGetTranslationTable(genetic_code, TRUE);
1074  }
1075 
1076  frame_offsets = (Uint4*) malloc((NUM_FRAMES+1)*sizeof(Uint4));
1077 
1078  frame_offsets[0] = 0;
1079 
1080  for (context = 0; context < NUM_FRAMES; ++context) {
1082  if (encoding == eBlastEncodingNcbi2na) {
1083  if (frame > 0) {
1084  length =
1085  BLAST_TranslateCompressedSequence(translation_table,
1086  nucl_length, nucl_seq, frame, translation_buffer+offset);
1087  } else {
1088  length =
1089  BLAST_TranslateCompressedSequence(translation_table_rc,
1090  nucl_length, nucl_seq, frame, translation_buffer+offset);
1091  }
1092  } else {
1093  length =
1094  BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1095  nucl_length, frame, translation_buffer+offset, genetic_code);
1096  }
1097 
1098  /* Increment offset by 1 extra byte for the sentinel NULLB
1099  between frames. */
1100  offset += length + 1;
1101  frame_offsets[context+1] = offset;
1102  }
1103 
1104  if (encoding == eBlastEncodingNcbi4na) {
1105  sfree(nucl_seq_rev);
1106  } else {
1107  free(translation_table);
1108  sfree(translation_table_rc);
1109  }
1110 
1111  /* All frames are ready. For the out-of-frame gapping option, allocate
1112  and fill buffer with the mixed frame sequence */
1113  if (mixed_seq_ptr) {
1114  Uint1* seq;
1115  Int4 index, i;
1116 
1117  *mixed_seq_ptr = mixed_seq = (Uint1*) malloc(2*nucl_length+3);
1118  seq = mixed_seq;
1119  for (index = 0; index < NUM_FRAMES; index += CODON_LENGTH) {
1120  for (i = 0; i <= nucl_length; ++i) {
1121  context = i % CODON_LENGTH;
1122  offset = i / CODON_LENGTH;
1123  *seq++ = translation_buffer[frame_offsets[index+context]+offset];
1124  }
1125  }
1126  *seq = NULLB;
1127  }
1128  if (translation_buffer_ptr)
1129  *translation_buffer_ptr = translation_buffer;
1130  else
1131  sfree(translation_buffer);
1132 
1133  if (frame_offsets_ptr)
1134  *frame_offsets_ptr = frame_offsets;
1135  else
1136  sfree(frame_offsets);
1137 
1138  return 0;
1139 }
1140 
1142  Int4 nucl_length, Int2 frame, const Uint1* genetic_code,
1143  Uint1** translation_buffer_ptr, Int4* protein_length,
1144  Uint1** mixed_seq_ptr)
1145 {
1146  Uint1* translation_buffer;
1147  Uint1* nucl_seq_rev = NULL;
1148  Int4 length;
1149 
1150  if (frame < 0) {
1151  /* First produce the reverse strand of the nucleotide sequence */
1152  GetReverseNuclSequence(nucl_seq, nucl_length, &nucl_seq_rev);
1153  }
1154 
1155  if (!mixed_seq_ptr) {
1156  if ((translation_buffer =
1157  (Uint1*) malloc(nucl_length/CODON_LENGTH+2)) == NULL)
1158  {
1159  sfree(nucl_seq_rev);
1160  return -1;
1161  }
1162 
1163  length =
1164  BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1165  nucl_length, frame, translation_buffer,
1166  genetic_code);
1167  if (protein_length)
1168  *protein_length = length;
1169  } else {
1170  Int2 index;
1171  Int2 frame_sign = ((frame < 0) ? -1 : 1);
1172  Uint4 offset = 0;
1173  Uint4 frame_offsets[CODON_LENGTH];
1174  Uint1* seq;
1175 
1176  if ((translation_buffer = (Uint1*) malloc(nucl_length+2)) == NULL)
1177  {
1178  sfree(nucl_seq_rev);
1179  return -1;
1180  }
1181 
1182  for (index = 1; index <= CODON_LENGTH; ++index) {
1183  length =
1184  BLAST_GetTranslation(nucl_seq, nucl_seq_rev,
1185  nucl_length, (short)(frame_sign*index),
1186  translation_buffer+offset, genetic_code);
1187  frame_offsets[index-1] = offset;
1188  offset += length + 1;
1189  }
1190 
1191  *mixed_seq_ptr = (Uint1*) malloc(nucl_length+2);
1192  if (protein_length)
1193  *protein_length = nucl_length;
1194  for (index = 0, seq = *mixed_seq_ptr; index <= nucl_length;
1195  ++index, ++seq) {
1196  *seq = translation_buffer[frame_offsets[index%CODON_LENGTH] +
1197  (index/CODON_LENGTH)];
1198  }
1199  }
1200 
1201  sfree(nucl_seq_rev);
1202  if (translation_buffer_ptr)
1203  *translation_buffer_ptr = translation_buffer;
1204  else
1205  sfree(translation_buffer);
1206 
1207  return 0;
1208 }
1209 
1210 
1212 {
1213  if (Blast_QueryIsTranslated(program) ||
1214  Blast_SubjectIsTranslated(program)) {
1215  ASSERT(frame >= -3 && frame <= 3 && frame != 0);
1216  if (frame > 0) {
1217  return frame - 1;
1218  } else {
1219  return 2 - frame;
1220  }
1221  } else if (Blast_QueryIsNucleotide(program) ||
1222  Blast_SubjectIsNucleotide(program)) {
1223  ASSERT(frame == 1 || frame == -1);
1224  return frame == 1 ? 0 : 1;
1225  } else {
1226  ASSERT(frame == 0);
1227  return 0;
1228  }
1229 }
1230 
/** Binary search over an array of Int4 that is expected to be sorted in
 * ascending order.
 *
 * @param n Value to look for [in]
 * @param A Array to search; not modified [in]
 * @param size Number of elements in A [in]
 * @return The largest index whose element is <= n. Returns 0 when n is
 *         smaller than every element and also when size <= 1; A is not read
 *         in the latter case.
 */
Int4 BSearchInt4(Int4 n, Int4* A, Int4 size)
{
    Int4 m, b, e;

    b = 0;
    e = size;
    while (b < e - 1) {
        /* Same midpoint as (b + e) / 2 for 0 <= b < e, but it cannot
           overflow Int4 when the array has more than 2^30 elements. */
        m = b + (e - b) / 2;
        if (A[m] > n)
            e = m;
        else
            b = m;
    }
    return b;
}
1246 
1249 {
1250 
1251  if (target_t)
1252  {
1253  if (target_t->translations)
1254  {
1255  int index;
1256  for (index=0; index<target_t->num_frames; index++)
1257  sfree(target_t->translations[index]);
1258  sfree(target_t->translations);
1259  }
1260  if (target_t->range)
1261  sfree(target_t->range);
1262  sfree(target_t);
1263  }
1264  return NULL;
1265 }
1266 
/* Sets up an SBlastTargetTranslation for a subject sequence and hands it back
 * through *target. Every visible path ends in `return 0`.
 *
 * For an out-of-frame target the translation is done immediately; otherwise
 * only bookkeeping (the range array and a pointer to subject_blk) is set up
 * so that translation can happen later.
 *
 * NOTE(review): this listing skips original lines 1268, 1274, 1289 and 1304.
 * Presumably they hold the function name with its first parameter
 * (subject_blk), the declaration/allocation of `retval`, the start of the
 * out-of-frame translation call, and the declaration of `frame` - confirm
 * against the real file. The comments below describe only visible statements.
 */
1267 Int2
1269  const Uint1* gen_code_string,
1270  EBlastProgramType program_number,
1271  Boolean is_ooframe,
1272  SBlastTargetTranslation** target)
1273 {
/* NOTE(review): retval is dereferenced here; no NULL check is visible. */
1275  Int4 num_frames = retval->num_frames = NUM_FRAMES;
1276  *target = retval;
1277 
1278  retval->gen_code_string = gen_code_string;
1279  retval->program_number = program_number;
1280 
1281  /* If target is OOF do translation now, otherwise do it as needed. */
1282  retval->partial = !is_ooframe;
1283 
/* One zero-initialised pointer slot per frame.
   NOTE(review): the calloc result is not checked for NULL. */
1284  retval->translations = (Uint1**) calloc(num_frames, sizeof(Uint1*));
1285 
/* NOTE(review): partial was just set to !is_ooframe, so whenever this branch
   is entered is_ooframe is true and the inner `else` (the per-frame
   translation loop) cannot execute. */
1286  if (!retval->partial)
1287  {
1288  if (is_ooframe) {
/* The head of this call (original line 1289) is not visible. The last
   argument is &subject_blk->oof_sequence, and the block is then flagged as
   owning that buffer. */
1290  eBlastEncodingNcbi4na, subject_blk->length, gen_code_string,
1291  NULL, NULL, &subject_blk->oof_sequence);
1292  subject_blk->oof_sequence_allocated = TRUE;
1293  }
1294  else
1295  {
1296  int context = 0;
1297  Uint1* nucl_seq_rev = NULL;
1298 
1299  /* First produce the reverse strand of the nucleotide sequence */
1300  GetReverseNuclSequence(subject_blk->sequence_start, subject_blk->length,
1301  &nucl_seq_rev);
1302 
1303  for (context = 0; context < num_frames; ++context) {
/* Each frame gets its own buffer of length/3 + 2 bytes.
   NOTE(review): the malloc result is passed on unchecked; `frame` is
   declared on the line missing from this listing. */
1305  retval->translations[context] = (Uint1*) malloc((2+subject_blk->length/3)*sizeof(Uint1));
1306  BLAST_GetTranslation(subject_blk->sequence_start, nucl_seq_rev,
1307  subject_blk->length, frame, retval->translations[context], gen_code_string);
1308  }
1309  sfree(nucl_seq_rev);
1310  }
1311  }
1312  else
1313  {
/* Deferred mode: two zero-initialised Int4 per frame in `range`, plus a
   borrowed pointer to subject_blk (the pointer is stored, not a copy). */
1314  retval->range = (Int4*) calloc(2*num_frames, sizeof(Int4));
1315  retval->subject_blk = subject_blk; /* Get pointer for later translations. */
1316  }
1317 
1318 
1319  return 0;
1320 }
1321 
/* Returns a newly malloc'ed array of sbp.alphabet_size doubles, copied from
 * the `prob` array of a temporary Blast_ResFreq that Blast_ResFreqStdComp
 * fills in. Returns NULL if the result array cannot be allocated. The array
 * is not freed here.
 *
 * NOTE(review): this listing skips original line 1323 (the function name
 * line) and lines 1332-1333. Presumably the latter set sbp.alphabet_code and
 * sbp.alphabet_size, since the visible code only zeroes sbp and sets
 * protein_alphabet - confirm against the real file.
 */
1322 double*
1324 {
1325  Blast_ResFreq* standard_probabilities = NULL;
1326  Uint4 i = 0;
1327  double* retval = NULL;
1328 
1329  /* Manually build a BlastScoreBlk, we only need a few fields populated */
1330  BlastScoreBlk sbp;
1331  memset((void*)&sbp, 0, sizeof(BlastScoreBlk));
1334  sbp.protein_alphabet = TRUE;
1335 
1336  retval = (double*) malloc(sbp.alphabet_size * sizeof(double));
1337  if ( !retval ) {
1338  return NULL;
1339  }
1340 
/* NOTE(review): the result of Blast_ResFreqNew is not checked before it is
   passed on and dereferenced in the copy loop below. */
1341  standard_probabilities = Blast_ResFreqNew(&sbp);
1342  Blast_ResFreqStdComp(&sbp, standard_probabilities);
1343 
/* Copy the probabilities out so the temporary structure can be released. */
1344  for (i = 0; i < (Uint4) sbp.alphabet_size; i++) {
1345  retval[i] = standard_probabilities->prob[i];
1346  }
1347 
1348  Blast_ResFreqFree(standard_probabilities);
1349  return retval;
1350 }
1351 
/** Return an upper-cased copy of a string.
 *
 * @param string NUL-terminated input, may be NULL; it is not modified [in]
 * @return A newly allocated copy of the input with every character passed
 *         through toupper(), or NULL if the input is NULL or the copy cannot
 *         be allocated. Release it with free()/sfree().
 */
char* BLAST_StrToUpper(const char* string)
{
    size_t len;
    size_t pos;
    char* upper;

    if (string == NULL) {
        return NULL;
    }

    len = strlen(string);
    upper = (char*) malloc(len + 1);
    if (upper == NULL) {
        return NULL;
    }

    /* Cast through unsigned char: toupper() is undefined for negative
       values other than EOF. */
    for (pos = 0; pos < len; ++pos) {
        upper[pos] = toupper((unsigned char) string[pos]);
    }
    upper[len] = '\0';

    return upper;
}
1371 
1372 unsigned int
1374 {
1375  if (Blast_QueryIsTranslated(p)) {
1376  return NUM_FRAMES;
1377  } else if (Blast_QueryIsNucleotide(p)) {
1378  return NUM_STRANDS;
1379  } else if (Blast_ProgramIsValid(p)){
1380  return 1;
1381  } else {
1382  return 0;
1383  }
1384 }
1385 
1386 
1388 {
1389  SBlastProgress* retval = (SBlastProgress*)calloc(1, sizeof(SBlastProgress));
1390  if ( !retval ) {
1391  return NULL;
1392  }
1393  retval->user_data = user_data;
1394  return retval;
1395 }
1396 
1398 {
1399  if ( !progress_info ) {
1400  return NULL;
1401  }
1402  sfree(progress_info);
1403  return NULL;
1404 }
1405 
1407 {
1408  if ( !progress_info ) {
1409  return;
1410  }
1411  progress_info->stage = ePrelimSearch;
1412 }
1413 
ESubjectMaskingType
Define the possible subject masking types.
Definition: blast_def.h:235
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
#define NUM_STRANDS
Number of frames in a nucleotide sequence.
Definition: blast_def.h:93
#define NUM_FRAMES
Number of frames to which we translate in translating searches.
Definition: blast_def.h:88
@ ePrelimSearch
Preliminary stage.
Definition: blast_def.h:328
BLAST filtering functions.
BlastMaskLoc * BlastMaskLocFree(BlastMaskLoc *mask_loc)
Deallocate memory for a BlastMaskLoc structure as well as the BlastSeqLoc's pointed to.
Definition: blast_filter.c:789
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
Boolean Blast_SubjectIsNucleotide(EBlastProgramType p)
Returns true if the subject is nucleotide.
Definition: blast_program.c:53
Boolean Blast_QueryIsNucleotide(EBlastProgramType p)
Returns true if the query is nucleotide.
Definition: blast_program.c:43
Boolean Blast_QueryIsProtein(EBlastProgramType p)
Returns true if the query is protein.
Definition: blast_program.c:40
Boolean Blast_ProgramIsValid(EBlastProgramType p)
Returns true if program is not undefined.
Definition: blast_program.c:87
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypePsiTblastn
Definition: blast_program.h:83
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypeMapping
Definition: blast_program.h:88
@ eBlastTypeTblastx
Definition: blast_program.h:79
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypePhiBlastp
Definition: blast_program.h:86
@ eBlastTypeRpsBlast
Definition: blast_program.h:84
@ eBlastTypeUndefined
Definition: blast_program.h:89
@ eBlastTypeTblastn
Definition: blast_program.h:77
@ eBlastTypeBlastp
Definition: blast_program.h:73
Boolean Blast_SubjectIsTranslated(EBlastProgramType p)
Returns true if the subject is translated.
Definition: blast_program.c:63
Uint4 QueryInfo_GetSeqBufLen(const BlastQueryInfo *qinfo)
Get the number of bytes required for the concatenated sequence buffer, given a query info structure.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
Blast_ResFreq * Blast_ResFreqFree(Blast_ResFreq *rfp)
Deallocates Blast_ResFreq and prob0 element.
Definition: blast_stat.c:1689
Int2 Blast_ResFreqStdComp(const BlastScoreBlk *sbp, Blast_ResFreq *rfp)
Calculates residue frequencies given a standard distribution.
Definition: blast_stat.c:1887
Blast_ResFreq * Blast_ResFreqNew(const BlastScoreBlk *sbp)
Allocates a new Blast_ResFreq structure and fills in the prob element based upon the contents of sbp.
Definition: blast_stat.c:1708
Int4 BLAST_FrameToContext(Int2 frame, EBlastProgramType program)
Convert translation frame or strand into a context number suitable for indexing into the BlastQueryIn...
Definition: blast_util.c:1211
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
static void s_BlastSequenceBlkFreeSeqRanges(BLAST_SequenceBlk *seq_blk)
Auxiliary function to free the BLAST_SequenceBlk::seq_ranges field if applicable.
Definition: blast_util.c:90
Int2 BlastSeqBlkSetSeqRanges(BLAST_SequenceBlk *seq_blk, SSeqRange *seq_ranges, Uint4 num_seq_ranges, Boolean copy_seq_ranges, ESubjectMaskingType mask_type)
Sets the seq_range and related fields appropriately in the BLAST_SequenceBlk structure.
Definition: blast_util.c:182
void SBlastProgressReset(SBlastProgress *progress_info)
Resets the progress structure to its original state (as if newly allocated) for a fresh start without...
Definition: blast_util.c:1406
Int4 SSeqRangeArrayLessThanOrEqual(const SSeqRange *ranges, Int4 num_ranges, Int4 target)
Returns the index of the range, such that this element is the first range that either contains the ta...
Definition: blast_util.c:56
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Definition: blast_util.c:147
SBlastProgress * SBlastProgressNew(void *user_data)
Allocates and initializes a new SBlastProgress structure.
Definition: blast_util.c:1387
SSeqRange SSeqRangeNew(Int4 start, Int4 stop)
Create a new SSeqRange structure with both fields initialized.
Definition: blast_util.c:47
size_t BLAST_GetTranslatedProteinLength(size_t nucleotide_length, unsigned int context)
Calculates the length of frame for a translated protein.
Definition: blast_util.c:923
SBlastProgress * SBlastProgressFree(SBlastProgress *progress_info)
Deallocates a SBlastProgress structure.
Definition: blast_util.c:1397
Int2 BLAST_CreateMixedFrameDNATranslation(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info)
Initialize the mixed-frame sequence for out-of-frame gapped extension.
Definition: blast_util.c:931
Int2 BlastNumber2Program(EBlastProgramType number, char **program)
Return string name for program given a number.
Definition: blast_util.c:312
static Uint1 s_CodonToAA(Uint1 *codon, const Uint1 *codes)
Translate 3 nucleotides into an amino acid MUST have 'X' as unknown amino acid.
Definition: blast_util.c:370
Int2 BlastSeqBlkSetCompressedSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence)
Stores the compressed nucleotide sequence in the sequence block structure for the subject sequence wh...
Definition: blast_util.c:167
int Blast_GetPartialTranslation(const Uint1 *nucl_seq, Int4 nucl_length, Int2 frame, const Uint1 *genetic_code, Uint1 **translation_buffer_ptr, Int4 *protein_length, Uint1 **mixed_seq_ptr)
Get one frame translation - needed when only parts of subject sequences are translated.
Definition: blast_util.c:1141
Int2 BlastTargetTranslationNew(BLAST_SequenceBlk *subject_blk, const Uint1 *gen_code_string, EBlastProgramType program_number, Boolean is_ooframe, SBlastTargetTranslation **target)
Sets up structure for target translation.
Definition: blast_util.c:1268
Int2 BlastSetUp_SeqBlkNew(const Uint1 *buffer, Int4 length, BLAST_SequenceBlk **seq_blk, Boolean buffer_allocated)
Allocates memory for *sequence_blk and then populates it.
Definition: blast_util.c:101
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:839
Int2 BLAST_PackDNA(const Uint1 *buffer, Int4 length, EBlastEncoding encoding, Uint1 **packed_seq)
Convert a sequence in ncbi4na or blastna encoding into a packed sequence in ncbi2na encoding.
Definition: blast_util.c:870
SBlastTargetTranslation * BlastTargetTranslationFree(SBlastTargetTranslation *target_t)
Free SBlastTargetTranslation.
Definition: blast_util.c:1248
Int2 BlastCompressBlastnaSequence(BLAST_SequenceBlk *seq_blk)
Adds a specialized representation of sequence data to a sequence block.
Definition: blast_util.c:459
Int4 BLAST_GetTranslation(const Uint1 *query_seq, const Uint1 *query_seq_rev, Int4 nt_length, Int2 frame, Uint1 *prot_seq, const Uint1 *genetic_code)
GetTranslation to get the translation of the nucl.
Definition: blast_util.c:428
void BlastSequenceBlkClean(BLAST_SequenceBlk *seq_blk)
Deallocate memory only for the sequence in the sequence block.
Definition: blast_util.c:220
Int2 BLAST_GetAllTranslations(const Uint1 *nucl_seq, EBlastEncoding encoding, Int4 nucl_length, const Uint1 *genetic_code, Uint1 **translation_buffer_ptr, Uint4 **frame_offsets_ptr, Uint1 **mixed_seq_ptr)
Translate nucleotide into 6 frames.
Definition: blast_util.c:1045
unsigned int BLAST_GetNumberOfContexts(EBlastProgramType p)
Get the number of contexts for a given program.
Definition: blast_util.c:1373
Int2 BlastProgram2Number(const char *program, EBlastProgramType *number)
Set number for a given program type.
Definition: blast_util.c:278
Int2 GetReverseNuclSequence(const Uint1 *sequence, Int4 length, Uint1 **rev_sequence_ptr)
Reverse a nucleotide sequence in the blastna encoding, adding sentinel bytes on both ends.
Definition: blast_util.c:807
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Definition: blast_util.c:133
Int4 BSearchInt4(Int4 n, Int4 *A, Int4 size)
The following binary search routine assumes that array A is filled.
Definition: blast_util.c:1231
static Uint1 * s_BlastGetTranslationTable(const Uint1 *genetic_code, Boolean reverse_complement)
Gets the translation array for a given genetic code.
Definition: blast_util.c:990
char * BLAST_StrToUpper(const char *string)
Returns a copy of the input string with all its characters turned to uppercase.
Definition: blast_util.c:1352
void BlastSequenceBlkCopy(BLAST_SequenceBlk **copy, BLAST_SequenceBlk *src)
Copies contents of the source sequence block without copying sequence buffers; sets all "field_alloca...
Definition: blast_util.c:259
void __sfree(void **x)
Implemented in blast_util.c.
Definition: blast_util.c:40
double * BLAST_GetStandardAaProbabilities()
Get the standard amino acid probabilities.
Definition: blast_util.c:1323
Int4 BLAST_TranslateCompressedSequence(Uint1 *translation, Int4 length, const Uint1 *nt_seq, Int2 frame, Uint1 *prot_seq)
Translate a nucleotide sequence without ambiguity codes.
Definition: blast_util.c:508
Various auxiliary BLAST utility functions.
#define NCBI2NA_MASK
Bit mask for obtaining a single base from a byte in ncbi2na format.
Definition: blast_util.h:52
#define FENCE_SENTRY
This sentry value is used as a 'fence' around the valid portions of partially decoded sequences.
Definition: blast_util.h:364
#define IS_residue(x)
Does character encode a residue?
Definition: blast_util.h:48
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
EBlastEncoding
Different types of sequence encodings for sequence retrieval from the BLAST database.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
const Uint1 NCBI4NA_TO_BLASTNA[]
Translates between ncbi4na and blastna.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
@ eBlastEncodingNcbi4na
NCBI4na.
@ eBlastEncodingNucleotide
Special encoding for preliminary stage of BLAST: permutation of NCBI4na.
@ eBlastEncodingNcbi2na
NCBI2na.
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int8_t Int1
1-byte (8-bit) signed integer
Definition: ncbitype.h:98
int i
yy_size_t n
int len
const TrnaAa taa[]
Definition: loadfeat.cpp:126
const struct ncbi::grid::netcache::search::fields::SIZE size
#define strdup
Definition: ncbi_ansi_ext.h:70
#define strcasecmp
#define INT1_MAX
largest number represented by signed short (one byte)
Definition: ncbi_std.h:166
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
void * BlastMemDup(const void *orig, size_t size)
Copies memory using memcpy and malloc.
Definition: ncbi_std.c:35
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define ABS(a)
returns absolute value of a (|a|)
Definition: ncbi_std.h:122
#define NULLB
terminating byte of a char* string.
Definition: ncbi_std.h:181
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
int toupper(Uchar c)
Definition: ncbictype.hpp:73
void abort()
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
#define A
static BOOL number
Definition: pcre2grep.c:285
static uint8_t * buffer
Definition: pcre2test.c:1016
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence_start
Start of sequence, usually one byte before sequence as that byte is a NULL sentinel byte.
Definition: blast_def.h:244
Uint1 * compressed_nuc_seq_start
start of compressed_nuc_seq
Definition: blast_def.h:264
Uint4 num_seq_ranges
Number of elements in seq_ranges.
Definition: blast_def.h:281
Boolean sequence_allocated
TRUE if memory has been allocated for sequence.
Definition: blast_def.h:251
BlastMaskLoc * lcase_mask
Locations to be masked from operations on this sequence: lookup table for query; scanning for subject...
Definition: blast_def.h:265
SSeqRange * seq_ranges
Ranges of the sequence to search.
Definition: blast_def.h:280
Boolean lcase_mask_allocated
TRUE if memory has been allocated for lcase_mask.
Definition: blast_def.h:268
Int4 length
Length of sequence.
Definition: blast_def.h:246
ESubjectMaskingType mask_type
type of subject masking
Definition: blast_def.h:284
Uint1 * sequence_nomask
Start of query sequence without masking.
Definition: blast_def.h:256
Boolean seq_ranges_allocated
TRUE if memory has been allocated for seq_ranges.
Definition: blast_def.h:282
Uint1 * sequence_start_nomask
Query sequence without masking.
Definition: blast_def.h:255
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Boolean oof_sequence_allocated
TRUE if memory has been allocated for oof_sequence.
Definition: blast_def.h:261
Boolean nomask_allocated
If false the two above are just pointers to sequence and sequence_start.
Definition: blast_def.h:257
Uint1 * compressed_nuc_seq
4-to-1 compressed version of sequence
Definition: blast_def.h:263
Boolean sequence_start_allocated
TRUE if memory has been allocated for sequence_start.
Definition: blast_def.h:253
Uint1 * oof_sequence
Mixed-frame protein representation of a nucleotide sequence for out-of-frame alignment.
Definition: blast_def.h:259
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
The query related information.
BlastContextInfo * contexts
Information per context.
Int4 last_context
Index of the last element of the context array.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Boolean protein_alphabet
TRUE if alphabet_code is for a protein alphabet (e.g., ncbistdaa etc.), FALSE for nt.
Definition: blast_stat.h:178
Int2 alphabet_size
size of alphabet.
Definition: blast_stat.h:181
Uint1 alphabet_code
NCBI alphabet code.
Definition: blast_stat.h:180
Stores the letter frequency of a sequence or database.
Definition: blast_stat.h:273
double * prob
letter probs, (possible) non-zero offset.
Definition: blast_stat.h:275
Progress monitoring structure.
Definition: blast_def.h:341
EBlastStage stage
Stage of the BLAST search currently in progress.
Definition: blast_def.h:342
void * user_data
Pointer to user-provided data.
Definition: blast_def.h:344
Information about target translations.
Definition: blast_def.h:311
EBlastProgramType program_number
Program being run.
Definition: blast_def.h:312
Int4 * range
start and stop of translated sequences.
Definition: blast_def.h:317
Int4 num_frames
how many frames, one dimension of translation_buffer.
Definition: blast_def.h:316
const Uint1 * gen_code_string
Genetic code string for translation.
Definition: blast_def.h:313
BLAST_SequenceBlk * subject_blk
target sequence being translated.
Definition: blast_def.h:318
Uint1 ** translations
two dimensional array for translations.
Definition: blast_def.h:314
Boolean partial
specifies that nucleotide sequence is too long to be translated.
Definition: blast_def.h:315
A structure containing two integers, used e.g.
Definition: blast_def.h:155
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static CS_CONTEXT * context
Definition: will_convert.c:21
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri Sep 20 14:57:50 2024 by modify_doxy.py rev. 669887