NCBI C++ ToolKit
blast_setup_cxx.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  * ===========================================================================
29  */
30 
31 /// @file blast_setup_cxx.cpp
32 /// Auxiliary setup functions for Blast objects interface.
33 
34 #include <ncbi_pch.hpp>
35 #include <util/util_misc.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/metareg.hpp>
42 
44 #include <objects/seqloc/Seq_point.hpp> // needed in s_SeqLoc2MaskedSubjRanges
45 
46 #include "blast_setup.hpp"
47 
48 /** @addtogroup AlgoBlast
49  *
50  * @{
51  */
52 
55 BEGIN_SCOPE(blast)
56 
57 /** Set field values for one element of the context array of a
58  * concatenated query. All previous contexts should have already been
59  * assigned correct values.
 *
 * The offset of context @p index is chained from context index-1: it is
 * the previous offset plus (previous length + 1) when the previous context
 * is non-empty, or exactly the previous offset when the previous context
 * has length 0. Context 0 always gets offset 0. A zero @p length marks the
 * context as not valid; a non-zero length leaves is_valid untouched.
 *
 * NOTE(review): the line carrying the function name and the declaration of
 * the first parameter (used below as qinfo) is absent from this listing --
 * confirm against the original file.
 *
60  * @param qinfo Query info structure containing contexts. [in/out]
61  * @param index Index of the context to fill. [in]
62  * @param length Length of this context. [in]
63  */
64 static void
66  Uint4 index,
67  Uint4 length)
68 {
    // The context being filled may not lie beyond the last context.
69  _ASSERT(index <= static_cast<Uint4>(qinfo->last_context));
70 
71  if (index) {
    // Offset and length of the immediately preceding context.
72  Uint4 prev_loc = qinfo->contexts[index-1].query_offset;
73  Uint4 prev_len = qinfo->contexts[index-1].query_length;
74 
    // A non-empty predecessor is followed by one extra position before this
    // context starts (presumably the sentinel separating concatenated
    // sequences -- TODO confirm); an empty predecessor takes no space.
75  Uint4 shift = prev_len ? prev_len + 1 : 0;
76 
77  qinfo->contexts[index].query_offset = prev_loc + shift;
78  qinfo->contexts[index].query_length = length;
    // Zero-length contexts are flagged as not valid.
79  if (length == 0)
80  qinfo->contexts[index].is_valid = false;
81  } else {
82  // First context
83  qinfo->contexts[0].query_offset = 0;
84  qinfo->contexts[0].query_length = length;
85  if (length == 0)
86  qinfo->contexts[0].is_valid = false;
87  }
88 }
89 
90 /// Internal function to choose between the strand specified in a Seq-loc
91 /// (which specifies the query strand) and the strand obtained
92 /// from the CBlastOptions.
/// NOTE(review): the line carrying the function name and the declaration of
/// the first parameter (used below as seqloc_strand) is absent from this
/// listing -- confirm against the original file.
93 /// @param seqloc_strand strand extracted from the query Seq-loc [in]
94 /// @param program program type from the CORE's point of view [in]
95 /// @param strand_opt strand as specified by the BLAST options [in]
/// @return eNa_strand_unknown when the program's query is protein; otherwise
/// strand_opt, unless strand_opt is both/unknown, in which case
/// seqloc_strand is used. For nucleotide queries a result of
/// eNa_strand_unknown is replaced by eNa_strand_both.
96 static objects::ENa_strand
98  EBlastProgramType program,
99  objects::ENa_strand strand_opt)
100 {
    // Strand is meaningless for protein queries.
101  if (Blast_QueryIsProtein(program)) {
102  return eNa_strand_unknown;
103  }
104 
105  // Only if the strand specified by the options is NOT both or unknown,
106  // it takes precedence over what is specified by the query's strand
107  ENa_strand retval = (strand_opt == eNa_strand_both ||
108  strand_opt == eNa_strand_unknown)
109  ? seqloc_strand : strand_opt;
    // Nucleotide queries never report "unknown": fall back to both strands.
110  if (Blast_QueryIsNucleotide(program) && retval == eNa_strand_unknown) {
111  retval = eNa_strand_both;
112  }
113  return retval;
114 }
115 
// Convenience overload taking a Seq-loc: reads the strand from query_seqloc
// via GetStrand() and forwards it, together with program and strand_opt, to
// s_BlastSetup_GetStrand, returning that function's result unchanged.
// NOTE(review): the line carrying this function's return type is absent from
// this listing -- confirm against the original file.
117 BlastSetup_GetStrand(const objects::CSeq_loc& query_seqloc,
118  EBlastProgramType program,
119  objects::ENa_strand strand_opt)
120 {
121  return s_BlastSetup_GetStrand(query_seqloc.GetStrand(), program,
122  strand_opt);
123 }
124 
125 /// Adjust first context depending on the first query strand
/// Only the strand of query 0 is consulted. When the effective strand
/// (query strand combined with strand_opt by s_BlastSetup_GetStrand) is
/// minus, first_context is set to 3 for translated queries and to 1
/// otherwise; for any other strand first_context is left as it was.
/// NOTE(review): the lines carrying the function name and the declarations
/// of the leading parameters (used below as query_info and prog) are absent
/// from this listing -- confirm against the original file.
126 static void
129  ENa_strand strand_opt,
130  const IBlastQuerySource& queries)
131 {
132  _ASSERT(query_info);
133 
    // is_na is only read by the assertion below, hence the _DEBUG guard.
134 #if _DEBUG /* to eliminate compiler warning in release mode */
135  bool is_na = (prog == eBlastTypeBlastn || prog == eBlastTypeMapping)
136  ? true : false;
137 #endif
138  bool translate = Blast_QueryIsTranslated(prog) ? true : false;
139 
    // Callers are expected to invoke this only for nucleotide or translated
    // queries.
140  _ASSERT(is_na || translate);
141 
142  ENa_strand strand = s_BlastSetup_GetStrand(queries.GetStrand(0), prog,
143  strand_opt);
144  _ASSERT(strand != eNa_strand_unknown);
145 
146  // Adjust the first context if the requested strand is the minus strand
    // (3 / 1 are presumably the indices of the first minus-strand context for
    // translated / plain nucleotide queries -- TODO confirm)
147  if (strand == eNa_strand_minus) {
148  query_info->first_context = translate ? 3 : 1;
149  }
150 }
151 
152 void
155  objects::ENa_strand strand_opt,
156  BlastQueryInfo** qinfo)
157 {
158  _ASSERT(qinfo);
159  CBlastQueryInfo query_info(BlastQueryInfoNew(prog, queries.Size()));
160  if (query_info.Get() == NULL) {
161  NCBI_THROW(CBlastSystemException, eOutOfMemory, "Query info");
162  }
163 
164  const unsigned int kNumContexts = GetNumberOfContexts(prog);
165  bool is_na = (prog == eBlastTypeBlastn || prog == eBlastTypeMapping)
166  ? true : false;
167 
168  bool translate = Blast_QueryIsTranslated(prog) ? true : false;
169 
170  if (is_na || translate) {
171  s_AdjustFirstContext(query_info, prog, strand_opt, queries);
172  }
173 
174  // Set up the context offsets into the sequence that will be added
175  // to the sequence block structure.
176  unsigned int ctx_index = 0; // index into BlastQueryInfo::contexts array
177  // Longest query length, to be saved in the query info structure
178  Uint4 max_length = 0;
179  Uint4 min_length = INT4_MAX;
180 
181  for(TSeqPos j = 0; j < queries.Size(); j++) {
182  TSeqPos length = 0;
183  try { length = queries.GetLength(j); }
184  catch (const CException&) {
185  // Ignore exceptions in this function as they will be caught in
186  // SetupQueries
187  }
188 
189  ENa_strand strand = s_BlastSetup_GetStrand(queries.GetStrand(j), prog,
190  strand_opt);
191 
192  if (translate) {
193  for (unsigned int i = 0; i < kNumContexts; i++) {
194  unsigned int prot_length =
195  static_cast<unsigned int>(BLAST_GetTranslatedProteinLength(length, i));
196  max_length = MAX(max_length, prot_length);
197  min_length = MIN(min_length, prot_length);
198 
199  Uint4 ctx_len(0);
200 
201  switch (strand) {
202  case eNa_strand_plus:
203  ctx_len = (i<3) ? prot_length : 0;
204  s_QueryInfo_SetContext(query_info, ctx_index + i, ctx_len);
205  // the missing frame is present in query_info as
206  // zero-length context
207  min_length = 0;
208  break;
209 
210  case eNa_strand_minus:
211  ctx_len = (i<3) ? 0 : prot_length;
212  s_QueryInfo_SetContext(query_info, ctx_index + i, ctx_len);
213  min_length = 0;
214  break;
215 
216  case eNa_strand_both:
217  case eNa_strand_unknown:
218  s_QueryInfo_SetContext(query_info, ctx_index + i,
219  prot_length);
220  break;
221 
222  default:
223  abort();
224  }
225  }
226  } else {
227  max_length = MAX(max_length, length);
228  min_length = MIN(min_length, length);
229 
230  if (is_na) {
231  switch (strand) {
232  case eNa_strand_plus:
233  s_QueryInfo_SetContext(query_info, ctx_index, length);
234  s_QueryInfo_SetContext(query_info, ctx_index+1, 0);
235  // the missing strand is present in query_info as
236  // zero-length context
237  min_length = 0;
238  break;
239 
240  case eNa_strand_minus:
241  s_QueryInfo_SetContext(query_info, ctx_index, 0);
242  s_QueryInfo_SetContext(query_info, ctx_index+1, length);
243  min_length = 0;
244  break;
245 
246  case eNa_strand_both:
247  case eNa_strand_unknown:
248  s_QueryInfo_SetContext(query_info, ctx_index, length);
249  s_QueryInfo_SetContext(query_info, ctx_index+1, length);
250  break;
251 
252  default:
253  abort();
254  }
255  } else { // protein
256  s_QueryInfo_SetContext(query_info, ctx_index, length);
257  }
258  }
259 
260  // mark queries that have pairs (for mapping)
262  _ASSERT(!translate);
263 
264  int seg_flags = queries.GetSegmentInfo(j);
265  query_info->contexts[ctx_index].segment_flags = seg_flags;
266  query_info->contexts[ctx_index + 1].segment_flags = seg_flags;
267  }
268  ctx_index += kNumContexts;
269  }
270  query_info->max_length = max_length;
271  query_info->min_length = min_length;
272  *qinfo = query_info.Release();
273 }
274 
275 /**
276  * @brief Calculate the starting and ending contexts for a given strand
277  *
 * The contexts are split in half: the plus strand maps to
 * [0, num_contexts/2), the minus strand to [num_contexts/2, num_contexts),
 * and both strands to [0, num_contexts). Any other strand value (including
 * eNa_strand_unknown) calls abort().
 *
 * NOTE(review): the line carrying the function name and the declaration of
 * the first parameter (used below as strand) is absent from this listing --
 * confirm against the original file.
 *
278  * @param strand strand to compute contexts for [in]
279  * @param num_contexts number of contexts [in]
280  * @param start starting context [out]
281  * @param end ending context [out]
 *            (presumably exclusive: the loop in this file that takes a
 *            starting/ending context pair iterates with i < ending_context
 *            -- TODO confirm that it is fed by this function)
282  */
283 static void
285  int num_contexts,
286  int & start,
287  int & end)
288 {
    // Default to an empty range; every non-aborting case below overwrites it.
289  start = end = num_contexts;
290 
291  switch (strand) {
292  case eNa_strand_minus:
293  start = num_contexts/2;
294  end = num_contexts;
295  break;
296  case eNa_strand_plus:
297  start = 0;
298  end = num_contexts/2;
299  break;
300  case eNa_strand_both:
301  start = 0;
302  end = num_contexts;
303  break;
304  default:
305  abort();
306  }
307 }
308 
309 /**
310  * @brief Adds seqloc_frames to mask.
311  *
312  * @param prog BLAST program [in]
313  * @param mask data structure to add the mask to [in|out]
314  * @param query_index index of the query for which to add the mask [in]
315  * @param seqloc_frames mask to add [in]
316  * @param strand strand on which the mask is being added [in]
317  * @param query_length length of the query [in]
318  */
319 static void
321  BlastMaskLoc * mask,
322  int query_index,
323  CBlastQueryFilteredFrames & seqloc_frames,
324  ENa_strand strand,
325  TSeqPos query_length)
326 {
327  _ASSERT(query_index < mask->total_size);
328  unsigned num_contexts = GetNumberOfContexts(prog);
329 
331  assert(seqloc_frames.QueryHasMultipleFrames());
332 
333  int starting_context(0), ending_context(0);
334 
336  num_contexts,
337  starting_context,
338  ending_context);
339 
340  const TSeqPos dna_length = query_length;
341 
342  BlastSeqLoc** frames_seqloc =
343  & (mask->seqloc_array[query_index*num_contexts]);
344 
345  seqloc_frames.UseProteinCoords(dna_length);
346 
347  for (int i = starting_context; i < ending_context; i++) {
348  short frame = BLAST_ContextToFrame(eBlastTypeBlastx, i);
349  frames_seqloc[i] = *seqloc_frames[frame];
350  seqloc_frames.Release(frame);
351  }
352  } else if (Blast_QueryIsNucleotide(prog) &&
354 
355  int posframe = CSeqLocInfo::eFramePlus1;
356  int negframe = CSeqLocInfo::eFrameMinus1;
357 
358  switch (strand) {
359  case eNa_strand_plus:
360  mask->seqloc_array[query_index*num_contexts] =
361  *seqloc_frames[posframe];
362  seqloc_frames.Release(posframe);
363  break;
364 
365  case eNa_strand_minus:
366  mask->seqloc_array[query_index*num_contexts+1] =
367  *seqloc_frames[negframe];
368  seqloc_frames.Release(negframe);
369  break;
370 
371  case eNa_strand_both:
372  mask->seqloc_array[query_index*num_contexts] =
373  *seqloc_frames[posframe];
374 
375  mask->seqloc_array[query_index*num_contexts+1] =
376  *seqloc_frames[negframe];
377 
378  seqloc_frames.Release(posframe);
379  seqloc_frames.Release(negframe);
380  break;
381 
382  default:
383  abort();
384  }
385 
386  } else {
387  mask->seqloc_array[query_index] = *seqloc_frames[0];
388  seqloc_frames.Release(0);
389  }
390 }
391 
392 /// Restricts the masked locations in frame_to_bsl for the case when the
393 /// BLAST program requires the query to be translated into multiple frames.
/// For every frame listed by frame_to_bsl, the contexts belonging to
/// query_index are scanned for one whose frame equals it; on the first match
/// the query's Seq-loc start/stop (eExtreme_Positional) are passed to a
/// restricting call and the scan for that frame stops.
/// NOTE(review): the line carrying the function name and the declaration of
/// the first parameter (used below as frame_to_bsl), and the line that
/// begins the restricting call inside the loop, are absent from this
/// listing -- confirm against the original file.
394 /// @param frame_to_bsl query filtered frames to adjust [in|out]
395 /// @param queries all query sequences [in]
396 /// @param query_index index of the query of interest in queries [in]
397 /// @param qinfo BlastQueryInfo structure for the queries above [in]
398 static void
400  const IBlastQuerySource & queries,
401  int query_index,
402  const BlastQueryInfo * qinfo)
403 {
404  typedef set<CSeqLocInfo::ETranslationFrame> TFrameSet;
405  const TFrameSet& frames = frame_to_bsl.ListFrames();
406  const size_t kNumFrames = frame_to_bsl.GetNumFrames();
407  _ASSERT(kNumFrames != 0);
    // Contexts of this query start at kNumFrames * query_index.
408  const int first_ctx = static_cast<int>(kNumFrames) * query_index;
    // NOTE(review): last_ctx is kNumFrames * (query_index + 1), i.e. one past
    // the kNumFrames contexts that start at first_ctx, yet the loop below
    // uses an inclusive bound (ci <= last_ctx). If no context frame matches
    // before ci reaches last_ctx, contexts[last_ctx] is read (and the
    // query_index assertion is evaluated on it). Looks like an off-by-one;
    // confirm whether "<" (or last_ctx - 1) was intended.
409  const int last_ctx = static_cast<int>(kNumFrames) * (query_index + 1);
410 
411  ITERATE(TFrameSet, iter, frames) {
412  int seqloc_frame = *iter;
413  BlastSeqLoc ** bsl = frame_to_bsl[seqloc_frame];
414 
415  for(int ci = first_ctx; ci <= last_ctx; ci++) {
416  _ASSERT(qinfo->contexts[ci].query_index == query_index);
417  int context_frame = qinfo->contexts[ci].frame;
418 
419  if (context_frame == seqloc_frame) {
420  CConstRef<CSeq_loc> qseqloc = queries.GetSeqLoc(query_index);
421 
423  qseqloc->GetStart(eExtreme_Positional),
424  qseqloc->GetStop (eExtreme_Positional));
425 
    // Only the first context with a matching frame is processed.
426  break;
427  }
428  }
429  }
430 }
431 
432 /// Extract the masking locations for a single query into a
433 /// CBlastQueryFilteredFrames object and adjust the masks so that they
434 /// correspond to the range specified by the Seq-loc in queries.
435 /// @param queries all query sequences [in]
436 /// @param query_index index of the query of interest in queries [in]
437 /// @param qinfo BlastQueryInfo structure for the queries above [in]
438 /// @param program BLAST program being executed [in]
441  int query_index,
442  const BlastQueryInfo * qinfo,
443  EBlastProgramType program)
444 {
445  TMaskedQueryRegions mqr =
446  queries.GetMaskedRegions(query_index);
447 
449  (new CBlastQueryFilteredFrames(program, mqr));
450 
451  if (! frame_to_bsl->Empty()) {
452  if (frame_to_bsl->QueryHasMultipleFrames()) {
453  s_RestrictSeqLocs_Multiframe(*frame_to_bsl,
454  queries,
455  query_index,
456  qinfo);
457  } else {
458  CConstRef<CSeq_loc> qseqloc = queries.GetSeqLoc(query_index);
459  BlastSeqLoc_RestrictToInterval((*frame_to_bsl)[0],
460  qseqloc->GetStart(eExtreme_Positional),
461  qseqloc->GetStop (eExtreme_Positional));
462  }
463  }
464 
465  return frame_to_bsl;
466 }
467 
468 /// Mark the contexts corresponding to the query identified by query_index as
469 /// invalid
/// Scans contexts first_context..last_context (inclusive) and sets is_valid
/// to FALSE on each one whose query_index field equals query_index; other
/// contexts are left untouched.
/// NOTE(review): the line carrying the function name and the parameter list
/// (used below as qinfo and query_index) is absent from this listing --
/// confirm against the original file.
470 /// @param qinfo BlastQueryInfo structure to modify [in|out]
471 /// @param query_index index of the query, assumes it's in the BlastQueryInfo
472 /// structure above [in]
473 static void
475 {
476  _ASSERT(qinfo);
477  for (int i = qinfo->first_context; i <= qinfo->last_context; i++) {
478  if (qinfo->contexts[i].query_index == query_index) {
479  qinfo->contexts[i].is_valid = FALSE;
480  }
481  }
482 }
483 
484 void
486  BlastQueryInfo* qinfo,
487  BLAST_SequenceBlk** seqblk,
489  objects::ENa_strand strand_opt,
490  TSearchMessages& messages)
491 {
492  _ASSERT(seqblk);
493  _ASSERT( !queries.Empty() );
494  if (messages.size() != queries.Size()) {
495  messages.resize(queries.Size());
496  }
497 
499 
500  int buflen = QueryInfo_GetSeqBufLen(qinfo);
501  TAutoUint1Ptr buf((Uint1*) calloc(buflen+1, sizeof(Uint1)));
502 
503  if ( !buf ) {
504  NCBI_THROW(CBlastSystemException, eOutOfMemory,
505  "Query sequence buffer");
506  }
507 
508  bool is_na = (prog == eBlastTypeBlastn || prog == eBlastTypeMapping)
509  ? true : false;
510 
511  bool translate = Blast_QueryIsTranslated(prog) ? true : false;
512 
513  unsigned int ctx_index = 0; // index into context_offsets array
514  const unsigned int kNumContexts = GetNumberOfContexts(prog);
515 
516  CBlastMaskLoc mask(BlastMaskLocNew(qinfo->num_queries*kNumContexts));
517 
518  for(TSeqPos index = 0; index < queries.Size(); index++) {
520 
521  try {
522 
523  strand = s_BlastSetup_GetStrand(queries.GetStrand(index), prog,
524  strand_opt);
525  if ((is_na || translate) && strand == eNa_strand_unknown) {
526  strand = eNa_strand_both;
527  }
528 
529  CRef<CBlastQueryFilteredFrames> frame_to_bsl =
530  s_GetRestrictedBlastSeqLocs(queries, index, qinfo, prog);
531 
532  // Set the id if this is possible
533  if (const CSeq_id* id = queries.GetSeqId(index)) {
534  const string kTitle = queries.GetTitle(index);
535  string query_id = id->GetSeqIdString();
536  if (kTitle != kEmptyStr) {
537  query_id += " " + kTitle;
538  }
539  if(query_id.size() > 35) {
540  query_id = query_id.substr(0, 25) + ".. ";
541  }
542 
543  messages[index].SetQueryId(query_id);
544  }
545 
546  SBlastSequence sequence;
547 
548  if (translate) {
549  _ASSERT(strand == eNa_strand_both ||
550  strand == eNa_strand_plus ||
551  strand == eNa_strand_minus);
553 
554  const Uint4 genetic_code_id = queries.GetGeneticCodeId(index);
555  Uint1* gc = GenCodeSingletonFind(genetic_code_id);
556  if (gc == NULL) {
557  TAutoUint1ArrayPtr gc_str =
558  FindGeneticCode(genetic_code_id);
559  GenCodeSingletonAdd(genetic_code_id, gc_str.get());
560  gc = GenCodeSingletonFind(genetic_code_id);
561  _ASSERT(gc);
562  }
563 
564  // Get both strands of the original nucleotide sequence with
565  // sentinels
566  sequence = queries.GetBlastSequence(index, encoding, strand,
567  eSentinels);
568 
569  int na_length = queries.GetLength(index);
570  Uint1* seqbuf_rev = NULL; // negative strand
571  if (strand == eNa_strand_both)
572  seqbuf_rev = sequence.data.get() + na_length + 1;
573  else if (strand == eNa_strand_minus)
574  seqbuf_rev = sequence.data.get();
575 
576  // Populate the sequence buffer
577  for (unsigned int i = 0; i < kNumContexts; i++) {
578  if (qinfo->contexts[ctx_index + i].query_length <= 0) {
579  continue;
580  }
581 
582  int offset = qinfo->contexts[ctx_index + i].query_offset;
583  BLAST_GetTranslation(sequence.data.get() + 1,
584  seqbuf_rev,
585  na_length,
586  qinfo->contexts[ctx_index + i].frame,
587  & buf.get()[offset], gc);
588  }
589 
590  } else if (is_na) {
591 
592  _ASSERT(strand == eNa_strand_both ||
593  strand == eNa_strand_plus ||
594  strand == eNa_strand_minus);
595 
596  sequence = queries.GetBlastSequence(index, encoding, strand,
597  eSentinels);
598 
599  int idx = (strand == eNa_strand_minus) ?
600  ctx_index + 1 : ctx_index;
601 
602  int offset = qinfo->contexts[idx].query_offset;
603  memcpy(&buf.get()[offset], sequence.data.get(),
604  sequence.length);
605 
606  } else {
607 
608  string warnings;
609  sequence = queries.GetBlastSequence(index,
610  encoding,
612  eSentinels,
613  &warnings);
614 
615  int offset = qinfo->contexts[ctx_index].query_offset;
616  memcpy(&buf.get()[offset], sequence.data.get(),
617  sequence.length);
618  if ( !warnings.empty() ) {
619  // FIXME: is index the right value for the 2nd arg?
621  (new CSearchMessage(eBlastSevWarning, index, warnings));
622  messages[index].push_back(m);
623  }
624  }
625 
626  TSeqPos qlen = BlastQueryInfoGetQueryLength(qinfo, prog, index);
627 
628  // s_AddMask releases the elements of frame_to_bsl that it uses;
629  // the rest are freed by frame_to_bsl in the destructor.
630  s_AddMask(prog, mask, index, *frame_to_bsl, strand, qlen);
631 
632  } catch (const CException& e) {
633  // FIXME: is index the right value for the 2nd arg? Also, how
634  // to determine whether the message should contain a warning or
635  // error?
637  (new CSearchMessage(eBlastSevWarning, index, e.GetMsg()));
638  messages[index].push_back(m);
639  s_InvalidateQueryContexts(qinfo, index);
640  }
641 
642  ctx_index += kNumContexts;
643  }
644 
645  if (BlastSeqBlkNew(seqblk) < 0) {
646  NCBI_THROW(CBlastSystemException, eOutOfMemory, "Query sequence block");
647  }
648 
649  // Validate that at least one query context is valid
650  if (BlastSetup_Validate(qinfo, NULL) != 0 && messages.HasMessages()) {
651  NCBI_THROW(CBlastException, eSetup, messages.ToString());
652  }
653 
654  BlastSeqBlkSetSequence(*seqblk, buf.release(), buflen - 2);
655 
656  (*seqblk)->lcase_mask = mask.Release();
657  (*seqblk)->lcase_mask_allocated = TRUE;
658 }
659 
660 static void
662  const CSeq_loc* range,
663  Int4 total_length,
665 {
666  output.clear();
667 
668  TSeqPos offset, length;
669 
670  _ASSERT(range->IsInt() || range->IsWhole());
671 
672  if (range->IsInt()) {
673  offset = range->GetInt().GetFrom();
674  length = range->GetInt().GetTo() - offset + 1;
675  } else {
676  offset = 0;
677  length = total_length;
678  }
679 
680  if (!slp ||
681  slp->Which() == CSeq_loc::e_not_set ||
682  slp->IsEmpty() ||
683  slp->IsNull() ) {
684  return;
685  }
686 
687  _ASSERT(slp->IsInt() || slp->IsPacked_int() || slp->IsMix());
688 
689  if (slp->IsInt()) {
690  output.reserve(1);
692  p.first = (slp->GetInt().GetFrom() > offset)? slp->GetInt().GetFrom() - offset : 0;
693  p.second = MIN(slp->GetInt().GetTo() - offset, length-1);
694 
695  if (slp->GetInt().GetTo() >= offset && p.first < length) {
696  output.push_back(p);
697  }
698  } else if (slp->IsPacked_int()) {
699  output.reserve(slp->GetPacked_int().Get().size());
702  p.first = ((*itr)->GetFrom() > offset)? (*itr)->GetFrom() - offset : 0;
703  p.second = MIN((*itr)->GetTo() - offset, length-1);
704 
705  if ((*itr)->GetTo() >= offset && p.first < length) {
706  output.push_back(p);
707  }
708  }
709  } else if (slp->IsMix()) {
710  output.reserve(slp->GetMix().Get().size());
711  ITERATE(CSeq_loc_mix::Tdata, itr, slp->GetMix().Get()) {
713  if ((*itr)->IsInt()) {
714  p.first = ((*itr)->GetInt().GetFrom() > offset)? (*itr)->GetInt().GetFrom() - offset : 0;
715  p.second = MIN((*itr)->GetInt().GetTo() - offset, length-1);
716  if ((*itr)->GetInt().GetTo() >= offset && p.first < length) {
717  output.push_back(p);
718  }
719  } else if ((*itr)->IsPnt()) {
720  p.first = ((*itr)->GetPnt().GetPoint() > offset)? (*itr)->GetPnt().GetPoint() - offset : 0;
721  p.second = MIN((*itr)->GetPnt().GetPoint() - offset, length-1);
722  if ((*itr)->GetPnt().GetPoint() >= offset && p.first < length) {
723  output.push_back(p);
724  }
725  }
726  }
727  } else {
728  NCBI_THROW(CBlastException, eNotSupported, "Unsupported CSeq_loc type");
729  }
730 }
731 
// Builds one BLAST_SequenceBlk per subject sequence and appends it to
// *seqblk_vec; *max_subjlen receives the largest subjects.GetLength(i) seen
// among the subjects that were fully processed (it is reset to 0 first).
// For each subject the sequence data is fetched on the plus strand, an
// optional mask is converted to sequence ranges, and the sequence buffer is
// handed over to the block.
// NOTE(review): several lines of this function are absent from this listing
// (the name and leading parameters, the declaration of `encoding`, one term
// of the sentinel condition, the condition that opens the "skip bad subject"
// branch, the condition enclosing the genetic-code lookup, and the lines
// that fetch/store the compressed sequence and rethrow on failure). The
// comments below describe only what the visible lines establish.
732 void
735  vector<BLAST_SequenceBlk*>* seqblk_vec,
736  unsigned int* max_subjlen)
737 {
738  _ASSERT(seqblk_vec);
739  _ASSERT(max_subjlen);
740  _ASSERT(!subjects.Empty());
741 
742  // Nucleotide subject sequences are stored in ncbi2na format, but the
743  // uncompressed format (ncbi4na/blastna) is also kept to re-evaluate with
744  // the ambiguities
745  bool subj_is_na = Blast_SubjectIsNucleotide(prog) ? true : false;
746 
    // Sentinels are requested by default and switched off for the programs
    // tested below.
747  ESentinelType sentinels = eSentinels;
748  if (prog == eBlastTypeTblastn
750  || prog == eBlastTypeTblastx) {
751  sentinels = eNoSentinels;
752  }
753 
755 
756  // N.B.: strand selection is only allowed for translated subjects, this is
757  // done in the engine. For non-translated nucleotide subjects, the
758  // alignment is "fixed" in s_AdjustNegativeSubjFrameInBlastn
759 
760  *max_subjlen = 0;
761 
762  for (TSeqPos i = 0; i < subjects.Size(); i++) {
763  BLAST_SequenceBlk* subj = NULL;
764 
765  SBlastSequence sequence;
766  try {
767  if (BlastSeqBlkNew(&subj) < 0) {
768  NCBI_THROW(CBlastSystemException, eOutOfMemory, "Subject sequence block");
769  }
770  sequence =
771  subjects.GetBlastSequence(i, encoding, eNa_strand_plus, sentinels);
772  }
773  catch(CBlastException & e ) {
774  // Skip bad subject sequence
    // The block allocated above is still pushed so that seqblk_vec keeps one
    // entry per subject; a warning is logged and the rest of the iteration
    // (masking, sequence hand-over, max_subjlen update) is skipped.
776  seqblk_vec->push_back(subj);
777  string warning = kEmptyStr;
778  const CSeq_id * id = subjects.GetSeqId(i);
    // NOTE(review): `title` is never read; the title is fetched again via
    // subjects.GetTitle(i) a few lines below.
779  string title = subjects.GetTitle(i);
780  if(id != NULL) {
781  warning = id->GetSeqIdString() + " ";
782  }
783  warning += subjects.GetTitle(i);
784  if(warning != kEmptyStr){
785  warning += ": ";
786  }
787  warning += "Subject sequence contains no data";
788  ERR_POST(Warning << warning);
789  continue;
790  }
791  else {
    // Any other CBlastException: release the block and propagate.
792  subj = BlastSequenceBlkFree(subj);
793  NCBI_RETHROW_SAME(e, e.GetMsg());
794  }
795  } catch (CException & e) {
796  subj = BlastSequenceBlkFree(subj);
797  NCBI_RETHROW_SAME(e, e.GetMsg());
798  }
799 
801  const Uint4 genetic_code_id = subjects.GetGeneticCodeId(i);
802  Uint1* gc = GenCodeSingletonFind(genetic_code_id);
    // NOTE(review): this tests gc != NULL, i.e. the code is looked up and
    // re-added only when it is ALREADY registered, and gen_code_string is
    // never set when it is not. The analogous lookup in the query setup code
    // of this file tests gc == NULL before calling GenCodeSingletonAdd.
    // The condition looks inverted -- confirm intent.
803  if (gc != NULL) {
804  TAutoUint1ArrayPtr gc_str = FindGeneticCode(genetic_code_id);
805  GenCodeSingletonAdd(genetic_code_id, gc_str.get());
806  gc = GenCodeSingletonFind(genetic_code_id);
807  _ASSERT(gc);
808  subj->gen_code_string = gc; /* N.B.: not copied! */
809  }
810  }
811 
812  /* Set the lower case mask, if it exists */
813  if (subjects.GetMask(i).NotEmpty()) {
814  CConstRef<CSeq_loc> range = subjects.GetSeqLoc(i);
815  const CSeq_loc* masks = subjects.GetMask(i);
816  Int4 length = subjects.GetLength(i);
817  CSeqDB::TSequenceRanges masked_ranges;
818  _ASSERT(masks);
819  s_SeqLoc2MaskedSubjRanges(masks, &*range, length, masked_ranges);
820  if ( !masked_ranges.empty() ) {
821  /// @todo: FIXME: this is inefficient, ideally, the masks shouldn't
822  /// be copied for performance reasons...
823  /// TODO bl2seq only use soft masking?
824  subj->length = length;
    // NOTE(review): the count passed is masked_ranges.size() + 1, one more
    // than the number of ranges held by masked_ranges; the reason is not
    // visible here -- confirm against BlastSeqBlkSetSeqRanges' contract.
825  BlastSeqBlkSetSeqRanges(subj, (SSeqRange*) masked_ranges.get_data(),
826  static_cast<Uint4>(masked_ranges.size()) + 1, true, eSoftSubjMasking);
827  } else {
828  subj->num_seq_ranges = 0;
829  }
830  } else {
831  subj->num_seq_ranges = 0;
832  }
833  subj->lcase_mask = NULL; // unused for subjects
834  subj->lcase_mask_allocated = FALSE; // unused for subjects
835 
836  if (subj_is_na) {
    // Ownership of the buffer moves to the block; the reported length
    // excludes the two sentinel bytes when sentinels were requested.
837  BlastSeqBlkSetSequence(subj, sequence.data.release(),
838  ((sentinels == eSentinels) ? sequence.length - 2 :
839  sequence.length));
840 
841  try {
842  // Get the compressed sequence
843  SBlastSequence compressed_seq =
847  compressed_seq.data.release());
848  } catch (CException& e) {
849  BlastSequenceBlkFree(subj);
851  "Failed to get compressed nucleotide sequence");
852  }
853  } else {
    // Non-nucleotide subjects: 2 is always subtracted, which presumes the
    // buffer carries sentinels in this branch -- TODO confirm.
854  BlastSeqBlkSetSequence(subj, sequence.data.release(),
855  sequence.length - 2);
856  }
857 
858  seqblk_vec->push_back(subj);
859  (*max_subjlen) = MAX((*max_subjlen), subjects.GetLength(i));
860 
861  }
862 }
863 
864 /// Tests if a number represents a valid residue
865 /// @param res Value to test [in]
866 /// @return true if res is strictly less than BLASTAA_SIZE, false otherwise
867 static inline bool s_IsValidResidue(Uint1 res) { return res < BLASTAA_SIZE; }
868 
869 /// Protein sequences are always encoded in eBlastEncodingProtein and always
870 /// have sentinel bytes around sequence data
871 static SBlastSequence
872 GetSequenceProtein(IBlastSeqVector& sv, string* warnings = 0)
873 {
874  Uint1* buf = NULL; // buffer to write sequence
875  Uint1* buf_var = NULL; // temporary pointer to buffer
876  TSeqPos buflen; // length of buffer allocated
877  TSeqPos i; // loop index of original sequence
878  TAutoUint1Ptr safe_buf; // contains buf to ensure exception safety
879  vector<TSeqPos> replaced_residues; // Substituted residue positions
880  vector<TSeqPos> invalid_residues; // Invalid residue positions
881  // This is the maximum number of residues we'll write a warning about
882  static const size_t kMaxResiduesToWarnAbout = 20;
883 
886  _ASSERT(buflen != 0);
887  buf = buf_var = (Uint1*) malloc(sizeof(Uint1)*buflen);
888  if ( !buf ) {
889  NCBI_THROW(CBlastSystemException, eOutOfMemory,
890  "Failed to allocate " + NStr::IntToString(buflen) + "bytes");
891  }
892  safe_buf.reset(buf);
894  for (i = 0; i < sv.size(); i++) {
895  // Change unsupported residues to X
896  if (sv[i] == AMINOACID_TO_NCBISTDAA[(int)'O']) {
897  replaced_residues.push_back(i);
898  *buf_var++ = AMINOACID_TO_NCBISTDAA[(int)'X'];
899  } else if (!s_IsValidResidue(sv[i])) {
900  invalid_residues.push_back(i);
901  } else {
902  *buf_var++ = sv[i];
903  }
904  }
905  if (invalid_residues.size() > 0) {
906  string error("Invalid residues found at positions ");
907  error += NStr::IntToString(invalid_residues[0]);
908  for (i = 1; i < min(kMaxResiduesToWarnAbout, invalid_residues.size());
909  i++) {
910  error += ", " + NStr::IntToString(invalid_residues[i]);
911  }
912  if (invalid_residues.size() > kMaxResiduesToWarnAbout) {
913  error += ",... (only first ";
914  error += NStr::SizetToString(kMaxResiduesToWarnAbout) + " shown)";
915  }
916  NCBI_THROW(CBlastException, eInvalidCharacter, error);
917  }
918 
920  if (warnings && replaced_residues.size() > 0) {
921  *warnings += "One or more O characters replaced by X for ";
922  *warnings += "alignment score calculations at positions ";
923  *warnings += NStr::IntToString(replaced_residues[0]);
924  for (i = 1; i < min(kMaxResiduesToWarnAbout, replaced_residues.size());
925  i++) {
926  *warnings += ", " + NStr::IntToString(replaced_residues[i]);
927  }
928  if (replaced_residues.size() > kMaxResiduesToWarnAbout) {
929  *warnings += ",... (only first ";
930  *warnings += NStr::SizetToString(kMaxResiduesToWarnAbout);
931  *warnings += " shown)";
932  }
933  }
934  return SBlastSequence(safe_buf.release(), buflen);
935 }
936 
937 /**
938  * @brief Auxiliary function to retrieve plus strand in compressed (ncbi4na)
939  * format
940  *
941  * @param sv abstraction to get sequence data [in]
942  *
943  * @return requested data in compressed format
944  */
945 static SBlastSequence
947 {
950 }
951 
952 /**
953  * @brief Auxiliary function to retrieve a single strand of a nucleotide
954  * sequence.
955  *
956  * @param sv abstraction to get sequence data [in]
957  * @param encoding desired encoding for the data above [in]
958  * @param strand desired strand [in]
959  * @param sentinel use or do not use sentinel bytes [in]
960  *
961  * @return Requested strand in desired encoding with/without sentinels
962  */
963 static SBlastSequence
965  EBlastEncoding encoding,
966  objects::ENa_strand strand,
967  ESentinelType sentinel)
968 {
969  _ASSERT(strand == eNa_strand_plus || strand == eNa_strand_minus);
970 
971  Uint1* buffer = NULL; // buffer to write sequence
972  TSeqPos buflen; // length of buffer allocated
973  const TSeqPos size = sv.size(); // size of original sequence
974  TAutoUint1Ptr safe_buf; // contains buffer to ensure exception safety
975 
976  // We assume that this packs one base per byte in the requested encoding
978  buflen = CalculateSeqBufferLength(size, encoding, strand, sentinel);
979  _ASSERT(buflen != 0);
980  buffer = (Uint1*) malloc(sizeof(Uint1)*buflen);
981  if ( !buffer ) {
982  NCBI_THROW(CBlastSystemException, eOutOfMemory,
983  "Failed to allocate " + NStr::IntToString(buflen) + " bytes");
984  }
985  safe_buf.reset(buffer);
986  if (sentinel == eSentinels)
987  *buffer++ = GetSentinelByte(encoding);
988 
989  sv.GetStrandData(strand, buffer);
990  if (encoding == eBlastEncodingNucleotide) {
991  for (TSeqPos i = 0; i < size; i++) {
992  _ASSERT(sv[i] < BLASTNA_SIZE);
994  }
995  }
996  buffer += size;
997 
998  if (sentinel == eSentinels)
999  *buffer++ = GetSentinelByte(encoding);
1000 
1001  return SBlastSequence(safe_buf.release(), buflen);
1002 }
1003 
1004 /**
1005  * @brief Auxiliary function to retrieve both strands of a nucleotide sequence.
1006  *
1007  * @param sv abstraction to get sequence data [in]
1008  * @param encoding desired encoding for the data above [in]
1009  * @param sentinel use or do not use sentinel bytes [in]
1010  *
1011  * @return concatenated strands in requested encoding with sentinels as
1012  * requested
1013  */
1014 static SBlastSequence
1016  EBlastEncoding encoding,
1017  ESentinelType sentinel)
1018 {
1021  encoding,
1023  eNoSentinels);
1024 
1027  encoding,
1029  eNoSentinels);
1030 
1031  // Stitch the two together
1032  TSeqPos buflen = CalculateSeqBufferLength(sv.size(), encoding,
1033  eNa_strand_both, sentinel);
1034  Uint1* buf_ptr = (Uint1*) malloc(sizeof(Uint1) * buflen);
1035  if ( !buf_ptr ) {
1036  NCBI_THROW(CBlastSystemException, eOutOfMemory,
1037  "Failed to allocate " + NStr::IntToString(buflen) + "bytes");
1038  }
1039  SBlastSequence retval(buf_ptr, buflen);
1040 
1041  if (sentinel == eSentinels) {
1042  *buf_ptr++ = GetSentinelByte(encoding);
1043  }
1044  memcpy(buf_ptr, plus.data.get(), plus.length);
1045  buf_ptr += plus.length;
1046  if (sentinel == eSentinels) {
1047  *buf_ptr++ = GetSentinelByte(encoding);
1048  }
1049  memcpy(buf_ptr, minus.data.get(), minus.length);
1050  buf_ptr += minus.length;
1051  if (sentinel == eSentinels) {
1052  *buf_ptr++ = GetSentinelByte(encoding);
1053  }
1054 
1055  return retval;
1056 }
1057 
1058 
1061  objects::ENa_strand strand,
1062  ESentinelType sentinel,
1063  std::string* warnings)
1064 {
1065  switch (encoding) {
1066  case eBlastEncodingProtein:
1067  return GetSequenceProtein(sv, warnings);
1068 
1069  case eBlastEncodingNcbi4na:
1070  case eBlastEncodingNucleotide: // Used for nucleotide blastn queries
1071  if (strand == eNa_strand_both) {
1072  return GetSequenceNucleotideBothStrands(sv, encoding, sentinel);
1073  } else {
1075  encoding,
1076  strand,
1077  sentinel);
1078  }
1079 
1080  case eBlastEncodingNcbi2na:
1081  _ASSERT(sentinel == eNoSentinels);
1083 
1084  default:
1085  NCBI_THROW(CBlastException, eNotSupported, "Unsupported encoding");
1086  }
1087 }
1088 
1091 {
1093 
1094  switch (program) {
1095  case eBlastTypeBlastn:
1096  case eBlastTypePhiBlastn:
1097  case eBlastTypeMapping:
1098  retval = eBlastEncodingNucleotide;
1099  break;
1100 
1101  case eBlastTypeBlastp:
1102  case eBlastTypeTblastn:
1103  case eBlastTypePsiTblastn:
1104  case eBlastTypeRpsBlast:
1105  case eBlastTypePsiBlast:
1106  case eBlastTypePhiBlastp:
1107  retval = eBlastEncodingProtein;
1108  break;
1109 
1110  case eBlastTypeBlastx:
1111  case eBlastTypeTblastx:
1112  case eBlastTypeRpsTblastn:
1113  retval = eBlastEncodingNcbi4na;
1114  break;
1115 
1116  default:
1117  abort(); // should never happen
1118  }
1119 
1120  return retval;
1121 }
1122 
1125 {
1127 
1128  switch (program) {
1129  case eBlastTypeBlastn:
1130  case eBlastTypeMapping:
1131  retval = eBlastEncodingNucleotide;
1132  break;
1133 
1134  case eBlastTypeBlastp:
1135  case eBlastTypeBlastx:
1136  case eBlastTypePsiBlast:
1137  case eBlastTypePhiBlastp:
1138  retval = eBlastEncodingProtein;
1139  break;
1140 
1141  case eBlastTypeTblastn:
1142  case eBlastTypePsiTblastn:
1143  case eBlastTypeTblastx:
1144  retval = eBlastEncodingNcbi4na;
1145  break;
1146 
1147  default:
1148  abort(); // should never happen
1149  }
1150 
1151  return retval;
1152 }
1153 
1155 {
1156  _ASSERT(source.data.get());
1157 
1158  TSeqPos i; // loop index of original sequence
1159  TSeqPos ci; // loop index for compressed sequence
1160 
1161  // Allocate the return value
1165  eNoSentinels));
1166  Uint1* source_ptr = source.data.get();
1167 
1168  // Populate the compressed sequence up to the last byte
1169  for (ci = 0, i = 0; ci < retval.length-1; ci++, i+= COMPRESSION_RATIO) {
1170  Uint1 a, b, c, d;
1171  a = ((*source_ptr & NCBI2NA_MASK)<<6); ++source_ptr;
1172  b = ((*source_ptr & NCBI2NA_MASK)<<4); ++source_ptr;
1173  c = ((*source_ptr & NCBI2NA_MASK)<<2); ++source_ptr;
1174  d = ((*source_ptr & NCBI2NA_MASK)<<0); ++source_ptr;
1175  retval.data.get()[ci] = a | b | c | d;
1176  }
1177 
1178  // Set the last byte in the compressed sequence
1179  retval.data.get()[ci] = 0;
1180  for (; i < source.length; i++) {
1181  Uint1 bit_shift = 0;
1182  switch (i%COMPRESSION_RATIO) {
1183  case 0: bit_shift = 6; break;
1184  case 1: bit_shift = 4; break;
1185  case 2: bit_shift = 2; break;
1186  default: abort(); // should never happen
1187  }
1188  retval.data.get()[ci] |= ((*source_ptr & NCBI2NA_MASK)<<bit_shift);
1189  ++source_ptr;
1190  }
1191  // Set the number of bases in the last 2 bits of the last byte in the
1192  // compressed sequence
1193  retval.data.get()[ci] |= source.length%COMPRESSION_RATIO;
1194  return retval;
1195 }
1196 
1198  EBlastEncoding encoding,
1199  objects::ENa_strand strand,
1200  ESentinelType sentinel)
1202 {
1203  TSeqPos retval = 0;
1204 
1205  if (sequence_length == 0) {
1206  return retval;
1207  }
1208 
1209  switch (encoding) {
1210  // Strand and sentinels are irrelevant in this encoding.
1211  // Strand is always plus and sentinels cannot be represented
1212  case eBlastEncodingNcbi2na:
1213  _ASSERT(sentinel == eNoSentinels);
1214  _ASSERT(strand == eNa_strand_plus);
1215  retval = sequence_length / COMPRESSION_RATIO + 1;
1216  break;
1217 
1218  case eBlastEncodingNcbi4na:
1219  case eBlastEncodingNucleotide: // Used for nucleotide blastn queries
1220  if (sentinel == eSentinels) {
1221  if (strand == eNa_strand_both) {
1222  retval = sequence_length * 2;
1223  retval += 3;
1224  } else {
1225  retval = sequence_length + 2;
1226  }
1227  } else {
1228  if (strand == eNa_strand_both) {
1229  retval = sequence_length * 2;
1230  } else {
1231  retval = sequence_length;
1232  }
1233  }
1234  break;
1235 
1236  case eBlastEncodingProtein:
1237  _ASSERT(sentinel == eSentinels);
1238  _ASSERT(strand == eNa_strand_unknown);
1239  retval = sequence_length + 2;
1240  break;
1241 
1242  default:
1243  NCBI_THROW(CBlastException, eNotSupported, "Unsupported encoding");
1244  }
1245 
1246  return retval;
1247 }
1248 
1250 {
1251  switch (encoding) {
1252  case eBlastEncodingProtein:
1253  return kProtSentinel;
1254 
1255  case eBlastEncodingNcbi4na:
1257  return kNuclSentinel;
1258 
1259  default:
1260  NCBI_THROW(CBlastException, eNotSupported, "Unsupported encoding");
1261  }
1262 }
1263 
#if 0
// Not used right now, need to complete implementation
// NOTE(review): translation[0] is set to NULLB, but the residue index also
// starts at 0, so the first stored residue overwrites that leading byte.
void
BLASTGetTranslation(const Uint1* seq, const Uint1* seq_rev,
        const int nucl_length, const short frame, Uint1* translation)
{
    // Negative frames are read from the reverse strand
    const Uint1* const strand_seq = (frame < 0) ? seq_rev : seq;
    TSeqPos prot_idx = 0;   // index into protein sequence

    translation[0] = NULLB;
    for (TSeqPos nucl_idx = ABS(frame) - 1;
         nucl_idx < (TSeqPos) nucl_length - 2;
         nucl_idx += CODON_LENGTH) {
        const Uint1 residue =
            CGen_code_table::CodonToIndex(strand_seq[nucl_idx + 0],
                                          strand_seq[nucl_idx + 1],
                                          strand_seq[nucl_idx + 2]);
        if (IS_residue(residue)) {
            translation[prot_idx++] = residue;
        }
    }
    translation[prot_idx] = NULLB;
}
#endif
1287 
/** Get the path to the matrix, without the actual matrix name.
 * @param full_path path including the matrix name; the trailing
 *        matrix_name.size() characters are erased from it [in|out]
 * @param matrix_name name of matrix (e.g., BLOSUM62) [in]
 * @return copy of the truncated path made with strdup (release with free)
 */
char* s_GetCStringOfMatrixPath(string& full_path, const string& matrix_name)
{
    // Everything before the matrix name is the directory part.
    const string::size_type dir_length = full_path.size() - matrix_name.size();
    full_path.erase(dir_length);
    return strdup(full_path.c_str());
}
1300 
1301 char* BlastFindMatrixPath(const char* matrix_name, Boolean is_prot)
1302 {
1303  if (!matrix_name)
1304  return NULL;
1305 
1306  try{
1307 
1308  string mtx(matrix_name);
1309  mtx = NStr::ToUpper(mtx);
1310 
1311  // Try all the default directories
1312  string full_path = g_FindDataFile(mtx);
1313  if(!full_path.empty()){
1314  return s_GetCStringOfMatrixPath(full_path, mtx);
1315  }
1316 
1317  // Try all the default directories with original string case -RMH-
1318  full_path = g_FindDataFile(matrix_name);
1319  if(!full_path.empty()){
1320  return s_GetCStringOfMatrixPath(full_path, matrix_name);
1321  }
1322 
1323  // Try env BLASTMAT directory
1325  if (!app) {
1326  return NULL;
1327  }
1328  const string& blastmat_env = app->GetEnvironment().Get("BLASTMAT");
1329  if (CDir(blastmat_env).Exists()) {
1330  full_path = blastmat_env;
1331  full_path += CFile::GetPathSeparator();
1332  full_path += mtx;
1333  if (CFile(full_path).Exists()) {
1334  return s_GetCStringOfMatrixPath(full_path, mtx);
1335  }
1336  // Try env BLASTMAT directory with original matrix string case -RMH-
1337  full_path = blastmat_env;
1338  full_path += CFile::GetPathSeparator();
1339  full_path += matrix_name;
1340  if (CFile(full_path).Exists()) {
1341  return s_GetCStringOfMatrixPath(full_path, matrix_name);
1342  }
1343 
1344  // Try original path/nt/matrix or path/aa/matrix alternatives -RMH-
1345  full_path = blastmat_env;
1346  full_path += CFile::GetPathSeparator();
1347  full_path += is_prot ? "aa" : "nt";
1348  full_path += CFile::GetPathSeparator();
1349  full_path += mtx;
1350  if (CFile(full_path).Exists()) {
1351  return s_GetCStringOfMatrixPath(full_path, mtx);
1352  }
1353 
1354  // Allow original case to be checked. -RMH-
1355  full_path = blastmat_env;
1356  full_path += CFile::GetPathSeparator();
1357  full_path += is_prot ? "aa" : "nt";
1358  full_path += CFile::GetPathSeparator();
1359  full_path += matrix_name;
1360  if (CFile(full_path).Exists()) {
1361  return s_GetCStringOfMatrixPath(full_path, matrix_name);
1362  }
1363 
1364  }
1365 
1366  // Try local "data" directory
1367  full_path = "data";
1368  full_path += CFile::GetPathSeparator();
1369  full_path += mtx;
1370  if (CFile(full_path).Exists()) {
1371  return s_GetCStringOfMatrixPath(full_path, mtx);
1372  }
1373 
1374  // Try local "data" directory with original matrix string case -RMH-
1375  full_path = "data";
1376  full_path += CFile::GetPathSeparator();
1377  full_path += matrix_name;
1378  if (CFile(full_path).Exists()) {
1379  return s_GetCStringOfMatrixPath(full_path, mtx);
1380  }
1381 
1382  } catch (...) { } // Ignore all exceptions and return NULL.
1383 
1384  return NULL;
1385 }
1386 
1387 /// Checks if a BLAST database exists at a given file path: looks for
1388 /// an alias file first, then for an index file
1389 static bool BlastDbFileExists(string& path, bool is_prot)
1390 {
1391  // Check for alias file first
1392  string full_path = path + (is_prot ? ".pal" : ".nal");
1393  if (CFile(full_path).Exists())
1394  return true;
1395  // Check for an index file
1396  full_path = path + (is_prot ? ".pin" : ".nin");
1397  if (CFile(full_path).Exists())
1398  return true;
1399  return false;
1400 }
1401 
1402 string
1403 FindBlastDbPath(const char* dbname, bool is_prot)
1404 {
1405  string retval;
1406  string full_path; // full path to matrix file
1407 
1408  if (!dbname)
1409  return retval;
1410 
1411  string database(dbname);
1412 
1413  // Look for matrix file in local directory
1414  full_path = database;
1415  if (BlastDbFileExists(full_path, is_prot)) {
1416  return retval;
1417  }
1418 
1420  if (app) {
1421  const string& blastdb_env = app->GetEnvironment().Get("BLASTDB");
1422  if (CFile(blastdb_env).Exists()) {
1423  full_path = blastdb_env;
1424  full_path += CFile::GetPathSeparator();
1425  full_path += database;
1426  if (BlastDbFileExists(full_path, is_prot)) {
1427  retval = full_path;
1428  retval.erase(retval.size() - database.size());
1429  return retval;
1430  }
1431  }
1432  }
1433 
1434  // Obtain the matrix path from the ncbi configuration file
1435  string path;
1436  if (app) {
1437  const CNcbiRegistry& registry = app->GetConfig();
1438  if (registry.HasEntry("BLAST", "BLASTDB"))
1439  CDirEntry::NormalizePath(registry.Get("BLAST", "BLASTDB"), eFollowLinks);
1440  }
1441 
1442  full_path = CFile::MakePath(path, database);
1443  if (BlastDbFileExists(full_path, is_prot)) {
1444  retval = full_path;
1445  retval.erase(retval.size() - database.size());
1446  return retval;
1447  }
1448 
1449  return retval;
1450 }
1451 
1452 unsigned int
1454 {
1455  unsigned int retval = 0;
1456  if ( (retval = BLAST_GetNumberOfContexts(p)) == 0) {
1457  int debug_value = static_cast<int>(p);
1458  string prog_name(Blast_ProgramNameFromType(p));
1459  string msg = "Cannot get number of contexts for invalid program ";
1460  msg += "type: " + prog_name + " (" + NStr::IntToString(debug_value);
1461  msg += ")";
1462  NCBI_THROW(CBlastException, eNotSupported, msg);
1463  }
1464 
1465  return retval;
1466 }
1467 
1468 /////////////////////////////////////////////////////////////////////////////
1469 
1472  const CBlastOptions* options,
1473  BlastQueryInfo* query_info,
1474  TSearchMessages& messages)
1475 {
1476  _ASSERT(options);
1477  _ASSERT(query_info);
1478  _ASSERT( !queries.Empty() );
1479 
1480  CBLAST_SequenceBlk retval;
1481  SetupQueries_OMF(queries, query_info, &retval, options->GetProgramType(),
1482  options->GetStrandOption(), messages);
1483 
1484  return retval.Release();
1485 }
1486 
1489  const CBlastOptions* options)
1490 {
1491  _ASSERT(!queries.Empty());
1492  _ASSERT(options);
1493 
1494  CBlastQueryInfo retval;
1495  SetupQueryInfo_OMF(queries, options->GetProgramType(),
1496  options->GetStrandOption(), &retval);
1497 
1498  if (retval.Get() == NULL) {
1499  NCBI_THROW(CBlastException, eInvalidArgument,
1500  "blast::SetupQueryInfo failed");
1501  }
1502  return retval.Release();
1503 }
1504 
1505 
1507 {
1508  bool retval;
1509  switch(m_Program) {
1510  case eBlastTypeBlastx:
1511  case eBlastTypeTblastx:
1512  case eBlastTypeRpsTblastn:
1513  retval = true;
1514  break;
1515 
1516  default:
1517  retval = false;
1518  break;
1519  }
1520  return retval;
1521 }
1522 
1525  : m_Program(program)
1526 {
1528 }
1529 
1532  const TMaskedQueryRegions & mqr)
1533  : m_Program(program)
1534 {
1536 
1537  if (mqr.empty()) {
1538  return;
1539  }
1540 
1541  set<ETranslationFrame> frames;
1542  ITERATE(TMaskedQueryRegions, itr, mqr) {
1543  const CSeq_interval & intv = (**itr).GetInterval();
1544 
1545  ETranslationFrame frame =
1546  (ETranslationFrame) (**itr).GetFrame();
1547 
1548  AddSeqLoc(intv, frame);
1549  frames.insert(frame);
1550  if (Blast_QueryIsTranslated(program))
1551  {
1552  if(frame == ncbi::CSeqLocInfo::eFramePlus1)
1553  {
1554  AddSeqLoc(intv, ncbi::CSeqLocInfo::eFramePlus2);
1555  frames.insert(ncbi::CSeqLocInfo::eFramePlus2);
1556  AddSeqLoc(intv, ncbi::CSeqLocInfo::eFramePlus3);
1557  frames.insert(ncbi::CSeqLocInfo::eFramePlus3);
1558  }
1559  else if (frame == ncbi::CSeqLocInfo::eFrameMinus1)
1560  {
1561  AddSeqLoc(intv, ncbi::CSeqLocInfo::eFrameMinus2);
1562  frames.insert(ncbi::CSeqLocInfo::eFrameMinus2);
1563  AddSeqLoc(intv, ncbi::CSeqLocInfo::eFrameMinus3);
1564  frames.insert(ncbi::CSeqLocInfo::eFrameMinus3);
1565  }
1566  }
1567  }
1568 }
1569 
1571 {
1572  ITERATE(TFrameSet, iter, m_Seqlocs) {
1573  if ((*iter).second != 0) {
1574  BlastSeqLocFree((*iter).second);
1575  }
1576  }
1577 }
1578 
1580 {
1583 }
1584 
1585 // some of the logic in this function is shamelessly copied from
1586 // BlastMaskLocDNAToProtein, which should have been used instead of creating
1587 // this class (which I presume was added ignoring the former function)
1589 {
1590  if (m_TranslateCoords) {
1591  m_TranslateCoords = false;
1592  map<ETranslationFrame, int> frame_lengths;
1593  frame_lengths[CSeqLocInfo::eFramePlus1] =
1594  frame_lengths[CSeqLocInfo::eFrameMinus1] = dna_length /
1595  CODON_LENGTH;
1596  frame_lengths[CSeqLocInfo::eFramePlus2] =
1597  frame_lengths[CSeqLocInfo::eFrameMinus2] = (dna_length-1) /
1598  CODON_LENGTH;
1599  frame_lengths[CSeqLocInfo::eFramePlus3] =
1600  frame_lengths[CSeqLocInfo::eFrameMinus3] = (dna_length-2) /
1601  CODON_LENGTH;
1602 
1603  ITERATE(TFrameSet, iter, m_Seqlocs) {
1604  short frame = iter->first;
1605  BlastSeqLoc * bsl = iter->second;
1606 
1607  for (BlastSeqLoc* itr = bsl; itr; itr = itr->next) {
1608  int to(0), from(0);
1609 
1610  if (frame < 0) {
1611  from = ((int) dna_length + frame - itr->ssr->right) / CODON_LENGTH;
1612  to = ((int) dna_length + frame - itr->ssr->left) / CODON_LENGTH;
1613  } else {
1614  from = (itr->ssr->left - frame + 1) / CODON_LENGTH;
1615  to = (itr->ssr->right - frame + 1) / CODON_LENGTH;
1616  }
1617  if (from < 0)
1618  from = 0;
1619  if (to < 0)
1620  to = 0;
1621  const int kFrameLength = frame_lengths[(CSeqLocInfo::ETranslationFrame)frame];
1622  if (from >= kFrameLength)
1623  from = kFrameLength - 1;
1624  if (to >= kFrameLength)
1625  to = kFrameLength - 1;
1626 
1627  _ASSERT(from >= 0 && to >= 0);
1628  _ASSERT(from < kFrameLength && to < kFrameLength);
1629  itr->ssr->left = from;
1630  itr->ssr->right = to;
1631  }
1632  }
1633  }
1634 }
1635 
1638 {
1639  if (m_Frames.empty()) {
1640  ITERATE(TFrameSet, iter, m_Seqlocs) {
1641  if ((*iter).second != 0) {
1642  m_Frames.insert((*iter).first);
1643  }
1644  }
1645  }
1646  return m_Frames;
1647 }
1648 
1650 {
1651  return ListFrames().empty();
1652 }
1653 
1655 {
1656  bool okay = true;
1657 
1658  switch(m_Program) {
1659  case eBlastTypeBlastp:
1660  case eBlastTypeTblastn:
1661  case eBlastTypePsiTblastn:
1662  case eBlastTypeRpsBlast:
1663  case eBlastTypePsiBlast:
1664  case eBlastTypePhiBlastp:
1665  if (frame != 0) {
1666  okay = false;
1667  }
1668  break;
1669 
1670  case eBlastTypeBlastn:
1671  case eBlastTypeMapping:
1672  if ((frame != CSeqLocInfo::eFramePlus1) &&
1673  (frame != CSeqLocInfo::eFrameMinus1)) {
1674  okay = false;
1675  }
1676  break;
1677 
1678  case eBlastTypeBlastx:
1679  case eBlastTypeTblastx:
1680  case eBlastTypeRpsTblastn:
1681  switch(frame) {
1682  case 1:
1683  case 2:
1684  case 3:
1685  case -1:
1686  case -2:
1687  case -3:
1688  break;
1689 
1690  default:
1691  okay = false;
1692  }
1693  break;
1694 
1695  default:
1696  okay = false;
1697  }
1698 
1699  if (! okay) {
1700  NCBI_THROW(CBlastException, eNotSupported,
1701  "Frame and program values are incompatible.");
1702  }
1703 }
1704 
1706 {
1707  switch(m_Program) {
1708  case eBlastTypeBlastp:
1709  case eBlastTypeTblastn:
1710  case eBlastTypePsiTblastn:
1711  case eBlastTypeRpsBlast:
1712  case eBlastTypePhiBlastp:
1713  case eBlastTypePsiBlast:
1714  return false;
1715 
1716  case eBlastTypeBlastn:
1717  case eBlastTypeBlastx:
1718  case eBlastTypeTblastx:
1719  case eBlastTypeRpsTblastn:
1720  case eBlastTypeMapping:
1721  return true;
1722 
1723  default:
1724  NCBI_THROW(CBlastException, eNotSupported,
1725  "IsMulti: unsupported program");
1726  }
1727 
1728  return false;
1729 }
1730 
1731 void CBlastQueryFilteredFrames::AddSeqLoc(const objects::CSeq_interval & intv,
1732  int frame)
1733 {
1734  _ASSERT( m_Frames.empty() );
1735  if ((frame == 0) && (m_Program == eBlastTypeBlastn
1736  || m_Program == eBlastTypeMapping)) {
1737 
1740  static const CSeqLocInfo::ETranslationFrame kFrames[] = {
1742 
1743  for (size_t i = 0; i < sizeof(kFrames)/sizeof(*kFrames); i++) {
1744  m_SeqlocTails[ kFrames[i] ] =
1745  BlastSeqLocNew( (m_SeqlocTails[ kFrames[i] ]
1746  ? & m_SeqlocTails[ kFrames[i] ]
1747  : & m_Seqlocs[ kFrames[i] ]),
1748  intv.GetFrom(), intv.GetTo());
1749  }
1750 
1751  } else {
1752  x_VerifyFrame(frame);
1753 
1754  m_SeqlocTails[(ETranslationFrame) frame] =
1756  ? & m_SeqlocTails[(ETranslationFrame) frame]
1757  : & m_Seqlocs[(ETranslationFrame) frame]),
1758  intv.GetFrom(), intv.GetTo());
1759  }
1760 }
1761 
1763 {
1764  // Asking for a frame verifies that it is a valid value for the
1765  // type of search you are running.
1766 
1767  x_VerifyFrame(frame);
1768  return & m_Seqlocs[(ETranslationFrame) frame];
1769 }
1770 
1771 
1772 END_SCOPE(blast)
1774 
1775 /* @} */
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eSoftSubjMasking
Definition: blast_def.h:237
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
BlastMaskLoc * BlastMaskLocNew(Int4 total)
Allocate memory for a BlastMaskLoc.
Definition: blast_filter.c:760
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
@ eBlastSevWarning
Definition: blast_message.h:57
Declares class to encapsulate all BLAST options.
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Definition: blast_program.c:76
Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p)
Returns true if program is PHI-BLAST (i.e.
Definition: blast_program.c:70
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
Boolean Blast_SubjectIsNucleotide(EBlastProgramType p)
Returns true if the subject is nucleotide.
Definition: blast_program.c:53
Boolean Blast_QueryIsNucleotide(EBlastProgramType p)
Returns true if the query is nucleotide.
Definition: blast_program.c:43
Boolean Blast_QueryIsProtein(EBlastProgramType p)
Returns true if the query is protein.
Definition: blast_program.c:40
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypePsiTblastn
Definition: blast_program.h:83
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypeMapping
Definition: blast_program.h:88
@ eBlastTypeTblastx
Definition: blast_program.h:79
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypePhiBlastp
Definition: blast_program.h:86
@ eBlastTypeRpsBlast
Definition: blast_program.h:84
@ eBlastTypeTblastn
Definition: blast_program.h:77
@ eBlastTypeBlastp
Definition: blast_program.h:73
Boolean Blast_SubjectIsTranslated(EBlastProgramType p)
Returns true if the subject is translated.
Definition: blast_program.c:63
Uint4 QueryInfo_GetSeqBufLen(const BlastQueryInfo *qinfo)
Get the number of bytes required for the concatenated sequence buffer, given a query info structure.
Int4 BlastQueryInfoGetQueryLength(const BlastQueryInfo *qinfo, EBlastProgramType program, Int4 query_index)
Obtains the sequence length for a given query in the query, without taking into consideration any app...
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
Utilities initialize/setup BLAST.
Int2 BlastSetup_Validate(const BlastQueryInfo *query_info, const BlastScoreBlk *score_blk)
Validation function for the setup of queries for the BLAST search.
Definition: blast_setup.c:535
void BlastSeqLoc_RestrictToInterval(BlastSeqLoc **mask, Int4 from, Int4 to)
Adjusts the mask locations coordinates to a sequence interval.
Definition: blast_setup.c:1030
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Various auxiliary BLAST utility functions.
#define NCBI2NA_MASK
Bit mask for obtaining a single base from a byte in ncbi2na format.
Definition: blast_util.h:52
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
Int2 BlastSeqBlkSetSeqRanges(BLAST_SequenceBlk *seq_blk, SSeqRange *seq_ranges, Uint4 num_seq_ranges, Boolean copy_seq_ranges, ESubjectMaskingType mask_type)
Sets the seq_range and related fields appropriately in the BLAST_SequenceBlk structure.
Definition: blast_util.c:182
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Definition: blast_util.c:147
size_t BLAST_GetTranslatedProteinLength(size_t nucleotide_length, unsigned int context)
Calculates the length of frame for a translated protein.
Definition: blast_util.c:923
Int2 BlastSeqBlkSetCompressedSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence)
Stores the compressed nucleotide sequence in the sequence block structure for the subject sequence wh...
Definition: blast_util.c:167
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:839
Int4 BLAST_GetTranslation(const Uint1 *query_seq, const Uint1 *query_seq_rev, Int4 nt_length, Int2 frame, Uint1 *buffer, const Uint1 *genetic_code)
GetTranslation to get the translation of the nucl.
Definition: blast_util.c:428
#define IS_residue(x)
Does character encode a residue?
Definition: blast_util.h:48
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Definition: blast_util.c:133
unsigned int BLAST_GetNumberOfContexts(EBlastProgramType program)
Get the number of contexts for a given program.
Definition: blast_util.c:1373
ncbi::TMaskedQueryRegions mask
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:309
Defines BLAST error codes (user errors included)
Wrapper class for BlastMaskLoc .
Definition: blast_aux.hpp:354
Encapsulates ALL the BLAST algorithm's options.
Collection of BlastSeqLoc lists for filtering processing.
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Defines system exceptions occurred while running BLAST.
CDir –.
Definition: ncbifile.hpp:1695
CFile –.
Definition: ncbifile.hpp:1604
static int CodonToIndex(char base1, char base2, char base3)
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CRef –.
Definition: ncbiobj.hpp:618
Error or Warning Message from search.
Lightweight wrapper around an indexed sequence container.
Lightweight wrapper around sequence data which provides a CSeqVector-like interface to the data.
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
void erase(iterator pos)
Definition: map.hpp:167
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
#define true
Definition: bool.h:35
static SQLCHAR output[256]
Definition: print.c:5
static tds_mutex mtx
Definition: condition.c:43
int offset
Definition: replacements.h:160
Defines the interface to interact with the genetic code singleton object.
Int2 GenCodeSingletonAdd(Uint4 gen_code_id, const Uint1 *gen_code_str)
Add a genetic code entry to the singleton.
Uint1 * GenCodeSingletonFind(Uint4 gen_code_id)
Returns the genetic code string for the requested genetic code id.
TSeqPos length
Length of the buffer above (not necessarily sequence length!)
Definition: blast_setup.hpp:65
virtual TMaskedQueryRegions GetMaskedRegions(int index)=0
Return the filtered (masked) regions for a sequence.
bool Empty() const
Returns true if the container is empty, else false.
virtual CConstRef< objects::CSeq_loc > GetMask(int index)=0
Return the filtered (masked) regions for a sequence.
BlastQueryInfo * Get() const
Definition: blast_aux.hpp:311
bool HasMessages() const
Definition: blast_aux.cpp:1002
void SetupQueries_OMF(IBlastQuerySource &queries, BlastQueryInfo *qinfo, BLAST_SequenceBlk **seqblk, EBlastProgramType prog, objects::ENa_strand strand_opt, TSearchMessages &messages)
ObjMgr Free version of SetupQueries.
static CRef< CBlastQueryFilteredFrames > s_GetRestrictedBlastSeqLocs(IBlastQuerySource &queries, int query_index, const BlastQueryInfo *qinfo, EBlastProgramType program)
Extract the masking locations for a single query into a CBlastQueryFilteredFrames object and adjust t...
BlastQueryInfo * Release()
Definition: blast_aux.hpp:311
EBlastEncoding
Different types of sequence encodings for sequence retrieval from the BLAST database.
SBlastSequence GetSequence_OMF(IBlastSeqVector &sv, EBlastEncoding encoding, objects::ENa_strand strand, ESentinelType sentinel, std::string *warnings=0)
Object manager free version of GetSequence.
static void s_AdjustFirstContext(BlastQueryInfo *query_info, EBlastProgramType prog, ENa_strand strand_opt, const IBlastQuerySource &queries)
Adjust first context depending on the first query strand.
virtual Uint4 GetGeneticCodeId(int index) const =0
Retrieve the genetic code associated with a sequence.
void SetupSubjects_OMF(IBlastQuerySource &subjects, EBlastProgramType program, vector< BLAST_SequenceBlk * > *seqblk_vec, unsigned int *max_subjlen)
Object manager free version of SetupSubjects.
bool m_TranslateCoords
True if this object's masked regions store DNA coordinates that will later be translated into protein...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
void AddSeqLoc(const objects::CSeq_interval &intv, int frame)
Add a masked interval to the specified frame.
EBlastEncoding GetQueryEncoding(EBlastProgramType program)
Returns the encoding for the sequence data used in BLAST for the query.
TAutoUint1ArrayPtr FindGeneticCode(int genetic_code)
Retrieves the requested genetic code in Ncbistdaa format.
Definition: blast_aux.cpp:588
TSeqPos CalculateSeqBufferLength(TSeqPos sequence_length, EBlastEncoding encoding, objects::ENa_strand strand=objects::eNa_strand_unknown, ESentinelType sentinel=eSentinels) THROWS((CBlastException))
Calculates the length of the buffer to allocate given the desired encoding, strand (if applicable) an...
EBlastProgramType m_Program
The type of search being done.
static SBlastSequence GetSequenceCompressedNucleotide(IBlastSeqVector &sv)
Auxiliary function to retrieve plus strand in compressed (ncbi4na) format.
static bool s_IsValidResidue(Uint1 res)
Tests if a number represents a valid residue.
objects::ENa_strand GetStrandOption() const
static objects::ENa_strand s_BlastSetup_GetStrand(objects::ENa_strand seqloc_strand, EBlastProgramType program, objects::ENa_strand strand_opt)
Internal function to choose between the strand specified in a Seq-loc (which specified the query stra...
static void s_SeqLoc2MaskedSubjRanges(const CSeq_loc *slp, const CSeq_loc *range, Int4 total_length, CSeqDB::TSequenceRanges &output)
bool x_NeedsTrans()
Returns true if this program needs coordinate translation.
char * s_GetCStringOfMatrixPath(string &full_path, const string &matrix_name)
Get the path to the matrix, without the actual matrix name.
static SBlastSequence GetSequenceProtein(IBlastSeqVector &sv, string *warnings=0)
Protein sequences are always encoded in eBlastEncodingProtein and always have sentinel bytes around s...
virtual CConstRef< objects::CSeq_loc > GetSeqLoc(int index) const =0
Return the CSeq_loc associated with a sequence.
bool Empty()
Returns true if this object contains no masking information.
static bool BlastDbFileExists(string &path, bool is_prot)
Checks if a BLAST database exists at a given file path: looks for an alias file first,...
virtual const objects::CSeq_id * GetSeqId(int index) const =0
Return the sequence identifier associated with a sequence.
set< ETranslationFrame > m_Frames
Frames for masked locations.
static SBlastSequence GetSequenceSingleNucleotideStrand(IBlastSeqVector &sv, EBlastEncoding encoding, objects::ENa_strand strand, ESentinelType sentinel)
Auxiliary function to retrieve a single strand of a nucleotide sequence.
virtual string GetTitle(int index) const =0
Return the title of a sequence.
const Uint1 NCBI4NA_TO_BLASTNA[]
Translates between ncbi4na and blastna.
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
bool QueryHasMultipleFrames() const
Check whether the query is multiframe for this type of search.
TAutoUint1Ptr data
Sequence data.
Definition: blast_setup.hpp:64
virtual SBlastSequence GetCompressedPlusStrand()=0
Returns the compressed nucleotide data for the plus strand, still occupying one base per byte.
ESentinelType
Allows specification of whether sentinel bytes should be used or not.
Definition: blast_setup.hpp:93
objects::ENa_strand BlastSetup_GetStrand(const objects::CSeq_loc &query_seqloc, EBlastProgramType program, objects::ENa_strand strand_option)
Choose between a Seq-loc specified query strand and the strand obtained from the CBlastOptions.
CSeqLocInfo::ETranslationFrame ETranslationFrame
Data type for frame value, however inputs to methods use "int" instead of this type for readability a...
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
string ToString() const
Converts messages to a string, which is returned.
Definition: blast_aux.cpp:1013
size_t GetNumFrames() const
virtual void GetStrandData(objects::ENa_strand strand, unsigned char *buf)
Retrieve strand data in one chunk.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
virtual objects::ENa_strand GetStrand(int index) const =0
Return strand for a sequence.
#define BLASTNA_SIZE
Size of nucleic acid alphabet.
unsigned int GetNumberOfContexts(EBlastProgramType p)
Returns the number of contexts for a given BLAST program.
const Uint1 kProtSentinel
Sentinel byte for protein sequences.
const set< ETranslationFrame > & ListFrames()
Returns the list of frame values for which this object contains masking information.
static SBlastSequence GetSequenceNucleotideBothStrands(IBlastSeqVector &sv, EBlastEncoding encoding, ESentinelType sentinel)
Auxiliary function to retrieve both strands of a nucleotide sequence.
virtual TSeqPos GetLength(int index) const =0
Return the length of a sequence.
virtual SBlastSequence GetBlastSequence(int index, EBlastEncoding encoding, objects::ENa_strand strand, ESentinelType sentinel, std::string *warnings=0) const =0
Return the sequence data for a sequence.
void UseProteinCoords(TSeqPos dna_length)
Adjusts all stored masks from nucleotide to protein offsets.
const Uint1 kNuclSentinel
Sentinel nibble for nucleotide sequences.
string FindBlastDbPath(const char *dbname, bool is_prot)
Returns the path (including a trailing path separator) to the location where the BLAST database can b...
virtual void SetCoding(objects::CSeq_data::E_Choice coding)=0
Sets the encoding for the sequence data.
EBlastEncoding GetSubjectEncoding(EBlastProgramType program)
Returns the encoding for the sequence data used in BLAST2Sequences for the subject.
string Blast_ProgramNameFromType(EBlastProgramType program)
Returns a string program name, given a blast::EBlastProgramType enumeration.
Definition: blast_aux.cpp:813
BlastSeqLoc ** operator[](int frame)
Access the BlastSeqLocs for a given frame.
CBlastQueryFilteredFrames(EBlastProgramType program)
Construct container for frame values and BlastSeqLocs for the specified search program.
static void s_InvalidateQueryContexts(BlastQueryInfo *qinfo, int query_index)
Mark the contexts corresponding to the query identified by query_index as invalid.
SBlastSequence CompressNcbi2na(const SBlastSequence &source)
Compresses the sequence data passed in to the function from 1 base per byte to 4 bases per byte.
BlastQueryInfo * SafeSetupQueryInfo(const IBlastQuerySource &queries, const CBlastOptions *options)
Wrapper around SetupQueryInfo.
void Release(int frame)
Release the BlastSeqLocs for a given frame.
virtual TSeqPos Size() const =0
Return the number of elements in the sequence container.
~CBlastQueryFilteredFrames()
Destructor; frees any BlastSeqLoc lists not released by the caller.
static void s_QueryInfo_SetContext(BlastQueryInfo *qinfo, Uint4 index, Uint4 length)
Set field values for one element of the context array of a concatenated query.
static void s_ComputeStartEndContexts(ENa_strand strand, int num_contexts, int &start, int &end)
Calculate the starting and ending contexts for a given strand.
void SetupQueryInfo_OMF(const IBlastQuerySource &queries, EBlastProgramType prog, objects::ENa_strand strand_opt, BlastQueryInfo **qinfo)
ObjMgr Free version of SetupQueryInfo.
virtual int GetSegmentInfo(int index) const =0
Get segment information (for mapping paired short reads)
TFrameSet m_Seqlocs
Frame and BlastSeqLoc* data.
BLAST_SequenceBlk * SafeSetupQueries(IBlastQuerySource &queries, const CBlastOptions *options, BlastQueryInfo *query_info, TSearchMessages &messages)
Wrapper around SetupQueries.
TSeqPos size() const
Returns the length of the sequence data (in the case of nucleotides, only one strand)
void x_VerifyFrame(int frame)
Verify the specified frame value.
Uint1 GetSentinelByte(EBlastEncoding encoding) THROWS((CBlastException))
Convenience function to centralize the knowledge of which sentinel bytes we use for supported encodin...
BLAST_SequenceBlk * Release()
Definition: blast_aux.hpp:309
static void s_RestrictSeqLocs_Multiframe(CBlastQueryFilteredFrames &frame_to_bsl, const IBlastQuerySource &queries, int query_index, const BlastQueryInfo *qinfo)
Restricts the masked locations in frame_to_bsl for the case when the BLAST program requires the query...
static void s_AddMask(EBlastProgramType prog, BlastMaskLoc *mask, int query_index, CBlastQueryFilteredFrames &seqloc_frames, ENa_strand strand, TSeqPos query_length)
Adds seqloc_frames to mask.
TFrameSet m_SeqlocTails
Frame and tail of BlastSeqLoc* linked list (to speed up appending)
@ eBlastEncodingNcbi4na
NCBI4na.
@ eBlastEncodingProtein
NCBIstdaa.
@ eBlastEncodingError
Error value for encoding.
@ eBlastEncodingNucleotide
Special encoding for preliminary stage of BLAST: permutation of NCBI4na.
@ eBlastEncodingNcbi2na
NCBI2na.
@ eInvalidArgument
Invalid argument to some function/method (could be programmer error - prefer assertions in those case...
@ eNoSentinels
Do not use sentinel bytes.
Definition: blast_setup.hpp:95
@ eSentinels
Use sentinel bytes.
Definition: blast_setup.hpp:94
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
element_type * release(void)
Release will release ownership of pointer to caller.
Definition: ncbimisc.hpp:472
@ eFollowLinks
Follow symbolic links.
Definition: ncbimisc.hpp:145
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
const string & Get(const string &name, bool *found=NULL) const
Get environment value by name.
Definition: ncbienv.cpp:109
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
Definition: ncbiexpt.hpp:749
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define THROWS(x)
Definition: ncbiexpt.hpp:75
static string NormalizePath(const string &path, EFollowLinks follow_links=eIgnoreLinks)
Normalize a path.
Definition: ncbifile.cpp:820
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
const float pi
Definition: math.hpp:54
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
virtual bool HasEntry(const string &section, const string &name=kEmptyStr, TFlags flags=0) const
Definition: ncbireg.cpp:290
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
list< CRef< CSeq_interval > > Tdata
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const Tdata & Get(void) const
Get the member data.
TFrom GetFrom(void) const
Get the From member data.
list< CRef< CSeq_loc > > Tdata
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
const Tdata & Get(void) const
Get the member data.
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
TTo GetTo(void) const
Get the To member data.
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
@ e_not_set
No variant selected.
Definition: Seq_loc_.hpp:97
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
use only Cassandra database for the lookups; yes: do not use tables BIOSEQ_INFO and BLOB_PROP in the Cassandra database
char * buf
int i
static char * prog
Definition: mdb_load.c:33
CMetaRegistry: Singleton class for loading CRegistry data from files; keeps track of what it loaded f...
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
Definition: pointer.h:1149
#define strdup
Definition: ncbi_ansi_ext.h:70
unsigned int a
Definition: ncbi_localip.c:102
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define INT4_MAX
largest number represented by signed int
Definition: ncbi_std.h:141
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define ABS(a)
returns absolute value of a (|a|)
Definition: ncbi_std.h:122
#define NULLB
terminating byte of a char* string.
Definition: ncbi_std.h:181
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
T minus(T x_)
T plus(T x_)
T min(T x_, T y_)
void abort()
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Defines BLAST database access classes.
#define assert(x)
Definition: srv_diag.hpp:58
Structure to hold a sequence.
Definition: blast_def.h:242
Uint4 num_seq_ranges
Number of elements in seq_ranges.
Definition: blast_def.h:281
BlastMaskLoc * lcase_mask
Locations to be masked from operations on this sequence: lookup table for query; scanning for subject...
Definition: blast_def.h:265
Boolean lcase_mask_allocated
TRUE if memory has been allocated for lcase_mask.
Definition: blast_def.h:268
Int4 length
Length of sequence.
Definition: blast_def.h:246
Uint1 * gen_code_string
for nucleotide subject sequences (tblast[nx]), the genetic code used to create a translated protein s...
Definition: blast_def.h:272
Int4 query_length
Length of this query, strand or frame.
Boolean is_valid
Determine if this context is valid or not.
Int4 segment_flags
Flags describing segments for paired reads.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int4 query_index
Index of query (same for all frames)
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
Structure for keeping the query masking information.
Definition: blast_def.h:210
The query related information.
Int4 first_context
Index of the first element of the context array.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
Uint4 min_length
Length of the shortest among the concatenated queries.
Int4 last_context
Index of the last element of the context array.
Uint4 max_length
Length of the longest among the concatenated queries.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Structure to represent a range.
Definition: seqdb.hpp:225
List of sequence offset ranges.
Definition: seqdb.hpp:236
bool empty() const
Definition: seqdb.hpp:272
size_type size() const
Definition: seqdb.hpp:274
value_type * get_data() const
Definition: seqdb.hpp:282
Structure to store sequence data and its length for use in the CORE of BLAST (it's a malloc'ed array ...
Definition: blast_setup.hpp:62
A structure containing two integers, used e.g.
Definition: blast_def.h:155
#define _ASSERT
static const string kTitle
CTraceGlyph inline method implementation.
string g_FindDataFile(const CTempString &name, CDirEntry::EType type=CDirEntry::eFile)
Look for an NCBI application data file or directory of the given name and type; in general,...
Definition: util_misc.cpp:139
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Wed May 01 14:23:33 2024 by modify_doxy.py rev. 669887