NCBI C++ ToolKit
blastxml_format.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastxml_format.cpp 100362 2023-07-24 18:54:39Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Ilya Dondoshansky
27 *
28 * ===========================================================================
29 */
30 
31 /// @file blastxml_format.cpp
32 /// Formatting of BLAST results in XML form, using the BLAST XML specification.
33 #include <ncbi_pch.hpp>
36 #include <objmgr/util/sequence.hpp>
38 
42 
47 
48 #include <serial/objostrxml.hpp>
49 
51 
52 #include <algorithm>
53 
56 USING_SCOPE(blast);
57 USING_SCOPE(align_format);
58 
// File-scope masked-query-regions object.
// NOTE(review): nothing in the visible part of this file reads or writes it;
// confirm whether another translation unit depends on it before removing.
59 ncbi::TMaskedQueryRegions mask;
60 
61 
62 // helper function: serialize given object (could be partially initialized)
63 // to string buffer and return it in two parts before and after given tag.
64 // object to serialize
65 // tag to devide by, often "</TAG_NAME>"
66 // start_part beginning of a serialized data before tag
67 // end_part end of a serialized data starting from tag
68 // add_reference_dtd boolen flag, if true - print a DOCTYPE DTD reference
69 // add_xml_versioni boolena flag, if true prin "xml version" open priabula
70 static bool s_SerializeAndSplitBy(const CSerialObject &object,
71  const char *tag,
72  string &start_part,
73  string &end_part,
74  bool add_reference_dtdi = false,
75  bool add_xml_versioni = false );
76 
77 
/// Computes the translation frame for one side of an alignment.
/// @param plus_strand True if the position is on the forward strand [in]
/// @param start Starting position, in 1-offset coordinates [in]
/// @param end Ending position, in 1-offset coordinates [in]
/// @param seq_length Total length of the sequence [in]
/// @return (start - 1) % 3 + 1 on the plus strand, otherwise
///         -((seq_length - end) % 3 + 1).
static int
s_GetTranslationFrame(bool plus_strand, int start, int end, int seq_length)
{
    // The frame is derived from an offset measured from the sequence start on
    // the plus strand and from the sequence end on the minus strand.
    const int offset = plus_strand ? (start - 1) : (seq_length - end);
    const int magnitude = offset % 3 + 1;
    return plus_strand ? magnitude : -magnitude;
}
98 
99 /// Creates a list of CHsp structures for the XML output, given a list of
100 /// Seq-aligns.
101 /// @param xhsp_list List of CHsp's to populate [in] [out]
102 /// @param alnset Set of alignments to get data from [in]
103 /// @param scope Scope for retrieving sequences [in]
104 /// @param matrix 256x256 matrix for calculating positives for a protein search.
105 /// NULL is passed for a nucleotide search.
106 /// @param mask_info Masking locations [in]
107 static void
109  const CSeq_align_set& alnset, CScope* scope,
110  const CBlastFormattingMatrix* matrix,
111  const ncbi::TMaskedQueryRegions* mask_info,
112  int master_gentic_code, int slave_genetic_code)
113 {
114  int index = 1;
115  ITERATE(CSeq_align_set::Tdata, iter, alnset.Get()) {
116  CRef<CHsp> xhsp(new CHsp());
117  const CSeq_align& kAlign = *(*iter);
118  xhsp->SetNum(index);
119  ++index;
120  bool query_is_na, subject_is_na;
121  int query_length, subject_length;
122 
123  int score, num_ident;
124  double bit_score;
125  double evalue;
126  int sum_n;
127  list<TGi> use_this_gi;
128  CBlastFormatUtil::GetAlnScores(kAlign, score, bit_score, evalue, sum_n,
129  num_ident, use_this_gi);
130 
131  //Print 6 significant digits for double values
132  char tmp[512];
133  sprintf(tmp,"%.*g", 6, bit_score );
134  bit_score = atof(tmp);
135  sprintf(tmp,"%.*g", 6, evalue );
136  evalue = atof(tmp);
137 
138  xhsp->SetBit_score(bit_score);
139  xhsp->SetScore(score);
140  xhsp->SetEvalue(evalue);
141 
142  // Extract the full list of subject ids
143  try {
144  const CBioseq_Handle& kQueryBioseqHandle =
145  scope->GetBioseqHandle(kAlign.GetSeq_id(0));
146  query_is_na = kQueryBioseqHandle.IsNa();
147  query_length = kQueryBioseqHandle.GetBioseqLength();
148  const CBioseq_Handle& kSubjBioseqHandle =
149  scope->GetBioseqHandle(kAlign.GetSeq_id(1));
150  subject_is_na = kSubjBioseqHandle.IsNa();
151  subject_length = kSubjBioseqHandle.GetBioseqLength();
152  } catch (const CException&) {
153  // Either query or subject sequence not found - the remaining
154  // information cannot be correctly filled. Add this HSP as is
155  // and continue.
156  xhsp->SetQuery_from(0);
157  xhsp->SetQuery_to(0);
158  xhsp->SetHit_from(0);
159  xhsp->SetHit_to(0);
160  xhsp->SetIdentity(num_ident); // This may be inaccurate when
161  // alignment contains filtered regions.
162  xhsp->SetQseq(NcbiEmptyString);
163  xhsp->SetHseq(NcbiEmptyString);
164  xhsp_list.push_back(xhsp);
165  continue;
166  }
167 
168  CRef<CSeq_align> final_aln(0);
169 
170  // Convert Std-seg and Dense-diag alignments to Dense-seg.
171  // Std-segs are produced only for translated searches; Dense-diags only
172  // for ungapped, not translated searches.
173  const bool kTranslated = kAlign.GetSegs().IsStd();
174  if (kTranslated) {
175  CRef<CSeq_align> densegAln = kAlign.CreateDensegFromStdseg();
176  // When both query and subject are translated, i.e. tblastx, convert
177  // to a special type of Dense-seg.
178  if (query_is_na && subject_is_na)
179  final_aln = densegAln->CreateTranslatedDensegFromNADenseg();
180  else
181  final_aln = densegAln;
182  } else if (kAlign.GetSegs().IsDendiag()) {
183  final_aln = CBlastFormatUtil::CreateDensegFromDendiag(kAlign);
184  }
185 
186  const CDense_seg& kDenseg = (final_aln ? final_aln->GetSegs().GetDenseg() :
187  kAlign.GetSegs().GetDenseg());
188 
189 
190 
191 
192  // Do not trust the identities count in the Seq-align, because if masking
193  // was used, then masked residues were not counted as identities.
194  // Hence retrieve the sequences present in the alignment and count the
195  // identities again.
196  string query_seq;
197  string subject_seq;
198  string middle_seq;
199  string masked_query_seq;
200 
201  // For blastn search, the matches are shown as '|', and mismatches as
202  // ' '; For all other searches matches are shown as matched characters,
203  // mismatches as ' ', and positives as '+'.
204  // This is a blastn search if and only if both query and subject are
205  // nucleotide, and it is not a translated search.
206  const bool kIsBlastn =
207  (query_is_na && subject_is_na && !kTranslated);
208 
209  const CDense_seg * ds_pt = &kDenseg;
210  CRef<CDense_seg> reversed_ds;
211  // For non-transalted reverse strand alignments, show plus strand on
212  // query and minus strand on subject. To accomplish this, Dense-seg must
213  // be reversed.
214  if (!kTranslated && kDenseg.IsSetStrands() &&
215  kDenseg.GetStrands().front() == eNa_strand_minus)
216  {
217  reversed_ds.Reset(new CDense_seg);
218  reversed_ds->Assign(kDenseg);
219  reversed_ds->Reverse();
220  ds_pt = &(*reversed_ds);
221  }
222 
223  int q_start, q_end, s_start, s_end, q_frame=0, s_frame=0;
224 
225  unsigned int num_gaps = 0;
226  int align_length = 0;
227 
228  if (kAlign.GetSegs().IsDendiag())
229  {
230  align_length = final_aln->GetAlignLength();
231  q_start = final_aln->GetSeqStart(0) + 1;
232  q_end = final_aln->GetSeqStop(0) + 1;
233  s_start = final_aln->GetSeqStart(1) + 1;
234  s_end = final_aln->GetSeqStop(1) + 1;
235  }
236  else
237  {
238  if(!kTranslated)
239  {
240  num_gaps = kAlign.GetTotalGapCount();
241  align_length = kAlign.GetAlignLength();
242  }
243  q_start = kAlign.GetSeqStart(0) + 1;
244  q_end = kAlign.GetSeqStop(0) + 1;
245  s_start = kAlign.GetSeqStart(1) + 1;
246  s_end = kAlign.GetSeqStop(1) + 1;
247  }
248 
249  if (!kTranslated && query_is_na && subject_is_na) {
250  q_frame = s_frame = 1;
251  // For reverse strand alignment, set subject frame to -1 and
252  // swap start and end coordinates.
253  if (eNa_strand_minus == kAlign.GetSeqStrand(0)){
254  s_frame = -1;
255  int tmp = s_start;
256  s_start = s_end;
257  s_end = tmp;
258  }
259  } else if (kTranslated) {
260  align_length = final_aln->GetAlignLength();
261  num_gaps = final_aln->GetTotalGapCount();
262 
263  if (query_is_na)
264  q_frame = s_GetTranslationFrame(eNa_strand_minus != final_aln->GetSeqStrand(0),
265  q_start, q_end, query_length);
266  if (subject_is_na)
267  s_frame = s_GetTranslationFrame(eNa_strand_minus != final_aln->GetSeqStrand(1),
268  s_start, s_end, subject_length);
269  }
270 
271  xhsp->SetQuery_frame(q_frame);
272  xhsp->SetHit_frame(s_frame);
273 
274  xhsp->SetQuery_from(q_start);
275  xhsp->SetQuery_to(q_end);
276  xhsp->SetHit_from(s_start);
277  xhsp->SetHit_to(s_end);
278 
279  if (mask_info)
280  {
283 
285  masked_query_seq,
286  subject_seq,
287  *ds_pt,
288  *scope,
289  master_gentic_code,
290  slave_genetic_code,
291  *mask_info,
292  kMaskCharOpt,
293  q_frame);
294  }
295  else
296  {
298  subject_seq,
299  *ds_pt,
300  *scope,
301  master_gentic_code,
302  slave_genetic_code);
303  }
304 
305  num_ident = 0;
306  int num_positives = 0;
307  middle_seq = query_seq;
308  // The query and subject sequence strings must be the same size in a
309  // correct alignment, but if alignment extends beyond the end of sequence
310  // because of a bug, one of the sequence strings may be truncated, hence
311  // it is necessary to take a minimum here.
312  // FIXME: Should an exception be thrown instead?
313  const unsigned int kMaxOffset = static_cast<unsigned int>(min(query_seq.size(),
314  subject_seq.size()));
315  for (unsigned int i = 0; i < kMaxOffset; ++i) {
316  if (query_seq[i] == subject_seq[i]) {
317  ++num_ident;
318  ++num_positives;
319  if (kIsBlastn)
320  middle_seq[i] = '|';
321  } else if (matrix &&
322  (*matrix)(query_seq[i], subject_seq[i]) > 0 &&
323  !kIsBlastn) {
324  ++num_positives;
325  middle_seq[i] = kIsBlastn ? ' ' : '+';
326  } else {
327  middle_seq[i] = ' ';
328  }
329  }
330 
331  xhsp->SetIdentity(num_ident);
332  xhsp->SetGaps(num_gaps);
333  xhsp->SetAlign_len(align_length);
334 
335  if (mask_info)
336  xhsp->SetQseq(masked_query_seq);
337  else
338  xhsp->SetQseq(query_seq);
339  xhsp->SetHseq(subject_seq);
340  xhsp->SetMidline(middle_seq);
341  xhsp->SetPositive(num_positives);
342 
343 
344  xhsp_list.push_back(xhsp);
345  }
346 }
347 
348 /// Fill the CHit object in BLAST XML output, given an alignment and other
349 /// information.
350 /// @param hit CHit object to fill [in] [out]
351 /// @param align_in Sequence alignment [in]
352 /// @param scope Scope for retrieving sequences [in]
353 /// @param matrix ASCII-alphabet matrix for calculation of positives [in]
354 /// @param mask_info List of masking locations [in]
355 /// @param ungapped Is this an ungapped search? [in]
356 static void
357 s_SeqAlignToXMLHit(CRef<CHit>& hit, const CSeq_align& align_in, CScope* scope,
358  const CBlastFormattingMatrix* matrix,
359  const ncbi::TMaskedQueryRegions* mask_info,
360  bool ungapped, int master_gentice_code, int slave_genetic_code)
361 {
362  _ASSERT(align_in.GetSegs().IsDisc());
363  const CSeq_align_set& kAlignSet = align_in.GetSegs().GetDisc();
364 
365  // Check if the list is empty. Then there is nothing to fill.
366  if (kAlignSet.Get().empty())
367  return;
368 
369  // Create the new CHit object.
370  hit.Reset(new CHit());
371 
372  const CSeq_id& kSeqId = kAlignSet.Get().front()->GetSeq_id(1);
373 
374  try {
375  const CBioseq_Handle& kSubjBioseqHandle = scope->GetBioseqHandle(kSeqId);
376  /// @todo FIXME Should this be passed somehow? For now the following
377  /// list is empty.
378  list<TGi> use_this_gi;
379  string seqid;
380  string defline;
381  /// @todo FIXME Should the "show gi" option be passed to the XML
382  /// formatter? At this time gis are shown unconditionally.
384  use_this_gi, seqid,
385  defline, true);
386  if (defline == NcbiEmptyString)
387  defline = "No definition line";
388 
389  hit->SetId(seqid);
390  hit->SetDef(defline);
391 
392  // Find the "best" Seq-id, and retrieve accession (without version).
393  CSeq_id_Handle idh =
394  sequence::GetId(kSubjBioseqHandle, sequence::eGetId_Best);
395  string accession = CAlignFormatUtil::GetLabel(idh.GetSeqId());
396  hit->SetAccession(accession);
397 
398  int length = sequence::GetLength(kSeqId, scope);
399  hit->SetLen(length);
400  } catch (const CException&) {
401  // If Bioseq handle didn't return some of the information, and not all
402  // mandatory couldn't be filled, skip this hit completely.
403  //hit.Reset(NULL);
404  hit->SetId(kSeqId.AsFastaString());
405  hit->SetDef("Unknown");
406  hit->SetAccession("Unknown");
407  hit->SetLen(0);
408  };
409 
410  // For ungapped search, multiple HSPs, possibly from different strands,
411  // are packed into a single Seq-align.
412  // The C++ utility functions cannot deal with such Seq-aligns, as they
413  // expect one Seq-align per alignment (HSP). Hence we need to expand the
414  // Seq-align-set obtained for an ungapped search.
415  if (ungapped) {
416  CRef<CSeq_align_set> expanded_align_set =
418 
419  s_SeqAlignSetToXMLHsps(hit->SetHsps(), *expanded_align_set, scope,
420  matrix, mask_info, master_gentice_code, slave_genetic_code);
421  } else {
422  s_SeqAlignSetToXMLHsps(hit->SetHsps(), kAlignSet, scope, matrix,
423  mask_info, master_gentice_code, slave_genetic_code);
424  }
425 }
426 
427 /// Retrieves subject Seq-id from a Seq-align
428 /// @param align Seq-align object [in]
429 /// @return Subject Seq-id for this Seq-align.
430 static const CSeq_id*
432 {
433  if (align.GetSegs().IsDenseg()) {
434  return align.GetSegs().GetDenseg().GetIds()[1];
435  } else if (align.GetSegs().IsDendiag()) {
436  return align.GetSegs().GetDendiag().front()->GetIds()[1];
437  } else if (align.GetSegs().IsStd()) {
438  return align.GetSegs().GetStd().front()->GetIds()[1];
439  }
440 
441  return NULL;
442 }
443 
444 /// Fills the list of CHit objects, given a list of Seq-aligns.
445 /// @param hits List of CHit objects to fill [in] [out]
446 /// @param alnset Seq-align-set object containing a list of sequence
447 /// alignments. [in]
448 /// @param scope Scope for retrieving sequences. [in]
449 /// @param matrix ASCII-alphabet matrix for calculation of positives. [in]
450 /// @param mask_info List of masking locations. [in]
451 /// @param ungapped Is this an ungapped search? [in]
452 static void
453 s_SeqAlignSetToXMLHits(list <CRef<CHit> >& hits, const CSeq_align_set& alnset,
454  CScope* scope, const CBlastFormattingMatrix* matrix,
455  const ncbi::TMaskedQueryRegions* mask_info,
456  bool ungapped, int master_gentice_code, int slave_genetic_code,
457  CNcbiOstream *out_stream)
458 {
459  // If there are no hits for this query, return with empty Hits list.
460  if (alnset.Get().empty())
461  return;
462 
463  CSeq_align_set::Tdata::const_iterator iter = alnset.Get().begin();
464 
465  int index = 1;
466  bool incremental_output = (bool)out_stream;
467  while (iter != alnset.Get().end()) {
468  CRef<CHit> new_hit;
469  // Retrieve the next set of results for a single subject sequence.
470  // If the next Seq-align is discontinuous, then take it as is,
471  // otherwise go along the chain of Seq-aligns until the subject Seq-id
472  // changes, then wrap the single subject list into a discontinuous
473  // Seq-align.
474  if ((*iter)->GetSegs().IsDisc()) {
475  s_SeqAlignToXMLHit(new_hit, *(*iter), scope, matrix, mask_info,
476  ungapped, master_gentice_code, slave_genetic_code);
477  ++iter;
478  } else {
479  CSeq_align_set one_subject_alnset;
480  CConstRef<CSeq_id> current_id(s_GetSubjectId(*(*iter)));
481  for ( ; iter != alnset.Get().end(); ++iter) {
482  CConstRef<CSeq_id> next_id(s_GetSubjectId(*(*iter)));
483  if (!current_id->Match(*next_id)) {
484  break;
485  }
486  one_subject_alnset.Set().push_back(*iter);
487  }
488  CSeq_align disc_align_wrap;
489  disc_align_wrap.SetSegs().SetDisc(one_subject_alnset);
490  s_SeqAlignToXMLHit(new_hit, disc_align_wrap, scope, matrix,
491  mask_info, ungapped, master_gentice_code, slave_genetic_code);
492  }
493 
494  if (new_hit) {
495  new_hit->SetNum(index);
496  ++index;
497  if( !incremental_output ) hits.push_back(new_hit);
498  else
499  {
500  CNcbiOstrstream one_hit_os;
501  unique_ptr<CObjectOStreamXml> xml_one_hit_os (new CObjectOStreamXml (one_hit_os,eNoOwnership));
502  xml_one_hit_os->SetEncoding(eEncoding_Ascii);
503  xml_one_hit_os->SetReferenceDTD(false);
504  xml_one_hit_os->Write( &(*new_hit), new_hit->GetThisTypeInfo() );
505  // remove leading xml version
506  string out_str = string(CNcbiOstrstreamToString(one_hit_os));
507  string::size_type start_xml_pos = out_str.find("<?xml");
508  if( start_xml_pos != string::npos ) {
509  string::size_type end_xml_pos = out_str.find_first_of("\n\r");
510  out_str.erase(0,end_xml_pos+1);
511  }
512  *out_stream << out_str ;
513  }
514 
515  }
516  }
517 }
518 
519 /// Add an "iteration" to the BLAST XML report, corresponding to all alignments
520 /// for a single query.
521 /// @param bxmlout BLAST XML output object [in]
522 /// @param alnset Set of aligments for a given query. [in]
523 /// @param seqloc This query's Seq-loc. [in]
524 /// @param scope Scope for retrieving sequences. [in]
525 /// @param matrix ASCII-alphabet matrix for calculation of positives. [in]
526 /// @param mask_info List of masking locations. [in]
527 /// @param index This query's index [in]
528 /// @param stat Search statistics for this query, already filled. [in]
529 /// @param is_ungapped Is this an ungapped search? [in]
530 /// @param out_stream Stream for incremental output, ignore if NULL [out]
531 static void
533  const CSeq_loc& seqloc, CScope* scope,
534  const CBlastFormattingMatrix* matrix,
535  const ncbi::TMaskedQueryRegions* mask_info,
536  int index, int iteration, CStatistics& stat, bool is_ungapped,
537  int master_gentice_code, int slave_genetic_code,
538  const vector<string>& messages,
539  CNcbiOstream *out_stream)
540 {
541  bool incremental_output = (bool) out_stream;
542  list<CRef<CIteration> >& iterations = bxmlout.SetIterations();
543 
544  CRef<CIteration> one_query_iter(new CIteration());
545 
546  one_query_iter->SetIter_num(iteration);
547 
548  string query_def = NcbiEmptyString;
549 
550  // If Bioseq handle cannot return a title string here, it is not critical.
551  // But make sure the exceptions are caught.
552  const CSeq_id& kSeqId = sequence::GetId(seqloc, scope);
553  try {
554  CBioseq_Handle bh = scope->GetBioseqHandle(kSeqId);
555  // Get the full query Seq-id string.
556  const CBioseq& kQueryBioseq = *bh.GetBioseqCore();
557  one_query_iter->SetQuery_ID(
558  CBlastFormatUtil::GetSeqIdString(kQueryBioseq));
559  query_def = sequence::CDeflineGenerator().GenerateDefline(bh);
560  } catch (const CException&) {
561  one_query_iter->SetQuery_ID(kSeqId.AsFastaString());
562  };
563 
564  if (query_def == NcbiEmptyString)
565  query_def = "No definition line";
566  one_query_iter->SetQuery_def(query_def);
567 
568  one_query_iter->SetQuery_len(sequence::GetLength(seqloc, scope));
569  one_query_iter->SetStat(stat);
570  if (messages.size() > 0 && !messages[index].empty())
571  one_query_iter->SetMessage(messages[index]);
572  // have serialized CIteration split and output first portion before hits
573  string serial_xml_start, serial_xml_end;
574  if( incremental_output) {
575  //bool add_dtd_reference = false, add_xml_version = false;
576  s_SerializeAndSplitBy( *one_query_iter, "</Iteration_query-len>",
577  serial_xml_start, serial_xml_end);
578  *out_stream << serial_xml_start << "\n<Iteration_hits>\n"; // PART BEFORE HITS
579  }
580 
581  // Only add hits if they exist.
582  if (alnset) {
583  s_SeqAlignSetToXMLHits(one_query_iter->SetHits(), *alnset,
584  scope, matrix, mask_info, is_ungapped,
585  master_gentice_code, slave_genetic_code,
586  out_stream);
587  }
588 
589  if( incremental_output ) *out_stream << "</Iteration_hits>" << serial_xml_end;
590  else
591  iterations.push_back(one_query_iter);
592 }
593 
594 /// Fills the parameters part of the BLAST XML output.
595 /// @param bxmlout BLAST XML output object [in] [out]
596 /// @param data Data structure, from which all necessary information can be
597 /// retrieved [in]
598 static void
600 {
601  CParameters& params = bxmlout.SetParam();
602  string matrix_name = data->GetMatrixName();
603  if (matrix_name != NcbiEmptyString)
604  params.SetMatrix(matrix_name);
605  params.SetExpect(data->GetEvalueThreshold());
606  params.SetGap_open(data->GetGapOpeningCost());
607  params.SetGap_extend(data->GetGapExtensionCost());
608 
609  int val;
610  if ((val = data->GetMatchReward()) != 0)
611  params.SetSc_match(val);
612 
613  if ((val = data->GetMismatchPenalty()) != 0)
614  params.SetSc_mismatch(val);
615 
616  string str;
617  if ((str = data->GetPHIPattern()) != NcbiEmptyString)
618  params.SetPattern(str);
619 
620  if ((str = data->GetFilterString()) != NcbiEmptyString)
621  params.SetFilter(str);
622 }
623 
624 /// Fills the search statistics part of the BLAST XML output for all queries.
625 /// @param stat_vec Vector of the CStatistics objects, to be filled. [in] [out]
626 /// @param data Data structure, from which all necessary information can be
627 /// retrieved [in]
628 static void
630  const IBlastXMLReportData* data)
631 {
632  int db_numseq = data->GetDbNumSeqs();
633  Int8 db_length = data->GetDbLength();
634 
635  for (unsigned int index = 0; index < data->GetNumQueries(); ++index) {
636  CRef<CStatistics> stat(new CStatistics());
637  stat->SetDb_num(db_numseq);
638  stat->SetDb_len(db_length);
639  stat->SetHsp_len(data->GetLengthAdjustment(index));
640  stat->SetEff_space((double)data->GetEffectiveSearchSpace(index));
641  stat->SetKappa(data->GetKappa(index));
642  stat->SetLambda(data->GetLambda(index));
643  stat->SetEntropy(data->GetEntropy(index));
644  stat_vec.push_back(stat);
645  }
646 }
647 
648 /// Given BLAST task, returns enumerated value for the publication to be
649 /// referenced.
650 /// @param program BLAST task [in]
651 /// @return What publication to reference?
654 {
656 
657  switch (program) {
658  case eMegablast:
659  case eMapper:
660  publication = CReference::eMegaBlast; break;
661  case ePHIBlastp: case ePHIBlastn:
662  publication = CReference::ePhiBlast; break;
663  case ePSIBlast:
664  publication = CReference::eCompBasedStats; break;
665  case eDeltaBlast:
666  publication = CReference::eDeltaBlast; break;
667  default:
668  publication = CReference::eGappedBlast; break;
669  }
670  return publication;
671 }
672 
673 /// Fills all fields in the data structure for a BLAST XML report.
674 /// @param bxmlout BLAST XML report data structure to fill [in] [out]
675 /// @param data Data structure, from which all necessary information can be
676 /// retrieved [in]
677 /// @param out_stream Output stream for incremental output, ignore if NULL [out]
678 void
680 {
681  bool incremental_output = (bool)out_stream;
682  string program_name = data->GetBlastProgramName();
683  bxmlout.SetProgram(program_name);
684  bxmlout.SetVersion(CBlastFormatUtil::BlastGetVersion(program_name));
685  EProgram blast_task = data->GetBlastTask();
687  bxmlout.SetDb(data->GetDatabaseName());
688 
689  const CSeq_loc* kSeqLoc = data->GetQuery(0);
690  if (!kSeqLoc)
691  NCBI_THROW(CException, eUnknown, "Query Seq-loc is not available");
692 
693  CRef<CScope> scope(data->GetScope(0));
694 
695  string query_def = NcbiEmptyString;
696 
697  // Try to retrieve all Seq-ids, using a Bioseq handle. If this fails,
698  // report the one available Seq-id, retrieved from the query Seq-loc.
699  const CSeq_id& kSeqId = sequence::GetId(*kSeqLoc, scope);
700  try {
701  CBioseq_Handle bh = scope->GetBioseqHandle(kSeqId);
702  // Get the full query Seq-id string.
703  const CBioseq& kQueryBioseq = *bh.GetBioseqCore();
704  bxmlout.SetQuery_ID(CBlastFormatUtil::GetSeqIdString(kQueryBioseq));
705  query_def = sequence::CDeflineGenerator().GenerateDefline(bh);
706  } catch (const CException&) {
707  bxmlout.SetQuery_ID(kSeqId.AsFastaString());
708  };
709 
710  if (query_def == NcbiEmptyString)
711  query_def = "No definition line";
712 
713  bxmlout.SetQuery_def(query_def);
714 
715  bxmlout.SetQuery_len(sequence::GetLength(*kSeqLoc, scope));
716 
717  s_SetBlastXMLParameters(bxmlout, data);
718 
719  unique_ptr< CBlastFormattingMatrix > matrix(data->GetMatrix());
720 
721  vector<CRef<CStatistics> > stat_vec;
722  s_BlastXMLGetStatistics(stat_vec, data);
723  //serialized data before and after BlastOutput_param
724  string serial_xml_start, serial_xml_end;
725  if( incremental_output && incremental_struct->m_IterationNum == 0) {
726  bool add_dtd_reference = true, add_xml_version = true;
727  s_SerializeAndSplitBy( bxmlout, "</BlastOutput_param>",
728  serial_xml_start, serial_xml_end,
729  add_dtd_reference, add_xml_version );
730  // incremental_output
731  *out_stream << serial_xml_start << "\n<BlastOutput_iterations>" ;
732  incremental_struct->m_SerialXmlEnd = "\n</BlastOutput_iterations>" + serial_xml_end;
733  }
734  string query_label;
735  for (unsigned int index = 0; index < data->GetNumQueries(); ++index) {
736  // Check that this query's Seq-loc is available.
737  const CSeq_loc* seqloc = data->GetQuery(index);
738  if (!seqloc) {
739  string message =
740  "Unable to retrieve query " + NStr::IntToString(index);
741  NCBI_THROW(CException, eUnknown, message);
742  }
743  if (incremental_struct)
744  incremental_struct->m_IterationNum++;
745  query_label = string("Q(")+NStr::NumericToString(index) +
746  string("/")+ NStr::NumericToString(data->GetNumQueries() );
747  try{
748  s_BlastXMLAddIteration(bxmlout, data->GetAlignment(index), *seqloc,
749  data->GetScope(index), matrix.get(),
750  data->GetMaskLocations(index),
751  index, incremental_struct->m_IterationNum,
752  *stat_vec[index], !data->GetGappedMode(),
753  data->GetMasterGeneticCode(), data->GetSlaveGeneticCode(),
754  data->GetMessages(),
755  out_stream);
756  }
757  catch(CException &e){
758  ERR_POST(Error << "Failed s_BlastXMLAddIteration " << query_label << e.what() );
759  return;
760  }
761  catch(...){
762  ERR_POST(Error << "Failed s_BlastXMLAddIteration " << query_label );
763  return;
764  }
765  }
766 }
767 
768 /// serialize givem object and split data by provided XML tag for futher manual integrationa
769 // <start of a XML data ><TAG_NAME></TAG_NAME>< .., end of XML data>
770 //static bool s_SerializeAndSplit(TConstObjectPtr object, TTypeInfo typeInfo )
771 static bool s_SerializeAndSplitBy(const CSerialObject &object,
772  const char *tag, // tag name to break XML data by in form </TAG_NAME>
773  string &start_part, // part before </TAG_NAME>
774  string &end_part,
775  bool add_reference_dtd, // part starting from </TAG_NAME>
776  bool add_xml_version )
777 {
778  bool res_code = false; // not implemented
779  TTypeInfo typeInfo = object.GetThisTypeInfo();
780  string breake_by_tag = tag;
781  start_part="<NOT SET>";
782  end_part="</NOT SET>";
783  CNcbiOstrstream one_iter_ss_os;
784  {
785  unique_ptr<CObjectOStreamXml> xml_one_iter_os(new CObjectOStreamXml (one_iter_ss_os,eNoOwnership));
786  xml_one_iter_os->SetEncoding(eEncoding_Ascii);
787  xml_one_iter_os->SetVerifyData( eSerialVerifyData_No );
788  xml_one_iter_os->SetReferenceDTD(add_reference_dtd);
789  xml_one_iter_os->SetDefaultDTDFilePrefix("http://www.ncbi.nlm.nih.gov/dtd/");
790  if( add_xml_version )
791  xml_one_iter_os->Write(&object, typeInfo );
792  else
793  xml_one_iter_os->WriteObject(&object, typeInfo );
794  }
795  string out_str = string(CNcbiOstrstreamToString(one_iter_ss_os));
796  string::size_type iterations_insert_point = out_str.find( breake_by_tag );
797  if( iterations_insert_point != string::npos ){
798  iterations_insert_point += breake_by_tag.length();
799  start_part = out_str.substr(0,iterations_insert_point);
800  end_part = out_str.substr(iterations_insert_point);
801  res_code = true;
802  }
803  else {
804  start_part = out_str;
805  }
806  return res_code;
807 }
808 
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
BLAST formatter utilities.
Formatting of pairwise sequence alignments in XML form.
Declares class to display one-line descriptions at the top of the BLAST report.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ ePHIBlastn
Nucleotide PHI BLAST.
Definition: blast_types.hpp:70
@ ePHIBlastp
Protein PHI BLAST.
Definition: blast_types.hpp:69
@ eMapper
Jumper alignment for mapping.
Definition: blast_types.hpp:73
@ ePSIBlast
PSI Blast.
Definition: blast_types.hpp:67
@ eMegablast
Nucl-Nucl (traditional megablast)
Definition: blast_types.hpp:65
@ eDeltaBlast
Delta Blast.
Definition: blast_types.hpp:71
USING_SCOPE(objects)
static void s_SeqAlignSetToXMLHits(list< CRef< CHit > > &hits, const CSeq_align_set &alnset, CScope *scope, const CBlastFormattingMatrix *matrix, const ncbi::TMaskedQueryRegions *mask_info, bool ungapped, int master_gentice_code, int slave_genetic_code, CNcbiOstream *out_stream)
Fills the list of CHit objects, given a list of Seq-aligns.
static void s_BlastXMLGetStatistics(vector< CRef< CStatistics > > &stat_vec, const IBlastXMLReportData *data)
Fills the search statistics part of the BLAST XML output for all queries.
static void s_SetBlastXMLParameters(CBlastOutput &bxmlout, const IBlastXMLReportData *data)
Fills the parameters part of the BLAST XML output.
void BlastXML_FormatReport(CBlastOutput &bxmlout, const IBlastXMLReportData *data, CNcbiOstream *out_stream, SBlastXMLIncremental *incremental_struct)
Fills all fields in the data structure for a BLAST XML report.
static void s_SeqAlignToXMLHit(CRef< CHit > &hit, const CSeq_align &align_in, CScope *scope, const CBlastFormattingMatrix *matrix, const ncbi::TMaskedQueryRegions *mask_info, bool ungapped, int master_gentice_code, int slave_genetic_code)
Fill the CHit object in BLAST XML output, given an alignment and other information.
static int s_GetTranslationFrame(bool plus_strand, int start, int end, int seq_length)
Returns translation frame given the strand, alignment endpoints and total sequence length.
static CReference::EPublication s_GetBlastPublication(EProgram program)
Given BLAST task, returns enumerated value for the publication to be referenced.
static void s_SeqAlignSetToXMLHsps(list< CRef< CHsp > > &xhsp_list, const CSeq_align_set &alnset, CScope *scope, const CBlastFormattingMatrix *matrix, const ncbi::TMaskedQueryRegions *mask_info, int master_gentic_code, int slave_genetic_code)
Creates a list of CHsp structures for the XML output, given a list of Seq-aligns.
static const CSeq_id * s_GetSubjectId(const CSeq_align &align)
Retrieves subject Seq-id from a Seq-align.
ncbi::TMaskedQueryRegions mask
static bool s_SerializeAndSplitBy(const CSerialObject &object, const char *tag, string &start_part, string &end_part, bool add_reference_dtdi=false, bool add_xml_versioni=false)
Serialize given object and split data by provided XML tag for further manual integration.
static void s_BlastXMLAddIteration(CBlastOutput &bxmlout, const CSeq_align_set *alnset, const CSeq_loc &seqloc, CScope *scope, const CBlastFormattingMatrix *matrix, const ncbi::TMaskedQueryRegions *mask_info, int index, int iteration, CStatistics &stat, bool is_ungapped, int master_gentice_code, int slave_genetic_code, const vector< string > &messages, CNcbiOstream *out_stream)
Add an "iteration" to the BLAST XML report, corresponding to all alignments for a single query.
static string GetLabel(CConstRef< objects::CSeq_id > id, bool with_version=false)
Return a label for an ID. Tries to recreate behavior of GetLabel before a change that prepends "ti|" t...
CBioseq_Handle –.
256x256 matrix used for calculating positives etc.
CBlastOutput –.
Definition: BlastOutput.hpp:66
void Reverse(void)
Reverse the segments' orientation.
Definition: Dense_seg.cpp:644
void Assign(const CSerialObject &obj, ESerialRecursionMode how=eRecursive)
overloaded Assign()
Definition: Dense_seg.cpp:62
static CRef< objects::CSeq_align_set > PrepareBlastUngappedSeqalign(const objects::CSeq_align_set &alnset)
Static functions. Need to call this if the seqalign is stdseg or dendiag for ungapped blast alignment ...
Definition: showalign.cpp:3164
SeqLocCharOption
character used to display seqloc, such as masked sequence
Definition: showalign.hpp:204
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
CHsp –.
Definition: Hsp.hpp:66
CIteration –.
Definition: Iteration.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CObjectOStreamXml –.
Definition: objostrxml.hpp:54
CParameters –.
Definition: Parameters.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CRef< CSeq_align > CreateTranslatedDensegFromNADenseg(void) const
Create a Dense-seg with widths from Dense-seg of nucleotides. Used by AlnMgr to handle translated nucl...
Definition: Seq_align.cpp:953
CRef< CSeq_align > CreateDensegFromStdseg(SSeqIdChooser *SeqIdChooser=0) const
---------------------------------------------------------------------------- PRE : the Seq-align has ...
Definition: Seq_align.cpp:728
TSeqPos GetTotalGapCount(TDim row=-1) const
Retrieves the total number of gaps in the given row of an alignment; all gaps by default.
Definition: Seq_align.cpp:1550
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
Base class for all serializable objects.
Definition: serialbase.hpp:150
static void GetBioseqHandleDeflineAndId(const objects::CBioseq_Handle &handle, list< TGi > &use_this_gi, string &seqid, string &defline, bool show_gi=true, TGi this_gi_first=INVALID_GI)
Returns sequence id and a BLAST defline as strings, given a Bioseq handle and a list of gis.
CStatistics –.
Definition: Statistics.hpp:66
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Interface for filling the top layer of the XML report.
string GetSeqIdString(const CSeq_id &id)
Definition: compartp.cpp:100
#define bool
Definition: bool.h:34
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
static string GetString(EPublication pub)
Reference for requested publication.
Definition: version.cpp:112
EPublication
Enumerates the various BLAST publications.
Definition: version.hpp:70
@ eMaxPublications
Used as sentinel value.
Definition: version.hpp:78
@ ePhiBlast
1998 NAR paper
Definition: version.hpp:72
@ eCompBasedStats
2001 NAR paper
Definition: version.hpp:74
@ eGappedBlast
1997 NAR paper
Definition: version.hpp:71
@ eMegaBlast
2000 J Comput Biol paper
Definition: version.hpp:73
@ eDeltaBlast
2012 Biology Direct on DeltaBLAST
Definition: version.hpp:77
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
int m_IterationNum
ctor sets to true, set to false for first chunk.
string m_SerialXmlEnd
tag to be printed at end.
static void GetWholeAlnSeqStrings(string &query, string &subject, const objects::CDense_seg &ds, objects::CScope &scope, int master_gen_code, int slave_gen_code)
static string BlastGetVersion(const string program)
Returns the version and release date, e.g.
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
@ eUnknown
Definition: app_popup.hpp:72
@ eSerialVerifyData_No
do not verify
Definition: serialdef.hpp:109
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
CConstRef< CSeq_id > GetSeqId(void) const
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
TSeqPos GetBioseqLength(void) const
bool IsNa(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
#define NcbiEmptyString
Definition: ncbistr.hpp:122
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
void SetLambda(TLambda value)
Assign a value to Lambda data member.
void SetExpect(TExpect value)
Assign a value to Expect data member.
void SetGap_open(TGap_open value)
Assign a value to Gap_open data member.
void SetEntropy(TEntropy value)
Assign a value to Entropy data member.
void SetDb_num(TDb_num value)
Assign a value to Db_num data member.
void SetQuery_len(TQuery_len value)
Assign a value to Query_len data member.
void SetHsp_len(THsp_len value)
Assign a value to Hsp_len data member.
void SetSc_match(TSc_match value)
Assign a value to Sc_match data member.
void SetMatrix(const TMatrix &value)
Assign a value to Matrix data member.
void SetProgram(const TProgram &value)
Assign a value to Program data member.
void SetGap_extend(TGap_extend value)
Assign a value to Gap_extend data member.
void SetQuery_ID(const TQuery_ID &value)
Assign a value to Query_ID data member.
void SetVersion(const TVersion &value)
Assign a value to Version data member.
void SetKappa(TKappa value)
Assign a value to Kappa data member.
void SetFilter(const TFilter &value)
Assign a value to Filter data member.
TIterations & SetIterations(void)
Assign a value to Iterations data member.
void SetSc_mismatch(TSc_mismatch value)
Assign a value to Sc_mismatch data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
void SetQuery_def(const TQuery_def &value)
Assign a value to Query_def data member.
void SetParam(TParam &value)
Assign a value to Param data member.
void SetPattern(const TPattern &value)
Assign a value to Pattern data member.
void SetReference(const TReference &value)
Assign a value to Reference data member.
void SetDb_len(TDb_len value)
Assign a value to Db_len data member.
void SetEff_space(TEff_space value)
Assign a value to Eff_space data member.
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
Tdata & Set(void)
Assign a value to data member.
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
Definition: Dense_seg_.hpp:568
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
bool IsDendiag(void) const
Check if variant Dendiag is selected.
Definition: Seq_align_.hpp:720
const TStd & GetStd(void) const
Get the variant data.
Definition: Seq_align_.hpp:752
const TDendiag & GetDendiag(void) const
Get the variant data.
Definition: Seq_align_.hpp:726
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
list< CRef< CSeq_align > > Tdata
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:740
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
int i
static CRef< CSeq_align > CreateDensegFromDendiag(CSeq_align const &aln)
const char * tag
T min(T x_, T y_)
The Object manager core.
Structure to hold data for incremental XML formatting.
#define _ASSERT
Modified on Thu Apr 25 08:20:01 2024 by modify_doxy.py rev. 669887