NCBI C++ ToolKit
score_builder.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: score_builder.cpp 100426 2023-07-31 13:45:22Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
40 
53 
55 #include <objmgr/util/sequence.hpp>
61 
62 #include <util/checksum.hpp>
63 #include <sstream>
64 
65 
68 USING_SCOPE(blast);
69 
71 {
72  Int2 status;
73  const CBlastOptions& opts = options.GetOptions();
74 
77 
78  // alignments are either protein-protein or nucl-nucl
79 
80  m_BlastType = opts.GetProgram();
81  switch (m_BlastType) {
82  case eBlastn:
83  case eMegablast:
84  case eDiscMegablast:
85  case eVecScreen:
86  case ePHIBlastn:
88  break;
89 
90  default:
92  break;
93  }
94 
95  if (m_BlastType == eBlastn) {
97  } else {
99  }
100  if (m_ScoreBlk == NULL) {
102  "Failed to initialize blast score block");
103  }
104 
105  // fill in the score matrix
106 
108  BlastScoringOptions *score_options;
109  BlastScoringOptionsNew(core_type, &score_options);
110  BLAST_FillScoringOptions(score_options, core_type, TRUE,
111  opts.GetMismatchPenalty(),
112  opts.GetMatchReward(),
113  opts.GetMatrixName(),
115  status = Blast_ScoreBlkMatrixInit(core_type, score_options,
116  m_ScoreBlk, NULL);
117  score_options = BlastScoringOptionsFree(score_options);
118  if (status) {
120  "Failed to initialize score matrix");
121  }
122 
123  // fill in Karlin blocks
124 
126  if (m_BlastType == eBlastn) {
127  // the following computes the same Karlin blocks as blast
128  // if the gap penalties are not large. When the penalties are
129  // large the ungapped Karlin blocks are used, but these require
130  // sequence data to be computed exactly. Instead we build an
131  // ideal ungapped Karlin block to approximate the exact answer
139  NULL);
140  }
141  else {
144  m_ScoreBlk->name, NULL);
145  }
146  if (status || m_ScoreBlk->kbp_gap_std[0] == NULL ||
147  m_ScoreBlk->kbp_gap_std[0]->Lambda <= 0.0) {
149  "Failed to initialize Karlin blocks");
150  }
151 }
152 
153 /// Default constructor
155  : m_ScoreBlk(0),
156  m_EffectiveSearchSpace(0)
157 {
158 }
159 
160 /// Constructor (uses blast program defaults)
162  : m_EffectiveSearchSpace(0)
163 {
165  options(CBlastOptionsFactory::Create(blast_program));
166  x_Initialize(*options);
167 }
168 
169 /// Constructor (uses previously configured blast options)
171  : m_EffectiveSearchSpace(0)
172 {
173  x_Initialize(options);
174 }
175 
176 /// Destructor
178 {
180 }
181 
183  const CSeq_align& align)
184 {
185  if (align.CheckNumRows() != 2) {
186  NCBI_THROW(CSeqalignException, eUnsupported,
187  "CScoreBuilder::GetBlastScore(): "
188  "only two-row alignments are supported");
189  }
190  if (align.GetSegs().IsDenseg() ) {
191  return GetBlastScoreDenseg(scope, align);
192  }
193  if (align.GetSegs().IsStd() ) {
194  return GetBlastScoreStd(scope, align);
195  }
196  if (align.GetSegs().IsSpliced() ) {
197  return GetBlastScoreSpliced(scope, align);
198  }
199  NCBI_THROW(CSeqalignException, eUnsupported,
200  "CScoreBuilder::GetBlastScore(): " +
201  align.GetSegs().SelectionName(align.GetSegs().Which())
202  +" is not supported");
203  return 0;
204 }
205 
/// Complement lookup table indexed by a 4-bit residue code.
/// Only the single-bit codes have a non-zero entry (1 <-> 8, 2 <-> 4);
/// every other index, including all multi-bit codes, maps to 0.
static const unsigned char reverse_4na[16] = {
    0, 8, 4, 0,     //  0 ..  3
    2, 0, 0, 0,     //  4 ..  7
    1, 0, 0, 0,     //  8 .. 11
    0, 0, 0, 0      // 12 .. 15
};
207 
209  const CSeq_align& align)
210 {
211  if ( !align.GetSegs().IsDenseg() ) {
212  NCBI_THROW(CSeqalignException, eUnsupported,
213  "CScoreBuilder::GetBlastScore(): "
214  "only dense-seg alignments are supported");
215  }
216 
217  if (m_ScoreBlk == 0) {
218  NCBI_THROW(CSeqalignException, eInvalidInputData,
219  "Blast scoring parameters have not been specified");
220  }
221 
222  int computed_score = 0;
223  const CDense_seg& ds = align.GetSegs().GetDenseg();
224  CAlnVec vec(ds, scope);
225  CBioseq_Handle bsh1 = vec.GetBioseqHandle(0);
226  CBioseq_Handle bsh2 = vec.GetBioseqHandle(1);
227  CSeqVector vec1(bsh1);
228  CSeqVector vec2(bsh2);
229  CSeq_inst::TMol mol1 = vec1.GetSequenceType();
230  CSeq_inst::TMol mol2 = vec2.GetSequenceType();
231 
232  int gap_open = m_GapOpen;
233  int gap_extend = m_GapExtend;
234 
235  if (mol1 == CSeq_inst::eMol_aa && mol2 == CSeq_inst::eMol_aa) {
236 
237  Int4 **matrix = m_ScoreBlk->matrix->data;
238 
239  if (m_BlastType != eBlastp) {
240  NCBI_THROW(CSeqalignException, eUnsupported,
241  "Protein scoring parameters required");
242  }
243 
244  for (CAlnVec::TNumseg seg_idx = 0;
245  seg_idx < vec.GetNumSegs(); ++seg_idx) {
246 
247  TSignedSeqPos start1 = vec.GetStart(0, seg_idx);
248  TSignedSeqPos start2 = vec.GetStart(1, seg_idx);
249  TSeqPos seg_len = vec.GetLen(seg_idx);
250 
251  if (start1 == -1 || start2 == -1) {
252  computed_score -= gap_open + gap_extend * seg_len;
253  continue;
254  }
255 
256  // @todo FIXME the following assumes ncbistdaa format
257 
258  for (TSeqPos pos = 0; pos < seg_len; ++pos) {
259  unsigned char c1 = vec1[start1 + pos];
260  unsigned char c2 = vec2[start2 + pos];
261  computed_score += matrix[c1][c2];
262  }
263  }
264  }
265  else if (CSeq_inst::IsNa(mol1) && CSeq_inst::IsNa(mol2)) {
266 
267  int match = m_ScoreBlk->reward;
268  int mismatch = m_ScoreBlk->penalty; // assumed negative
269 
270  if (m_BlastType != eBlastn) {
271  NCBI_THROW(CSeqalignException, eUnsupported,
272  "Nucleotide scoring parameters required");
273  }
274 
275  bool scaled_up = false;
276  if (gap_open == 0 && gap_extend == 0) { // possible with megablast
277  match *= 2;
278  mismatch *= 2;
279  gap_extend = match / 2 - mismatch;
280  scaled_up = true;
281  }
282 
283  int strand1 = vec.StrandSign(0);
284  int strand2 = vec.StrandSign(1);
285 
286  for (CAlnVec::TNumseg seg_idx = 0;
287  seg_idx < vec.GetNumSegs(); ++seg_idx) {
288 
289  TSignedSeqPos start1 = vec.GetStart(0, seg_idx);
290  TSignedSeqPos start2 = vec.GetStart(1, seg_idx);
291  TSeqPos seg_len = vec.GetLen(seg_idx);
292 
293  if (start1 == -1 || start2 == -1) {
294  computed_score -= gap_open + gap_extend * seg_len;
295  continue;
296  }
297 
298  // @todo FIXME encoding assumed to be ncbi4na, without
299  // ambiguity characters
300 
301  if (strand1 > strand2) {
302  for (TSeqPos pos = 0; pos < seg_len; ++pos) {
303  unsigned char c1 = vec1[start1 + pos];
304  unsigned char c2 = vec2[start2 + seg_len - 1 - pos];
305  computed_score += (c1 == reverse_4na[c2]) ?
306  match : mismatch;
307  }
308  }
309  else if (strand1 < strand2) {
310  for (TSeqPos pos = 0; pos < seg_len; ++pos) {
311  unsigned char c1 = vec1[start1 + seg_len - 1 - pos];
312  unsigned char c2 = vec2[start2 + pos];
313  computed_score += (reverse_4na[c1] == c2) ?
314  match : mismatch;
315  }
316  }
317  else {
318  for (TSeqPos pos = 0; pos < seg_len; ++pos) {
319  unsigned char c1 = vec1[start1 + pos];
320  unsigned char c2 = vec2[start2 + pos];
321  computed_score += (c1 == c2) ? match : mismatch;
322  }
323  }
324  }
325 
326  if (scaled_up)
327  computed_score /= 2;
328  }
329 
330  computed_score = max(0, computed_score);
331  return computed_score;
332 }
333 
335  const CSeq_align& align)
336 {
339 
340  CSeq_inst::TMol mol1 = scope.GetBioseqHandle(bsh1).GetSequenceType();
341  CSeq_inst::TMol mol2 = scope.GetBioseqHandle(bsh2).GetSequenceType();
342 
343  if (mol1 == mol2) {
344  CRef<CSeq_align> new_align =
345  ConvertSeq_align(align,
347  -1,
348  &scope);
349  return GetBlastScoreDenseg(scope, *new_align);
350  }
351 
352  const CSeq_align* align_ptr = &align;
353 
354  unique_ptr<CSeq_align> swapped_align_ptr;
355  if (CSeq_inst::IsNa(mol1)) {
356  swapped_align_ptr.reset(new CSeq_align);
357  swapped_align_ptr->Assign(align);
358  swapped_align_ptr->SwapRows(0,1);
359  align_ptr = swapped_align_ptr.get();
360  }
361 
362  list<CRef<CPairwiseAln> > pairs;
364  pairs.push_back(aln);
365 
366  return GetBlastScoreProtToNucl(scope, *align_ptr, pairs);
367 }
368 
370  const CSeq_align& align)
371 {
372  // check assumptions:
373  //
374  if ( align.GetSegs().GetSpliced().GetProduct_type() !=
376  NCBI_THROW(CSeqalignException, eUnsupported,
377  "CScore_TblastnScore: "
378  "valid only for protein spliced-seg alignments");
379  }
380 
381  list<CRef<CPairwiseAln> > pairs;
382  CSeq_align sub_align;
383  sub_align.Assign(align);
384 
386  align.GetSegs().GetSpliced().GetExons()) {
387  CRef<CSpliced_exon> exon = *it;
388  sub_align.SetSegs().SetSpliced().SetExons().clear();
389  sub_align.SetSegs().SetSpliced().SetExons().push_back(exon);
390 
392 
393  if (exon->IsSetAcceptor_before_exon() || pairs.empty()) {
394  pairs.push_back(aln);
395  } else {
396  ITERATE(CPairwiseAln, r, *aln) {
397  pairs.back()->push_back(*r);
398  }
399  }
400  }
401  return GetBlastScoreProtToNucl(scope, align, pairs);
402 }
403 
405  const CSeq_align& align,
406  list<CRef<CPairwiseAln> >& pairs)
407 {
409  CSeq_id_Handle genomic_idh =
411 
412  ENa_strand strand = align.GetSeqStrand(1);
413  CBioseq_Handle prot_bsh = scope.GetBioseqHandle(prot_idh);
414  CBioseq_Handle genomic_bsh = scope.GetBioseqHandle(genomic_idh);
415  CSeqVector prot_vec (prot_bsh);
416 
417  int gcode = 1;
418  try {
419  gcode = sequence::GetOrg_ref(genomic_bsh).GetGcode();
420  }
421  catch (CException&) {
422  // use the default genetic code
423  }
424 
425  const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
426 
427  int state = 0;
428  int offs = 0;
429  int score = 1;
430 
431  if (m_ScoreBlk == NULL) {
434  x_Initialize(*options);
435  }
436  Int4 **matrix = m_ScoreBlk->matrix->data;
437 
438 // int num_positives = 0;
439 // int num_negatives = 0;
440 // int num_match = 0;
441 // int num_mismatch = 0;
442  ITERATE (list<CRef<CPairwiseAln> >, it, pairs) {
443  CRef<CPairwiseAln> aln = *it;
444 
445  int this_pair_score = -1;
446  list<int> gaps;
447  CPairwiseAln::const_iterator prev = aln->end();
448  ITERATE (CPairwiseAln, range_it, *aln) {
449 
450  // handle gaps
451  if (prev != aln->end()) {
452  int q_gap = range_it->GetFirstFrom() - prev->GetFirstTo() - 1;
453  int s_gap =
454  (strand == eNa_strand_minus ?
455  prev->GetSecondFrom() - range_it->GetSecondTo() - 1 :
456  range_it->GetSecondFrom() - prev->GetSecondTo() - 1);
457 
458  // check if this range is in the list of known introns
459 
460  int gap = abs(q_gap - s_gap);
461  gaps.push_back(gap);
462  }
463  prev = range_it;
464 
465  CRange<int> q_range = range_it->GetFirstRange();
466  CRange<int> s_range = range_it->GetSecondRange();
467 
468  int s_start = s_range.GetFrom();
469  int s_end = s_range.GetTo();
470  int q_pos = q_range.GetFrom();
471 
472  int new_offs = q_pos % 3;
473  for ( ; offs != new_offs; offs = (offs + 1) % 3) {
474  state = tbl.NextCodonState(state, 'N');
475  }
476 
477  // first range is in nucleotide coordinates...
478 
479  CRef<CSeq_loc> loc =
480  genomic_bsh.GetRangeSeq_loc(s_start, s_end, strand);
481  CSeqVector genomic_vec(*loc, scope, CBioseq_Handle::eCoding_Iupac);
482  CSeqVector_CI vec_it(genomic_vec);
483 
484  for ( ; s_start <= s_end; ++s_start, ++q_pos, ++vec_it) {
485  state = tbl.NextCodonState(state, *vec_it);
486  if (offs % 3 == 2) {
487  Uint1 prot = prot_vec[(int)(q_pos / 3)];
488  Uint1 xlate = AMINOACID_TO_NCBISTDAA[(unsigned)tbl.GetCodonResidue(state)];
489 
490  if (q_pos/3 == 0 &&
491  prot != xlate &&
493  ) {
494  xlate = prot;
495  }
496  int this_score = matrix[prot][xlate];
497 
498 // num_match += (prot == xlate);
499 // num_mismatch += (prot != xlate);
500 // num_positives += (this_score > 0);
501 // num_negatives += (this_score <= 0);
502  this_pair_score += this_score;
503  }
504 
505  offs = (offs + 1) % 3;
506  }
507  }
508 
509  // adjust score for gaps
510  // HACK: this isn't exactly correct; it overestimates scores because we
511  // don't have full accounting of composition based statistics, etc.
512  // It's close enough, though
513  gaps.sort();
514 
515  ITERATE(list<int>, gap_bases, gaps) {
516  int new_score = this_pair_score - m_GapOpen - (*gap_bases/3) * m_GapExtend;
517  // do not score huge gaps - they are between hits gaps
518  if (new_score > 0 ) {
519  this_pair_score = new_score;
520  } else {
521  this_pair_score -= 1;
522  }
523  }
524 
525  score += this_pair_score;
526  }
527 
528  return score;
529 }
530 
532  const CSeq_align& align)
533 {
534  int raw_score = GetBlastScore(scope, align);
536 
538  raw_score &= ~1;
539 
540  return (raw_score * kbp->Lambda - kbp->logK) / NCBIMATH_LN2;
541 }
542 
543 
545  const CSeq_align& align)
546 {
547  if (m_EffectiveSearchSpace == 0) {
548  NCBI_THROW(CSeqalignException, eInvalidInputData,
549  "E-value calculation requires search space "
550  "to be specified");
551  }
552 
553  int raw_score = GetBlastScore(scope, align);
555 
557  raw_score &= ~1;
558 
559  return BLAST_KarlinStoE_simple(raw_score, kbp, m_EffectiveSearchSpace);
560 }
561 
562 
563 double CScoreBuilder::ComputeScore(CScope& scope, const CSeq_align& align,
564  const CRangeCollection<TSeqPos> &ranges,
566 {
567  // Override certain score computations in this subclass.
568  switch (score) {
570  return GetBlastScore(scope, align);
571 
573  {{
574  double d = GetBlastBitScore(scope, align);
576  d == numeric_limits<double>::quiet_NaN()) {
577  d = 0;
578  }
579  if (d > 1e35 || d < -1e35) {
580  d = 0;
581  }
582  return d;
583  }}
584 
586  {{
587  double d = GetBlastEValue(scope, align);
589  d == numeric_limits<double>::quiet_NaN()) {
590  d = 0;
591  }
592  if (d > 1e35 || d < -1e35) {
593  d = 0;
594  }
595  return d;
596  }}
597 
600  // FIXME TODO: Not implemented.
601 
602  // Fallback to superclass implementation of score computation.
603  default:
604  return CScoreBuilderBase::ComputeScore(scope, align, ranges, score);
605  }
606 }
607 
608 
609 
611  EScoreType score)
612 {
613  switch (score) {
614  case eScore_Blast:
615  AddScore(scope, align, CSeq_align::eScore_Blast);
616  break;
617 
619  AddScore(scope, align, CSeq_align::eScore_BitScore);
620  break;
621 
622  case eScore_Blast_EValue:
623  AddScore(scope, align, CSeq_align::eScore_EValue);
624  break;
625 
628  break;
629 
632  break;
633 
636  break;
639  break;
640  }
641 }
642 
644  list< CRef<CSeq_align> >& aligns, EScoreType score)
645 {
646  NON_CONST_ITERATE (list< CRef<CSeq_align> >, iter, aligns) {
647  CSeq_align& align = **iter;
648  switch (score) {
649  case eScore_Blast:
650  AddScore(scope, align, CSeq_align::eScore_Blast);
651  break;
652 
654  AddScore(scope, align, CSeq_align::eScore_BitScore);
655  break;
656 
657  case eScore_Blast_EValue:
658  AddScore(scope, align, CSeq_align::eScore_EValue);
659  break;
660 
663  break;
664 
667  break;
668 
671  break;
674  break;
675  }
676  }
677 }
678 
679 static inline void s_RecordMatch(size_t match, string &BTOP, string &flipped_BTOP)
680 {
681  if (match) {
682  string match_str = NStr::NumericToString(match);
683  BTOP += match_str;
684  flipped_BTOP.insert(0, match_str);
685  }
686 }
687 
688 static pair<string,string> s_ComputeTraceback(CScope& scope,
689  const CSeq_align& align)
690 {
691  if (!align.GetSegs().IsDenseg() || align.CheckNumRows() != 2) {
693  "Traceback strings can only be calculated for pairwise "
694  "Dense-seg alignments");
695  }
696 
697  const CDense_seg& ds = align.GetSegs().GetDenseg();
698  CAlnVec vec(ds, scope);
699  string BTOP, flipped_BTOP;
700  for (CDense_seg::TNumseg i = 0; i < ds.GetNumseg(); ++i) {
701  string query, subject;
702  vec.GetSegSeqString(query, 0, i);
703  vec.GetSegSeqString(subject, 1, i);
704  if (query.empty()) {
705  for (unsigned idx = 0; idx < subject.size(); ++idx) {
706  string complement;
708  idx, 1, complement);
709  BTOP += '-';
710  BTOP += subject[idx];
711  flipped_BTOP.insert(flipped_BTOP.begin(), complement[0]);
712  flipped_BTOP.insert(flipped_BTOP.begin(), '-');
713  }
714  } else if (subject.empty()) {
715  for (unsigned idx = 0; idx < query.size(); ++idx) {
716  string complement;
718  idx, 1, complement);
719  BTOP += query[idx];
720  BTOP += '-';
721  flipped_BTOP.insert(flipped_BTOP.begin(), '-');
722  flipped_BTOP.insert(flipped_BTOP.begin(), complement[0]);
723  }
724  } else {
725  size_t match = 0;
726  for (unsigned idx = 0; idx < query.size(); ++idx) {
727  NCBI_ASSERT(query.size() == subject.size(),
728  "inconsistent aligned segment length");
729  if (query[idx] == subject[idx]) {
730  ++match;
731  } else {
732  s_RecordMatch(match, BTOP, flipped_BTOP);
733  match = 0;
734  BTOP += query[idx];
735  BTOP += subject[idx];
736  string query_complement;
738  idx, 1, query_complement);
739  string subject_complement;
741  idx, 1, subject_complement);
742  flipped_BTOP.insert(flipped_BTOP.begin(),
743  subject_complement[0]);
744  flipped_BTOP.insert(flipped_BTOP.begin(),
745  query_complement[0]);
746  }
747  }
748  s_RecordMatch(match, BTOP, flipped_BTOP);
749  }
750  }
751  return pair<string,string>(
752  BTOP, align.GetSeqStrand(0) == align.GetSeqStrand(1)
753  ? BTOP : flipped_BTOP);
754 }
755 
757 {
758  CRef<CUser_object> tracebacks;
759  ITERATE (CSeq_align::TExt, ext_it, align.SetExt()) {
760  if ((*ext_it)->GetType().IsStr() &&
761  (*ext_it)->GetType().GetStr() == "Tracebacks")
762  {
763  /// Tracebacks object already exists
764  tracebacks = *ext_it;
765  break;
766  }
767  }
768 
769  if (!tracebacks) {
770  tracebacks.Reset(new CUser_object);
771  tracebacks->SetType().SetStr("Tracebacks");
772  align.SetExt().push_back(tracebacks);
773  } else if (tracebacks->HasField("Query") && tracebacks->HasField("Subject"))
774  {
775  return;
776  }
777  pair<string,string> traceback_strings = s_ComputeTraceback(scope, align);
778  tracebacks->SetField("Query").SetData().SetStr(traceback_strings.first);
779  tracebacks->SetField("Subject").SetData().SetStr(traceback_strings.second);
780 }
781 
783  list< CRef<CSeq_align> >& aligns)
784 {
785  NON_CONST_ITERATE (list< CRef<CSeq_align> >, iter, aligns) {
786  AddTracebacks(scope, **iter);
787  }
788 }
789 
791  CSeq_align::TDim row)
792 {
793  if (align.IsSetExt()) {
794  ITERATE (CSeq_align::TExt, ext_it, align.GetExt()) {
795  if ((*ext_it)->GetType().IsStr() &&
796  (*ext_it)->GetType().GetStr() == "Tracebacks")
797  {
798  string field = row == 0 ? "Query" : "Subject";
799  if ((*ext_it)->HasField(field)) {
800  return (*ext_it)->GetField(field).GetData().GetStr();
801  }
802  break;
803  }
804  }
805  }
806  return "";
807 }
808 
809 string CScoreBuilder::GetTraceback(CScope& scope, const CSeq_align& align,
810  CSeq_align::TDim row)
811 {
812  string stored_traceback = GetTraceback(align, row);
813  if (!stored_traceback.empty()) {
814  return stored_traceback;
815  }
816 
817  /// Tracebacks user object not found; need to calculate on the fly
818  pair<string,string> traceback_strings = s_ComputeTraceback(scope, align);
819  return row == 0 ? traceback_strings.first : traceback_strings.second;
820 }
821 
822 
823 
824 // clear out all of the parts of a seq-align
825 // that might mess with the CRC but don't affect
826 // the important parts of the alignment
828 {
829 
830  // These seg specific parts are probably incomplete
831  // but denseg, disc, and spliced ought to handle 99.9%
832  if (align.CanGetSegs()) {
833  if (align.GetSegs().IsDenseg()) {
834  align.SetSegs().SetDenseg().SetScores().clear();
835  // ...
836  }
837  else if (align.GetSegs().IsDisc()) {
839  align.SetSegs().SetDisc().Set()) {
840  s_CleanSeqAlign(**align_iter);
841  }
842  }
843  else if (align.GetSegs().IsSpliced()) {
845  align.SetSegs().SetSpliced().SetExons()) {
846  (*exon_iter)->SetScores().Set().clear();
847  }
848  }
849  else if (align.GetSegs().IsSparse()) {
850  align.SetSegs().SetSparse().SetRow_scores().clear();
851  // ...
852  }
853  else if (align.GetSegs().IsStd()) {
855  align.SetSegs().SetStd()) {
856  (*std_iter)->SetScores().clear();
857  }
858  }
859  }
860 
861 
862  align.SetScore().clear();
863  align.SetId().clear();
864  align.SetBounds().clear();
865  align.SetExt().clear();
866 }
867 
869 {
870  CChecksum checksum;
871 
872  CSeq_align clean;
873  clean.Assign(align);
874  s_CleanSeqAlign(clean);
875 
876  stringstream cleanStr;
877  cleanStr << MSerial_AsnText << clean;
878 
879  checksum.AddLine(cleanStr.str());
880 
881  int result = 0;
882  Uint4* result_ptr = (Uint4*)&result;
883  *result_ptr = checksum.GetChecksum();
884 
885  return result;
886 }
887 
889 {
890  int tiebreaker = ComputeTieBreaker(align);
891  align.SetNamedScore("tiebreaker", tiebreaker);
892 }
893 
894 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< CPairwiseAln > CreatePairwiseAlnFromSeqAlign(const objects::CSeq_align &seq_align)
A simple API that assumes that the seq_align has exactly two rows and you want to create a pairwise w...
CRef< CSeq_align > ConvertSeq_align(const CSeq_align &src, CSeq_align::TSegs::E_Choice dst_choice, CSeq_align::TDim anchor_row=-1, CScope *scope=NULL)
Convert source alignment to a new type.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
The structures and functions in blast_options.
Int2 BLAST_FillScoringOptions(BlastScoringOptions *options, EBlastProgramType program, Boolean greedy_extension, Int4 penalty, Int4 reward, const char *matrix, Int4 gap_open, Int4 gap_extend)
Fill non-default values in the BlastScoringOptions structure.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
BlastScoringOptions * BlastScoringOptionsFree(BlastScoringOptions *options)
Deallocate memory for BlastScoringOptions.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
Utilities initialize/setup BLAST.
Int2 Blast_ScoreBlkMatrixInit(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
Initializes the substitution matrix in the BlastScoreBlk according to the scoring options specified.
Definition: blast_setup.c:330
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzy.
Definition: blast_stat.c:3526
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
Definition: blast_stat.c:2860
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
Definition: blast_stat.c:3835
double BLAST_KarlinStoE_simple(Int4 S, Blast_KarlinBlk *kbp, Int8 searchsp)
Calculates the Expect value based upon the search space and some Karlin-Altschul parameters.
Definition: blast_stat.c:4140
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
Definition: blast_stat.c:2831
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
Definition: blast_stat.c:884
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eVecScreen
Vector screening.
Definition: blast_types.hpp:72
@ eBlastn
Nucl-Nucl (traditional blastn)
Definition: blast_types.hpp:58
@ ePHIBlastn
Nucleotide PHI BLAST.
Definition: blast_types.hpp:70
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ eTblastn
Protein-Translated nucl.
Definition: blast_types.hpp:61
@ eMegablast
Nucl-Nucl (traditional megablast)
Definition: blast_types.hpp:65
@ eDiscMegablast
Nucl-Nucl using discontiguous megablast.
Definition: blast_types.hpp:66
Checksum and hash calculation classes.
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
TSeqPos GetLen(TNumseg seg, int offset=0) const
Definition: alnmap.hpp:621
CDense_seg::TNumseg TNumseg
Definition: alnmap.hpp:72
int StrandSign(TNumrow row) const
Definition: alnmap.hpp:593
TNumseg GetNumSegs(void) const
Definition: alnmap.hpp:510
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
Definition: alnvec.cpp:86
string & GetSegSeqString(string &buffer, TNumrow row, TNumseg seg, TNumseg offset=0) const
Definition: alnvec.hpp:317
CBioseq_Handle –.
Handle to the options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
CChecksum – Checksum calculator.
Definition: checksum.hpp:302
static const CTrans_table & GetTransTable(int id)
int GetGcode(void) const
Definition: Org_ref.cpp:134
A pairwise aln is a collection of ranges for a pair of rows.
CScope –.
Definition: scope.hpp:92
double ComputeScore(CScope &scope, const CSeq_align &align, CSeq_align::EScoreType score)
int GetBlastScoreProtToNucl(CScope &scope, const CSeq_align &align, list< CRef< CPairwiseAln > > &pairs)
int ComputeTieBreaker(const CSeq_align &align)
void AddTracebacks(CScope &scope, CSeq_align &align)
enum blast::EProgram m_BlastType
~CScoreBuilder()
Destructor.
void AddScore(CScope &scope, CSeq_align &align, EScoreType score)
deprecated: use CSeq_align::EScoreType directly
double GetBlastBitScore(CScope &scope, const CSeq_align &align)
Compute the BLAST bit score.
struct BlastScoreBlk * m_ScoreBlk
int GetBlastScore(CScope &scope, const CSeq_align &align)
Compute the BLAST score of the alignment.
void AddTieBreaker(CSeq_align &align)
int GetBlastScoreDenseg(CScope &scope, const CSeq_align &align)
void x_Initialize(blast::CBlastOptionsHandle &options)
double ComputeScore(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, CSeq_align::EScoreType score)
int GetBlastScoreSpliced(CScope &scope, const CSeq_align &align)
Int8 m_EffectiveSearchSpace
int GetBlastScoreStd(CScope &scope, const CSeq_align &align)
string GetTraceback(const CSeq_align &align, CSeq_align::TDim row)
double GetBlastEValue(CScope &scope, const CSeq_align &align)
Compute the BLAST e-value.
CScoreBuilder()
Default constructor.
static SIZE_TYPE Complement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Iupacna
Definition: sequtil.hpp:47
CSeqVector –.
Definition: seq_vector.hpp:65
EScoreType
enum controlling known named scores
Definition: Seq_align.hpp:128
@ eScore_SumEValue
Definition: Seq_align.hpp:171
@ eScore_PercentCoverage
Definition: Seq_align.hpp:168
@ eScore_CompAdjMethod
Definition: Seq_align.hpp:174
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
@ eScore_PercentIdentity
Definition: Seq_align.hpp:189
@ eScore_MismatchCount
Definition: Seq_align.hpp:154
void SetNamedScore(const string &id, int score)
Definition: Seq_align.cpp:636
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
bool IsNa(void) const
Definition: Seq_inst.hpp:106
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int NextCodonState(int state, unsigned char ch)
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
CUser_field & SetField(const string &str, const string &delim=".", const string &obj_subtype=kEmptyStr, NStr::ECase use_case=NStr::eCase)
Access a named field in this user object.
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define BLASTNA_SEQ_CODE
Identifies the blastna alphabet, for use in blast only.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
EBlastProgramType EProgramToEBlastProgramType(EProgram p)
Convert EProgram to EBlastProgramType.
Definition: blast_aux.cpp:709
int GetGapExtensionCost() const
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
EProgram GetProgram() const
Accessors/Mutators for individual options.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
int GetMismatchPenalty() const
int GetMatchReward() const
int GetGapOpeningCost() const
const char * GetMatrixName() const
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
void AddLine(const char *line, size_t len)
Definition: checksum.hpp:609
Uint4 GetChecksum(void) const
Return calculated checksum.
Definition: checksum.hpp:341
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TMol GetSequenceType(void) const
CRef< CSeq_loc > GetRangeSeq_loc(TSeqPos start, TSeqPos stop, ENa_strand strand=eNa_strand_unknown) const
Return CSeq_loc referencing the given range and strand on the bioseq. If start == 0,...
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TMol GetSequenceType(void) const
Definition: seq_vector.hpp:343
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
TId & SetId(void)
Assign a value to Id data member.
Definition: Seq_align_.hpp:982
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
list< CRef< CStd_seg > > TStd
Definition: Seq_align_.hpp:196
bool IsSetExt(void) const
Extra info. Check if a value has been assigned to Ext data member.
Definition: Seq_align_.hpp:989
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
Definition: Seq_align_.cpp:143
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
list< CRef< CUser_object > > TExt
Definition: Seq_align_.hpp:402
bool IsSparse(void) const
Check if variant Sparse is selected.
Definition: Seq_align_.hpp:784
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool CanGetSegs(void) const
Check if it is safe to call GetSegs method.
Definition: Seq_align_.hpp:915
TExt & SetExt(void)
Assign a value to Ext data member.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
const TExt & GetExt(void) const
Get the Ext member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
TBounds & SetBounds(void)
Assign a value to Bounds data member.
Definition: Seq_align_.hpp:957
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:740
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
const int infinity
Definition: nucprot.cpp:52
int i
#define abs(a)
Definition: ncbi_heapmgr.c:130
#define NCBIMATH_LN2
Natural log(2)
Definition: ncbi_math.h:161
T max(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
static void s_RecordMatch(size_t match, string &BTOP, string &flipped_BTOP)
USING_SCOPE(objects)
void s_CleanSeqAlign(CSeq_align &align)
static pair< string, string > s_ComputeTraceback(CScope &scope, const CSeq_align &align)
static const unsigned char reverse_4na[16]
Boolean round_down
Score must be rounded down to nearest even score if odd.
Definition: blast_stat.h:221
char * name
name of scoring matrix.
Definition: blast_stat.h:183
Int4 penalty
penalty for mismatch in blastn.
Definition: blast_stat.h:199
SBlastScoreMatrix * matrix
scoring matrix data
Definition: blast_stat.h:185
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Definition: blast_stat.h:214
Int4 reward
reward for match in blastn.
Definition: blast_stat.h:200
Scoring options block. Used to produce the BlastScoreBlk structure. This structure may be needed for lo...
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
static string subject
static string query
@ TRUE
Definition: testodbc.c:27
else result
Definition: token2.c:20
Modified on Tue Nov 28 02:28:17 2023 by modify_doxy.py rev. 669887