NCBI C++ ToolKit
score_builder_base.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: score_builder_base.cpp 100758 2023-09-07 19:05:43Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 
37 
41 
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/feat_ci.hpp>
45 
56 
57 #include <objmgr/util/sequence.hpp>
61 
64 
65 /// Default constructor
67 : m_ErrorMode(eError_Throw)
68 , m_SubstMatrixName("BLOSUM62")
69 {
70 }
71 
72 /// Destructor
74 {
75 }
76 
77 /// Get length of intersection between a range and a range collection
78 static inline
80  const TSeqRange &range)
81 {
82  TSeqPos length = 0;
83  ITERATE (CRangeCollection<TSeqPos>, it, ranges) {
84  length += it->IntersectionWith(range).GetLength();
85  }
86  return length;
87 }
88 
89 ///
90 /// calculate mismatches and identities in a seq-align
91 ///
92 
93 static void s_GetNucIdentityMismatch(const vector<string>& data,
94  int* identities,
95  int* mismatches)
96 {
97  if ( data.empty() ) {
98  return;
99  }
100  size_t rows = data.size();
101  size_t size = data[0].size();
102  for (size_t i = 1; i < rows; ++i ) {
103  if ( data[i].size() != size ) {
104  NCBI_THROW(CSeqalignException, eInvalidInputData,
105  "Rows have different lengths");
106  }
107  }
108  for (size_t a = 0; a < size; ++a) {
109  bool is_mismatch = false;
110  char c = data[0][a];
111  for (size_t b = 1; b < rows; ++b) {
112  if (data[b][a] != c) {
113  is_mismatch = true;
114  break;
115  }
116  }
117 
118  if (is_mismatch) {
119  ++(*mismatches);
120  } else {
121  ++(*identities);
122  }
123  }
124 }
125 
126 
128  const CSeq_align& align,
129  const CRangeCollection<TSeqPos> &ranges,
130  int* identities,
131  int* mismatches)
132 {
133  ///
134  /// easy route
135  /// use the alignment manager
136  ///
137  TAlnSeqIdIRef id1(new CAlnSeqId(align.GetSeq_id(0)));
138  TAlnSeqIdIRef id2(new CAlnSeqId(align.GetSeq_id(1)));
139  CRef<CPairwiseAln> pairwise(new CPairwiseAln(id1, id2));
140  ConvertSeqAlignToPairwiseAln(*pairwise, align, 0, 1);
141 
142  CBioseq_Handle prod_bsh = scope.GetBioseqHandle(align.GetSeq_id(0));
143  CBioseq_Handle genomic_bsh = scope.GetBioseqHandle(align.GetSeq_id(1));
144  if ( !prod_bsh || !genomic_bsh ) {
145  const CSeq_id &failed_id = align.GetSeq_id(genomic_bsh ? 0 : 1);
146  NCBI_THROW(CSeqalignException, eInvalidSeqId,
147  "Can't get sequence data for " + failed_id.AsFastaString() +
148  " in order to count identities/mismatches");
149  }
150 
152 
153  switch (align.GetSegs().GetSpliced().GetProduct_type()) {
155  {{
157  ITERATE (CPairwiseAln, it, *pairwise) {
158  const CPairwiseAln::TAlnRng& range = *it;
159  TSeqRange r1(range.GetFirstFrom(), range.GetFirstTo());
160  TSeqRange r2(range.GetSecondFrom(), range.GetSecondTo());
161  string prod_data;
162  prod.GetSeqData(r1.GetFrom(), r1.GetTo() + 1, prod_data);
163  string gen_data;
164  gen.GetSeqData(r2.GetFrom(), r2.GetTo() + 1, gen_data);
165  if (range.IsReversed()) {
168  0, gen_data.size());
169  }
170 
171  CRangeCollection<TSeqPos> seg_ranges = ranges;
172  seg_ranges.IntersectWith(r1);
173  ITERATE (CRangeCollection<TSeqPos>, range_it, seg_ranges) {
174  TSeqPos start_offset = range_it->GetFrom() - r1.GetFrom(),
175  end_offset = range_it->GetToOpen() - r1.GetFrom();
176  string::const_iterator pit = prod_data.begin()
177  + start_offset;
178  string::const_iterator pit_end = prod_data.begin()
179  + end_offset;
180  string::const_iterator git = gen_data.begin()
181  + start_offset;
182  string::const_iterator git_end = gen_data.begin()
183  + end_offset;
184 
185  for ( ; pit != pit_end && git != git_end; ++pit, ++git)
186  {
187  bool match = (*pit == *git);
188  *identities += match;
189  *mismatches += !match;
190  }
191  }
192  }
193  }}
194  break;
195 
197  {{
198  int gcode = 1;
199  try {
200  const COrg_ref& org_ref = sequence::GetOrg_ref(genomic_bsh);
201  gcode = org_ref.GetOrgname().GetGcode();
202  }
203  catch (CException&) {
204  }
205  const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
206 
207  char codon[3];
208  codon[0] = codon[1] = codon[2] = 'N';
209 
210  TSeqRange last_r1(0, 0);
211  ITERATE (CPairwiseAln, it, *pairwise) {
212  const CPairwiseAln::TAlnRng& range = *it;
213  TSeqRange r1(range.GetFirstFrom(), range.GetFirstTo());
214  TSeqRange r2(range.GetSecondFrom(), range.GetSecondTo());
215 
216  if (last_r1.GetTo() + 1 != r1.GetFrom()) {
217  size_t i = last_r1.GetTo() + 1;
218  size_t count = 0;
219  for ( ; i != r1.GetFrom() && count < 3; ++i, ++count) {
220  codon[ i % 3 ] = 'N';
221  }
222  }
223  last_r1 = r1;
224 
225  string gen_data;
227  gen.GetSeqData(r2.GetFrom(), r2.GetTo() + 1, gen_data);
228  if (range.IsReversed()) {
231  0, gen_data.size());
232 
233  //LOG_POST(Error << "reverse range: [" << r1.GetFrom() << ", " << r1.GetTo() << "] - [" << r2.GetFrom() << ", " << r2.GetTo() << "]");
234  } else {
235  //LOG_POST(Error << "forward range: [" << r1.GetFrom() << ", " << r1.GetTo() << "] - [" << r2.GetFrom() << ", " << r2.GetTo() << "]");
236  }
237 
238  /// compare product range to conceptual translation
239  TSeqPos prod_pos = r1.GetFrom();
240  //LOG_POST(Error << " genomic = " << gen_data);
241  for (size_t i = 0; i < gen_data.size(); ++i, ++prod_pos) {
242  codon[ prod_pos % 3 ] = gen_data[i];
243  //LOG_POST(Error << " filling: " << prod_pos << ": " << prod_pos % 3 << ": " << gen_data[i]);
244 
245  if (prod_pos % 3 == 2) {
246  int state = tbl.SetCodonState(codon[0], codon[1], codon[2]);
247  char residue = (prod_pos == 2
248  ? tbl.GetStartResidue(state)
249  : tbl.GetCodonResidue(state));
250 
251  /// NOTE:
252  /// we increment identities/mismatches by 3 here,
253  /// counting identities in nucleotide space!!
254  if (residue == prod[prod_pos / 3] &&
255  residue != 'X' && residue != '-') {
256  *identities += 3;
257  } else {
258  *mismatches += 3;
259  }
260  }
261  }
262  }
263  }}
264  break;
265 
266  default:
267  break;
268  }
269 
270  /*
271  * NB: leave this here; it's useful for validation
272  int actual_identities = 0;
273  if (align.GetNamedScore("N of matches", actual_identities)) {
274  if (actual_identities != *identities) {
275  LOG_POST(Error << "actual identities: " << actual_identities
276  << " computed identities: " << *identities);
277 
278  //cerr << MSerial_AsnText << align;
279  }
280  }
281  **/
282 }
283 
284 
285 static void s_GetCountIdentityMismatch(CScope& scope, const CSeq_align& align,
286  int* identities, int* mismatches,
287  const CRangeCollection<TSeqPos> &ranges =
289 {
290  _ASSERT(identities && mismatches);
291  if (ranges.empty()) {
292  return;
293  }
294 
295  {{
296  ///
297  /// shortcut: if 'num_ident' is present, we trust it
298  ///
299  int num_ident = 0;
300  if (ranges.begin()->IsWhole() &&
302  {
303  size_t len = align.GetAlignLength(false /*ignore gaps*/);
304  *identities += num_ident;
305  *mismatches += (len - num_ident);
306  return;
307  }
308  }}
309 
310  switch (align.GetSegs().Which()) {
312  {{
313  const CDense_seg& ds = align.GetSegs().GetDenseg();
314  vector<string> data;
315  CAlnVec vec(ds, scope);
316  data.resize(vec.GetNumRows());
317  for (int seg = 0; seg < vec.GetNumSegs(); ++seg) {
318  bool has_gap = false;
319  for (int i = 0; !has_gap && i < vec.GetNumRows(); ++i) {
320  if (vec.GetStart(i, seg) == -1) {
321  has_gap = true;
322  }
323  }
324  if (has_gap) {
325  /// we compute ungapped identities
326  /// gap on at least one row, so we skip this segment
327  continue;
328  }
329 
330  TSeqPos seg_start = vec.GetStart(0, seg),
331  seg_stop = vec.GetStop(0, seg);
332  CRangeCollection<TSeqPos> seg_ranges = ranges;
333  seg_ranges.IntersectWith(TSeqRange(seg_start, seg_stop));
334  for (int i = 0; i < vec.GetNumRows(); ++i) {
335  TSeqPos offset = vec.GetStart(i, seg) - seg_start;
336  ITERATE (CRangeCollection<TSeqPos>, range_it, seg_ranges) {
337  string seq_string;
338  vec.GetSeqString(seq_string, i,
339  range_it->GetFrom()+offset,
340  range_it->GetTo()+offset);
341  data[i] += seq_string;
342  }
343  }
344  }
345  s_GetNucIdentityMismatch(data, identities, mismatches);
346  }}
347  break;
348 
350  {{
352  align.GetSegs().GetDisc().Get()) {
353  s_GetCountIdentityMismatch(scope, **iter,
354  identities, mismatches, ranges);
355  }
356  }}
357  break;
358 
360  NCBI_THROW(CSeqalignException, eNotImplemented,
361  "identity + mismatch function not implemented for std-seg");
362  break;
363 
365  {{
366  int aln_identities = 0;
367  int aln_mismatches = 0;
368  bool has_non_standard = false;
370  align.GetSegs().GetSpliced().GetExons()) {
371  const CSpliced_exon& exon = **iter;
372  TSeqRange product_span;
373  product_span.Set(exon.GetProduct_start().AsSeqPos(),
374  exon.GetProduct_end().AsSeqPos());
375  if (exon.IsSetParts()) {
376  TSeqPos part_start = product_span.GetFrom();
377  ITERATE (CSpliced_exon::TParts, it, exon.GetParts()) {
378  const CSpliced_exon_chunk& chunk = **it;
379  int part_len = 0;
380  switch (chunk.Which()) {
382  part_len = chunk.GetMatch();
383  aln_identities += s_IntersectionLength(ranges,
384  TSeqRange(part_start,
385  part_start+part_len-1));
386  break;
387 
389  part_len = chunk.GetMismatch();
390  aln_mismatches += s_IntersectionLength(ranges,
391  TSeqRange(part_start,
392  part_start+part_len-1));
393  break;
394 
396  part_len = chunk.GetDiag();
397  if (s_IntersectionLength(ranges,
398  TSeqRange(part_start,
399  part_start+part_len-1)))
400  {
401  has_non_standard = true;
402  }
403  break;
404 
406  part_len = chunk.GetProduct_ins();
407  break;
408 
409  default:
410  break;
411  }
412  part_start += part_len;
413  }
414  } else {
415  has_non_standard = true;
416  break;
417  }
418  }
419  if ( !has_non_standard ) {
420  *identities += aln_identities;
421  *mismatches += aln_mismatches;
422  break;
423  }
424 
425  /// we must compute match and mismatch based on first
426  /// principles. Sometimes loader will fail in getting
427  /// all components of the genomic sequence; in that case
428  /// throw an exception, but make it somewhat more informative
429  try {
430  s_GetSplicedSegIdentityMismatch(scope, align, ranges,
431  identities, mismatches);
432  } catch (CLoaderException &e) {
434  "Can't calculate identities/mismatches for "
435  "alignment with genomic sequence " +
436  align.GetSeq_id(1).AsFastaString() +
437  "; Loader can't load all required "
438  "components of sequence");
439  }
440  }}
441  break;
442 
443  default:
444  _ASSERT(false);
445  break;
446  }
447 }
448 
449 ///
450 /// calculate the percent identity
451 /// we also return the count of identities and mismatches
452 ///
453 static void s_GetPercentIdentity(CScope& scope, const CSeq_align& align,
454  int* identities,
455  int* mismatches,
456  double* pct_identity,
458  const CRangeCollection<TSeqPos> &ranges =
460 {
461  size_t count_aligned = 0;
462  switch (type) {
464  count_aligned = align.GetAlignLengthWithinRanges(ranges, true /* include gaps */);
465  break;
466 
468  count_aligned = align.GetAlignLengthWithinRanges(ranges, false /* omit gaps */);
469  break;
470 
472  count_aligned = align.GetAlignLengthWithinRanges(ranges, false /* omit gaps */);
473  count_aligned += align.GetNumGapOpeningsWithinRanges(ranges);
474  break;
475  }
476 
477  s_GetCountIdentityMismatch(scope, align, identities, mismatches, ranges);
478  if (count_aligned) {
479  *pct_identity = 100.0f * double(*identities) / count_aligned;
480  } else {
481  *pct_identity = 0;
482  }
483 }
484 
485 
486 ///
487 /// calculate the percent coverage
488 ///
489 static bool s_SequenceIsProtein(CScope& scope,
490  const CSeq_id& id)
491 {
492  CSeq_inst::TMol mol = scope.GetSequenceType(id);
493  if (mol == CSeq_inst::eMol_not_set) {
494  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
495  if ( !bsh ) {
497  "failed to retrieve sequence: " + id.AsFastaString());
498  }
499  return bsh.IsAa();
500  }
501 
502  return (mol == CSeq_inst::eMol_aa);
503 }
504 
505 
506 static bool s_IsProteinToGenomic(CScope& scope,
507  const CSeq_align& align)
508 {
509  if (align.GetSegs().IsSpliced()) {
510  return align.GetSegs().GetSpliced()
512  }
513 
514  if (align.GetSegs().IsDenseg()) {
515  const CDense_seg& seg = align.GetSegs().GetDenseg();
516  if (seg.IsSetWidths()) {
517  // FIXME: I can't remember what the rule for widths is
518  //
519  }
520  else {
521  // we must be protein-to-protein or nuc-to-nuc
522  return false;
523  }
524  }
525 
526  // our short-cuts are exhausted
527  // fall back to a check of sequence type
528  const CSeq_id& id0 = align.GetSeq_id(0);
529  if ( !s_SequenceIsProtein(scope, id0) ) {
530  return false;
531  }
532  const CSeq_id& id1 = align.GetSeq_id(1);
533  return s_SequenceIsProtein(scope, id1);
534 }
535 
536 
537 static void s_GetPercentCoverage(CScope& scope, const CSeq_align& align,
538  const CRangeCollection<TSeqPos>& ranges,
539  double* pct_coverage, unsigned query = 0)
540 {
541  if (!ranges.empty() && ranges.begin()->IsWhole() &&
543  *pct_coverage)) {
544  return;
545  }
546 
547  size_t covered_bases = align.GetAlignLengthWithinRanges
548  (ranges, false /* don't include gaps */);
549  size_t seq_len = 0;
550  if(ranges.empty() || !ranges.begin()->IsWhole()){
551  seq_len = ranges.GetCoveredLength();
552  } else {
553  if (align.GetSegs().IsSpliced() &&
555  {
556  seq_len = align.GetSegs().GetSpliced().GetProduct_length();
557  } else {
558  const auto &query_id = align.GetSeq_id(query);
559  const objects::CBioseq_Handle& bsh_seq = scope.GetBioseqHandle(query_id);
560  if (!bsh_seq) {
561  *pct_coverage = 0;
562  NCBI_THROW(CSeqalignException, eInvalidSeqId,
563  "Can't get sequence data for " + query_id.AsFastaString() +
564  " in order to calculate coverage");
565  }
566  seq_len = bsh_seq.GetBioseqLength();
567  }
568  if (align.GetSegs().IsSpliced() &&
569  align.GetSegs().GetSpliced().IsSetPoly_a()) {
570 
571  if (align.GetSegs().GetSpliced().IsSetProduct_strand() &&
573  seq_len -= align.GetSegs().GetSpliced().GetPoly_a();
574  } else {
575  seq_len = align.GetSegs().GetSpliced().GetPoly_a();
576  }
577  }
578 
579 
580  //
581  // determine if the alignment is protein-to-genomic
582  //
583  bool is_protein_to_genomic = s_IsProteinToGenomic(scope, align);
584  if (is_protein_to_genomic) {
585  /// alignment is protein-to-genomic alignment
586  /// NOTE: alignment length is always reported in nucleotide
587  /// coordinates
588  seq_len *= 3;
589  if (align.GetSegs().IsStd()) {
590  /// odd corner case:
591  /// std-seg alignments of protein to nucleotide
592  covered_bases *= 3;
593  }
594  }
595  }
596 
597  if (covered_bases) {
598  *pct_coverage = 100.0f * double(covered_bases) / double(seq_len);
599  } else {
600  *pct_coverage = 0;
601  }
602 }
603 
604 /////////////////////////////////////////////////////////////////////////////
606  const CSeq_align& align,
607  int* positives, int* negatives)
608 {
609  if (!align.GetSegs().IsSpliced() ||
610  align.GetSegs().GetSpliced().GetProduct_type() !=
612  {
613  NCBI_THROW(CSeqalignException, eUnsupported,
614  "num_positives and num_negatives scores only defined "
615  "for protein alignment");
616  }
617  CProteinAlignText pro_text(scope, align, m_SubstMatrixName);
618  const string& prot = pro_text.GetProtein();
619  const string& dna = pro_text.GetDNA();
620  const string& match = pro_text.GetMatch();
621  for(string::size_type i=0;i<match.size(); ++i) {
622  if( isalpha(prot[i]) && (dna[i] != '-')) {
623  int increment = isupper(prot[i]) ? 3 : 1;
624  switch(match[i]) {
625  case '|':
626  case '+':
627  *positives += increment;
628  break;
629  case 'X': /// skip introns and bad parts
630  break;
631  default://mismatch
632  *negatives += increment;
633  break;
634  }
635  }
636  }
637 }
638 
639 
640 
641 
642 void CScoreBuilderBase::SetSubstMatrix(const string &name)
643 {
644  m_SubstMatrixName = name;
645 }
646 
648  const CSeq_align& align,
650 {
651  int identities = 0;
652  int mismatches = 0;
653  double pct_identity = 0;
654  s_GetPercentIdentity(scope, align,
655  &identities, &mismatches, &pct_identity, type);
656  return pct_identity;
657 }
658 
659 
661  const CSeq_align& align,
662  const TSeqRange &range,
664 {
665  int identities = 0;
666  int mismatches = 0;
667  double pct_identity = 0;
668  s_GetPercentIdentity(scope, align,
669  &identities, &mismatches, &pct_identity, type,
671  return pct_identity;
672 }
673 
674 
676  const CSeq_align& align,
677  const CRangeCollection<TSeqPos> &ranges,
679 {
680  int identities = 0;
681  int mismatches = 0;
682  double pct_identity = 0;
683  s_GetPercentIdentity(scope, align,
684  &identities, &mismatches, &pct_identity, type, ranges);
685  return pct_identity;
686 }
687 
688 
690  const CSeq_align& align,
691  unsigned query)
692 {
693  double pct_coverage = 0;
694  s_GetPercentCoverage(scope, align,
696  &pct_coverage,
697  query);
698  return pct_coverage;
699 }
700 
702  const CSeq_align& align,
703  const TSeqRange& range,
704  unsigned query)
705 {
706  double pct_coverage = 0;
707  s_GetPercentCoverage(scope, align, CRangeCollection<TSeqPos>(range), &pct_coverage, query);
708  return pct_coverage;
709 }
710 
712  const CSeq_align& align,
713  const CRangeCollection<TSeqPos>& ranges,
714  unsigned query)
715 {
716  double pct_coverage = 0;
717  s_GetPercentCoverage(scope, align, ranges, &pct_coverage, query);
718  return pct_coverage;
719 }
720 
722 {
723  int identities = 0;
724  int mismatches = 0;
725  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches);
726  return identities;
727 }
728 
729 
731 {
732  int identities = 0;
733  int mismatches = 0;
734  s_GetCountIdentityMismatch(scope, align, &identities,&mismatches);
735  return mismatches;
736 }
737 
738 
740  int& identities, int& mismatches)
741 {
742  identities = 0;
743  mismatches = 0;
744  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches);
745 }
746 
747 
749  const TSeqRange& range)
750 {
751  int identities = 0;
752  int mismatches = 0;
753  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches,
755  return identities;
756 }
757 
758 
760  const TSeqRange& range)
761 {
762  int identities = 0;
763  int mismatches = 0;
764  s_GetCountIdentityMismatch(scope, align, &identities,&mismatches,
766  return mismatches;
767 }
768 
769 
771  const TSeqRange& range,
772  int& identities, int& mismatches)
773 {
774  identities = 0;
775  mismatches = 0;
776  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches,
778 }
779 
780 
782  const CRangeCollection<TSeqPos> &ranges)
783 {
784  int identities = 0;
785  int mismatches = 0;
786  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches, ranges);
787  return identities;
788 }
789 
790 
792  const CRangeCollection<TSeqPos> &ranges)
793 {
794  int identities = 0;
795  int mismatches = 0;
796  s_GetCountIdentityMismatch(scope, align, &identities,&mismatches, ranges);
797  return mismatches;
798 }
799 
800 
802  const CRangeCollection<TSeqPos> &ranges,
803  int& identities, int& mismatches)
804 {
805  identities = 0;
806  mismatches = 0;
807  s_GetCountIdentityMismatch(scope, align, &identities, &mismatches, ranges);
808 }
809 
810 
812 {
813  int positives = 0;
814  int negatives = 0;
815  x_GetMatrixCounts(scope, align, &positives, &negatives);
816  return positives;
817 }
818 
819 
821 {
822  int positives = 0;
823  int negatives = 0;
824  x_GetMatrixCounts(scope, align, &positives, &negatives);
825  return negatives;
826 }
827 
828 
830  int& positives, int& negatives)
831 {
832  positives = 0;
833  negatives = 0;
834  x_GetMatrixCounts(scope, align, &positives, &negatives);
835 }
836 
837 
839 {
840  return align.GetTotalGapCount();
841 }
842 
843 
845 {
846  return align.GetNumGapOpenings();
847 }
848 
849 
851 {
852  return align.GetAlignLength( !ungapped /* true = include gaps = !ungapped */);
853 }
854 
855 
857  const TSeqRange &range)
858 {
859  return align.GetTotalGapCountWithinRange(range);
860 }
861 
862 
864  const TSeqRange &range)
865 {
866  return align.GetNumGapOpeningsWithinRange(range);
867 }
868 
869 
871  const TSeqRange &range,
872  bool ungapped)
873 {
874  return align.GetAlignLengthWithinRange(range, !ungapped
875  /* true = include gaps = !ungapped */);
876 }
877 
878 
880  const CRangeCollection<TSeqPos> &ranges)
881 {
882  return align.GetTotalGapCountWithinRanges(ranges);
883 }
884 
885 
887  const CRangeCollection<TSeqPos> &ranges)
888 {
889  return align.GetNumGapOpeningsWithinRanges(ranges);
890 }
891 
892 
894  const CRangeCollection<TSeqPos> &ranges,
895  bool ungapped)
896 {
897  return align.GetAlignLengthWithinRanges(ranges, !ungapped
898  /* true = include gaps = !ungapped */);
899 }
900 
901 
902 /////////////////////////////////////////////////////////////////////////////
903 
906 {
907  NON_CONST_ITERATE (list< CRef<CSeq_align> >, iter, aligns) {
908  AddScore(scope, **iter, score);
909  }
910 }
911 
914 {
915  try {
916  switch (score) {
917  /// Special cases for the three percent-identity scores, to add
918  /// the num_ident and num_mismatch scores as well
922  {{
923  int identities = 0;
924  int mismatches = 0;
925  double pct_identity = 0;
926  s_GetPercentIdentity(scope, align, &identities, &mismatches,
927  &pct_identity,
928  static_cast<EPercentIdentityType>(
930  align.SetNamedScore(score, pct_identity);
933  }}
934  break;
935 
936  default:
937  {{
938  align.ResetNamedScore(score);
939  double score_value = ComputeScore(scope, align, score);
940  if (CSeq_align::IsIntegerScore(score)) {
941  align.SetNamedScore(score, (int)score_value);
942  } else {
943  if (score_value == numeric_limits<double>::infinity()) {
944  score_value = numeric_limits<double>::max() / 10.0;
945  }
946  align.SetNamedScore(score, score_value);
947  }
948  }}
949  }
950  } catch (CSeqalignException& e) {
951  // Unimplemented (code missing) or unsupported (score cannot be defined)
952  // is handled according to the error handling mode. All other
953  // errors always throw.
954  switch (e.GetErrCode()) {
957  break;
958  default:
959  throw;
960  }
961 
962  switch (GetErrorMode()) {
963  case eError_Throw:
964  throw;
965  case eError_Report:
967  << "CScoreBuilderBase::AddScore(): error computing score: "
968  << e);
969  default:
970  break;
971  }
972  }
973 }
974 
975 string GetDonor(const objects::CSpliced_exon& exon) {
976  if( exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() ) {
977  return exon.GetDonor_after_exon().GetBases();
978  }
979  return string();
980 }
981 
982 string GetAcceptor(const objects::CSpliced_exon& exon) {
983  if( exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() ) {
984  return exon.GetAcceptor_before_exon().GetBases();
985  }
986  return string();
987 }
988 
/// Tell whether a donor/acceptor pair forms a consensus splice.
///
/// Only the first two characters of each argument are inspected, and the
/// comparison ignores case.  The accepted donor/acceptor pairs are
/// GT/AG, GC/AG and AT/AC.  Any other pair, and any argument shorter than
/// two characters, yields false.
bool IsConsSplice(const string& donor, const string acc) {
    if (donor.length() < 2  ||  acc.length() < 2) {
        return false;
    }

    // go through unsigned char so toupper() never sees a negative value
    const int acc_1st = toupper(static_cast<unsigned char>(acc[0]));
    const int acc_2nd = toupper(static_cast<unsigned char>(acc[1]));
    const int don_1st = toupper(static_cast<unsigned char>(donor[0]));
    const int don_2nd = toupper(static_cast<unsigned char>(donor[1]));

    if (acc_1st != 'A') {
        return false;
    }
    if (acc_2nd == 'C') {
        // acceptor AC pairs only with donor AT
        return don_1st == 'A'  &&  don_2nd == 'T';
    }
    if (acc_2nd == 'G') {
        // acceptor AG pairs with donor GT or GC
        return don_1st == 'G'  &&  (don_2nd == 'T'  ||  don_2nd == 'C');
    }
    return false;
}
1011 
1012 
1014  CSeq_align::TScore &scores)
1015 {
1016  typedef CSeq_align::TSegs::TSpliced TSpliced;
1017  const TSpliced & spliced (align.GetSegs().GetSpliced());
1018  if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
1019  NCBI_THROW(CSeqalignException, eUnsupported,
1020  "CScoreBuilderBase::AddSplignScores(): Unsupported product type");
1021  }
1022 
1023  const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
1024 
1025  typedef TSpliced::TExons TExons;
1026  const TExons & exons (spliced.GetExons());
1027 
1028  TSeqPos matches (0),
1029  aligned_query_bases (0), // matches, mismatches and indels
1030  aln_length_exons (0),
1031  aln_length_gaps (0),
1032  splices_total (0), // twice the number of introns
1033  splices_consensus (0);
1034 
1035  const TSeqPos qlen (spliced.GetProduct_length());
1036  const TSeqPos polya (spliced.CanGetPoly_a()?
1037  spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
1038  const TSeqPos prod_length_no_polya (qstrand? polya: qlen - 1 - polya);
1039 
1040  typedef CSpliced_exon TExon;
1041  TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
1042  string donor;
1043  ITERATE(TExons, ii2, exons) {
1044 
1045  const TExon & exon (**ii2);
1046  const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
1047  qmax (exon.GetProduct_end().GetNucpos());
1048 
1049  const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
1050 
1051  if(qgap > 0) {
1052  aln_length_gaps += qgap;
1053  donor.clear();
1054  }
1055  else if (ii2 != exons.begin()) {
1056  splices_total += 2;
1057  if(IsConsSplice(donor, GetAcceptor(exon))) { splices_consensus += 2; }
1058  }
1059 
1060  typedef TExon::TParts TParts;
1061  const TParts & parts (exon.GetParts());
1062  string errmsg;
1063  ITERATE(TParts, ii3, parts) {
1064  const CSpliced_exon_chunk & part (**ii3);
1065  const CSpliced_exon_chunk::E_Choice choice (part.Which());
1066  TSeqPos len (0);
1067  switch(choice) {
1069  len = part.GetMatch();
1070  matches += len;
1071  aligned_query_bases += len;
1072  break;
1074  len = part.GetMismatch();
1075  aligned_query_bases += len;
1076  break;
1078  len = part.GetProduct_ins();
1079  aligned_query_bases += len;
1080  break;
1082  len = part.GetGenomic_ins();
1083  break;
1084  default:
1085  errmsg = "Unexpected spliced exon chunk part: "
1086  + part.SelectionName(choice);
1087  NCBI_THROW(CSeqalignException, eUnsupported, errmsg);
1088  }
1089  aln_length_exons += len;
1090  }
1091 
1092  donor = GetDonor(exon);
1093  qprev = qstrand? qmax: qmin;
1094  } // TExons
1095 
1096  const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
1097  aln_length_gaps += qgap;
1098 
1099  for (CSeq_align::TScore::iterator it = scores.begin(); it != scores.end(); )
1100  {
1102  if ((*it)->GetId().IsStr()) {
1105  . find((*it)->GetId().GetStr());
1106  if (score != CSeq_align::ScoreNameMap().end()) {
1107  score_type = score->second;
1108  }
1109  }
1110  if (score_type >= CSeq_align::eScore_Matches &&
1111  score_type <= CSeq_align::eScore_ExonIdentity)
1112  {
1113  it = scores.erase(it);
1114  } else {
1115  ++it;
1116  }
1117  }
1118 
1119  {
1120  CRef<CScore> score_matches (new CScore());
1121  score_matches->SetId().SetStr(
1123  score_matches->SetValue().SetInt(matches);
1124  scores.push_back(score_matches);
1125  }
1126  {
1127  CRef<CScore> score_overall_identity (new CScore());
1128  score_overall_identity->SetId().SetStr(
1130  score_overall_identity->SetValue().
1131  SetReal(double(matches)/(aln_length_exons + aln_length_gaps));
1132  scores.push_back(score_overall_identity);
1133  }
1134  {
1135  CRef<CScore> score_splices (new CScore());
1136  score_splices->SetId().SetStr(
1138  score_splices->SetValue().SetInt(splices_total);
1139  scores.push_back(score_splices);
1140  }
1141  {
1142  CRef<CScore> score_splices_consensus (new CScore());
1143  score_splices_consensus->SetId().SetStr(
1145  score_splices_consensus->SetValue().SetInt(splices_consensus);
1146  scores.push_back(score_splices_consensus);
1147  }
1148  {
1149  CRef<CScore> score_coverage (new CScore());
1150  score_coverage->SetId().SetStr(
1152  score_coverage->SetValue().
1153  SetReal(double(aligned_query_bases) / prod_length_no_polya);
1154  scores.push_back(score_coverage);
1155  }
1156  {
1157  CRef<CScore> score_exon_identity (new CScore());
1158  score_exon_identity->SetId().SetStr(
1160  score_exon_identity->SetValue().
1161  SetReal(double(matches) / aln_length_exons);
1162  scores.push_back(score_exon_identity);
1163  }
1164 
1165 }
1166 
1168  CSeq_align::EScoreType score)
1169 {
1170  return ComputeScore(scope, align,
1172 }
1173 
1175  const TSeqRange &range,
1176  CSeq_align::EScoreType score)
1177 {
1178  return ComputeScore(scope, align, CRangeCollection<TSeqPos>(range), score);
1179 }
1180 
1182  const CRangeCollection<TSeqPos> &ranges,
1183  CSeq_align::EScoreType score)
1184 {
1185  switch (score) {
1187  {{
1188  NCBI_THROW(CSeqalignException, eUnsupported,
1189  "CScoreBuilderBase::ComputeScore(): "
1190  "generic 'score' computation is undefined");
1191  }}
1192  break;
1193 
1199  NCBI_THROW(CSeqalignException, eNotImplemented,
1200  "CScoreBuilderBase::ComputeScore(): "
1201  "BLAST scores are available in CScoreBuilder, "
1202  "not CScoreBuilderBase");
1203  break;
1204 
1206  return GetIdentityCount(scope, align, ranges);
1207 
1209  if (ranges.empty() || !ranges.begin()->IsWhole()) {
1210  NCBI_THROW(CSeqalignException, eNotImplemented,
1211  "positive-count score not supported within a range");
1212  }
1213  return GetPositiveCount(scope, align);
1214 
1216  if (ranges.empty() || !ranges.begin()->IsWhole()) {
1217  NCBI_THROW(CSeqalignException, eNotImplemented,
1218  "positive-count score not supported within a range");
1219  }
1220  return GetNegativeCount(scope, align);
1221 
1223  return GetMismatchCount(scope, align, ranges);
1224 
1226  return GetGapCount(align, ranges);
1227 
1229  return align.GetAlignLengthWithinRanges(ranges, true /* include gaps */);
1230 
1232  {{
1233  int identities = 0;
1234  int mismatches = 0;
1235  double pct_identity = 0;
1236  s_GetPercentIdentity(scope, align,
1237  &identities, &mismatches, &pct_identity,
1238  eGapped, ranges);
1239  return pct_identity;
1240  }}
1241  break;
1242 
1244  {{
1245  int identities = 0;
1246  int mismatches = 0;
1247  double pct_identity = 0;
1248  s_GetPercentIdentity(scope, align,
1249  &identities, &mismatches, &pct_identity,
1250  eUngapped, ranges);
1251  return pct_identity;
1252  }}
1253  break;
1254 
1256  {{
1257  int identities = 0;
1258  int mismatches = 0;
1259  double pct_identity = 0;
1260  s_GetPercentIdentity(scope, align,
1261  &identities, &mismatches, &pct_identity,
1262  eGBDNA, ranges);
1263  return pct_identity;
1264  }}
1265  break;
1266 
1268  {{
1269  double pct_coverage = 0;
1270  s_GetPercentCoverage(scope, align, ranges, &pct_coverage);
1271  return pct_coverage;
1272  }}
1273  break;
1274 
1276  {{
1277  if(align.GetSegs().Which() == CSeq_align::TSegs::e_Std)
1278  /// high-quality-coverage calculation is not possible for standard segs
1279  NCBI_THROW(CSeqalignException, eUnsupported,
1280  "High-quality percent coverage not supported "
1281  "for standard seg representation");
1282 
1283  if (ranges.empty() || !ranges.begin()->IsWhole()) {
1284  NCBI_THROW(CSeqalignException, eNotImplemented,
1285  "High-quality percent coverage not supported "
1286  "within a range");
1287  }
1288  /// If we have annotation for a high-quality region, it is in a ftable named
1289  /// "NCBI_GPIPE", containing a region Seq-feat named "alignable"
1290  TSeqRange alignable_range = TSeqRange::GetWhole();
1291  CBioseq_Handle query = scope.GetBioseqHandle(align.GetSeq_id(0));
1292  for(CFeat_CI feat_it(query,
1294  SetExcludeExternal());
1295  feat_it; ++feat_it)
1296  {
1297  if(feat_it->GetData().GetRegion() == "alignable" &&
1298  feat_it->GetAnnot().IsNamed() &&
1299  feat_it->GetAnnot().GetName() == "NCBI_GPIPE")
1300  {
1301  alignable_range = feat_it->GetRange();
1302  break;
1303  }
1304  }
1305  double pct_coverage = 0;
1306  s_GetPercentCoverage(scope, align,
1307  CRangeCollection<TSeqPos>(alignable_range),
1308  &pct_coverage);
1309  return pct_coverage;
1310  }}
1311  break;
1312 
1319  {{
1320  if (ranges.empty() || !ranges.begin()->IsWhole()) {
1321  NCBI_THROW(CSeqalignException, eNotImplemented,
1322  "splign scores not supported within a range");
1323  }
1324  CSeq_align::TScore scores;
1325  AddSplignScores(align, scores);
1326  ITERATE (CSeq_align::TScore, it, scores) {
1327  if ((*it)->GetId().GetStr() == CSeq_align::ScoreName(score))
1328  {
1329  if ((*it)->GetValue().IsInt()) {
1330  return (*it)->GetValue().GetInt();
1331  } else {
1332  return (*it)->GetValue().GetReal();
1333  }
1334  }
1335  }
1336  NCBI_ASSERT(false, "Should never reach this point");
1337  }}
1338 
1339  default:
1340  {{
1341  NCBI_THROW(CSeqalignException, eNotImplemented,
1342  "Unknown score");
1343  return 0;
1344  }}
1345  }
1346 }
1347 
1348 
1349 
1350 
1351 
1353 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void ConvertSeqAlignToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSeq_align &sa, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a seq-align.
CAlignRange Represents an element of pairwise alignment of two sequences.
Definition: align_range.hpp:63
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:635
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
TNumseg GetNumSegs(void) const
Definition: alnmap.hpp:510
Default IAlnSeqId implementation based on CSeq_id_Handle.
Definition: aln_seqid.hpp:116
string & GetSeqString(string &buffer, TNumrow row, TSeqPos seq_from, TSeqPos seq_to) const
Definition: alnvec.hpp:288
CBioseq_Handle –.
bool IsSetWidths(void) const
Definition: Dense_seg.hpp:196
CFeat_CI –.
Definition: feat_ci.hpp:64
static const CTrans_table & GetTransTable(int id)
Data loader exceptions, used by GenBank loader.
A pairwise aln is a collection of ranges for a pair of rows.
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
Text representation of ProSplign alignment.
Definition: alntext.hpp:60
const string & GetDNA() const
Definition: alntext.hpp:77
const string & GetMatch() const
Definition: alntext.hpp:79
const string & GetProtein() const
Definition: alntext.hpp:80
TThisType & IntersectWith(const TRange &r)
Definition: range_coll.hpp:173
bool empty() const
Definition: range_coll.hpp:102
const_iterator begin() const
Definition: range_coll.hpp:82
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
Definition: range_coll.hpp:157
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
int GetPositiveCount(CScope &scope, const CSeq_align &align)
counts based on substitution matrix for protein alignments
TSeqPos GetAlignLength(const CSeq_align &align, bool ungapped=false)
Compute the length of the alignment (= length of all segments, gaps + aligned)
@ eError_Report
Print error messages, but do not fail.
@ eError_Throw
Throw exceptions on errors.
int GetIdentityCount(CScope &scope, const CSeq_align &align)
Compute the number of identities in the alignment.
void AddSplignScores(const CSeq_align &align, CSeq_align::TScore &scores)
Compute the six splign scores.
void AddScore(CScope &scope, CSeq_align &align, CSeq_align::EScoreType score)
int GetGapCount(const CSeq_align &align)
Compute the number of gaps in the alignment.
double ComputeScore(CScope &scope, const CSeq_align &align, CSeq_align::EScoreType score)
EPercentIdentityType
Compute percent identity (range 0-100)
int GetNegativeCount(CScope &scope, const CSeq_align &align)
double GetPercentCoverage(CScope &scope, const CSeq_align &align, unsigned query=0)
Compute percent coverage of the query (sequence 0) (range 0-100)
double GetPercentIdentity(CScope &scope, const CSeq_align &align, EPercentIdentityType type=eGapped)
virtual ~CScoreBuilderBase()
Destructor.
void x_GetMatrixCounts(CScope &scope, const CSeq_align &align, int *positives, int *negatives)
int GetMismatchCount(CScope &scope, const CSeq_align &align)
Compute the number of mismatches in the alignment.
CScoreBuilderBase()
Default constructor.
EErrorMode GetErrorMode(void) const
void GetMatrixCounts(CScope &scope, const CSeq_align &align, int &positives, int &negatives)
void SetSubstMatrix(const string &name)
int GetGapBaseCount(const CSeq_align &align)
Compute the number of gap bases in the alignment (= length of all gap segments)
Definition: Score.hpp:57
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Iupacna
Definition: sequtil.hpp:47
CSeqVector –.
Definition: seq_vector.hpp:65
EScoreType
enum controlling known named scores
Definition: Seq_align.hpp:128
@ eScore_PercentIdentity_GapOpeningOnly
Definition: Seq_align.hpp:165
@ eScore_PercentIdentity_Gapped
Definition: Seq_align.hpp:163
@ eScore_ConsensusSplices
Definition: Seq_align.hpp:183
@ eScore_OverallIdentity
Definition: Seq_align.hpp:181
@ eScore_SumEValue
Definition: Seq_align.hpp:171
@ eScore_AlignLength
Definition: Seq_align.hpp:142
@ eScore_PercentIdentity_Ungapped
Definition: Seq_align.hpp:164
@ eScore_NegativeCount
Definition: Seq_align.hpp:151
@ eScore_ExonIdentity
Definition: Seq_align.hpp:185
@ eScore_PositiveCount
Definition: Seq_align.hpp:148
@ eScore_PercentCoverage
Definition: Seq_align.hpp:168
@ eScore_HighQualityPercentCoverage
Definition: Seq_align.hpp:177
@ eScore_ProductCoverage
Definition: Seq_align.hpp:184
@ eScore_CompAdjMethod
Definition: Seq_align.hpp:174
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
@ eScore_MismatchCount
Definition: Seq_align.hpp:154
TSeqPos GetNumGapOpeningsWithinRanges(const CRangeCollection< TSeqPos > &ranges, TDim row=-1) const
Definition: Seq_align.cpp:1582
TSeqPos GetTotalGapCount(TDim row=-1) const
Retrieves the total number of gaps in the given row of an alignment; all gaps by default.
Definition: Seq_align.cpp:1550
TSeqPos GetNumGapOpeningsWithinRange(const TSeqRange &range, TDim row=-1) const
Definition: Seq_align.cpp:1570
static string ScoreName(EScoreType score)
Definition: Seq_align.cpp:503
void SetNamedScore(const string &id, int score)
Definition: Seq_align.cpp:636
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
TSeqPos GetTotalGapCountWithinRanges(const CRangeCollection< TSeqPos > &ranges, TDim row=-1) const
Definition: Seq_align.cpp:1576
void ResetNamedScore(const string &name)
Definition: Seq_align.cpp:606
TSeqPos GetAlignLengthWithinRange(const TSeqRange &range, bool include_gaps=true) const
Get the length of this alignment within a specified range By default, this function computes an align...
Definition: Seq_align.cpp:2001
TSeqPos GetTotalGapCountWithinRange(const TSeqRange &range, TDim row=-1) const
Definition: Seq_align.cpp:1564
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
TSeqPos GetAlignLengthWithinRanges(const CRangeCollection< TSeqPos > &ranges, bool include_gaps=true) const
Get the length of this alignment within a specified range By default, this function computes an align...
Definition: Seq_align.cpp:2010
static const TScoreNameMap & ScoreNameMap()
Definition: Seq_align.cpp:483
static bool IsIntegerScore(EScoreType score)
Definition: Seq_align.cpp:513
TSeqPos GetNumGapOpenings(TDim row=-1) const
Retrieves the number of gap openings in a given row in an alignment (ignoring how many gaps are in th...
Definition: Seq_align.cpp:1557
@ eUnsupported
Operation that is undefined for the given input Seq-align, and which is impossible to perform.
@ eNotImplemented
Attempt to use unimplemented functionality.
CSpliced_exon_chunk –.
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
container_type::const_iterator const_iterator
Definition: map.hpp:53
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:687
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
Definition: ncbiexpt.hpp:749
@ eUnknown
Definition: app_popup.hpp:72
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_inst::TMol GetSequenceType(const CSeq_id &id, TGetFlags flags=0)
Get molecular type of sequence (protein/dna/rna) Return CSeq_inst::eMol_not_set if sequence is not fo...
Definition: scope.cpp:804
bool IsAa(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
unsigned char Uchar
Alias for unsigned char.
Definition: ncbitype.h:95
TThisType & Set(position_type from, position_type to)
Definition: range.hpp:188
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
TGcode GetGcode(void) const
Get the Gcode member data.
Definition: OrgName_.hpp:918
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
bool IsSetParts(void) const
Basic segments are always in biologic order. Check if a value has been assigned to Parts data member.
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
TMatch GetMatch(void) const
Get the variant data.
bool IsSetProduct_strand(void) const
Should be 'plus' or 'minus'. Check if a value has been assigned to Product_strand data member.
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
TDiag GetDiag(void) const
Get the variant data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
TPoly_a GetPoly_a(void) const
Get the Poly_a member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
list< CRef< CSeq_align > > Tdata
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:740
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ e_Region
named region (globin locus)
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
const int infinity
Definition: nucprot.cpp:52
int i
int len
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
unsigned int a
Definition: ncbi_localip.c:102
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int isupper(Uchar c)
Definition: ncbictype.hpp:70
T max(T x_, T y_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
int offset
Definition: replacements.h:160
USING_SCOPE(objects)
static void s_GetCountIdentityMismatch(CScope &scope, const CSeq_align &align, int *identities, int *mismatches, const CRangeCollection< TSeqPos > &ranges=CRangeCollection< TSeqPos >(TSeqRange::GetWhole()))
static void s_GetNucIdentityMismatch(const vector< string > &data, int *identities, int *mismatches)
calculate mismatches and identities in a seq-align
static bool s_SequenceIsProtein(CScope &scope, const CSeq_id &id)
calculate the percent coverage
bool IsConsSplice(const string &donor, const string acc)
static TSeqPos s_IntersectionLength(const CRangeCollection< TSeqPos > &ranges, const TSeqRange &range)
Get length of intersection between a range and a range collection.
static void s_GetPercentIdentity(CScope &scope, const CSeq_align &align, int *identities, int *mismatches, double *pct_identity, CScoreBuilderBase::EPercentIdentityType type, const CRangeCollection< TSeqPos > &ranges=CRangeCollection< TSeqPos >(TSeqRange::GetWhole()))
calculate the percent identity we also return the count of identities and mismatches
static void s_GetPercentCoverage(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, double *pct_coverage, unsigned query=0)
string GetAcceptor(const objects::CSpliced_exon &exon)
static void s_GetSplicedSegIdentityMismatch(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, int *identities, int *mismatches)
string GetDonor(const objects::CSpliced_exon &exon)
static bool s_IsProteinToGenomic(CScope &scope, const CSeq_align &align)
SAnnotSelector –.
static string query
Definition: type.c:6
#define _ASSERT
Modified on Wed Mar 27 11:18:36 2024 by modify_doxy.py rev. 669887