32 #include <ncbi_pch.hpp>
37 #include <algo/sequence/util.hpp>
61 #include <objmgr/scope.hpp>
62 #include <objmgr/seq_vector.hpp>
63 #include <objmgr/util/sequence.hpp>
73 /////////////////////////////////////////////////////////////////////////////
76 {
77 public:
78  CScore_AlignLength(bool include_gaps)
79  : m_Gaps(include_gaps)
80  {
81  }
83  virtual EComplexity GetComplexity() const { return eEasy; };
85  virtual bool IsInteger() const { return true; };
87  virtual double Get(const CSeq_align& align, CScope*) const
88  {
89  return align.GetAlignLength(m_Gaps);
90  }
92  virtual void PrintHelp(CNcbiOstream& ostr) const
93  {
94  if (m_Gaps) {
95  ostr << "Length of the aligned segments, including the length of all gap segments";
96  }
97  else {
98  ostr << "Length of the aligned segments, excluding all gap segments; thus, this is the length of all actually aligned (i.e., match or mismatch) bases";
99  }
100  }
102 private:
103  bool m_Gaps;
104 };
106 /////////////////////////////////////////////////////////////////////////////
109 {
110 public:
111  CScore_GapCount(bool count_bases, int row = -1,
112  bool exon_specific = false)
113  : m_CountBases(count_bases), m_Row(row), m_ExonSpecific(exon_specific)
114  {
115  }
117  virtual EComplexity GetComplexity() const { return eEasy; };
119  virtual bool IsInteger() const { return true; };
121  virtual double Get(const CSeq_align& align, CScope*) const
122  {
123  if (m_ExonSpecific && !align.GetSegs().IsSpliced()) {
124  NCBI_THROW(CSeqalignException, eUnsupported,
125  "'product_gap_length' and 'genomic_gap_length' scores "
126  "valid only for Spliced-seg alignments");
127  }
128  return m_CountBases ? align.GetTotalGapCount(m_Row)
129  : align.GetNumGapOpenings(m_Row);
130  }
132  virtual void PrintHelp(CNcbiOstream& ostr) const
133  {
134  if (m_CountBases) {
135  ostr << "Total number of gap bases missing";
136  }
137  else {
138  ostr << "Number of gap openings";
139  }
140  if (m_ExonSpecific) {
141  if (m_Row == 0) {
142  ostr << " in product exons";
143  } else if(m_Row == 1) {
144  ostr << " in genomic exons";
145  }
146  } else {
147  if (m_Row == 0) {
148  ostr << " in query";
149  } else if(m_Row == 1) {
150  ostr << " in subject";
151  }
152  }
153  }
155 private:
157  int m_Row;
159 };
161 /////////////////////////////////////////////////////////////////////////////
164 {
165 public:
166  CScore_FrameShifts(int row = -1, bool frameshifts = true)
167  : m_Row(row)
168  , m_Frameshifts(frameshifts)
169  {
170  }
172  virtual EComplexity GetComplexity() const { return eEasy; };
174  virtual bool IsInteger() const { return true; };
/// Count frameshifting gap openings (m_Frameshifts true) or the remaining,
/// non-frameshifting ones (m_Frameshifts false) for this alignment.
///
/// Two paths are visible: a Spliced-seg path that counts over the whole
/// alignment, and a path that fetches the row-0 sequence from the scope and
/// restricts counting to the range of the first cdregion feature found on it
/// (0 is returned when there is no such feature).
/// NOTE(review): several source lines are absent from this listing (the
/// right-hand operand of the product-type comparison, the statements that
/// open the two error-message expressions, and part of the CFeat_CI
/// constructor call) - check the full file before editing the code.
/// NOTE(review): scope is dereferenced without a null check.
176  virtual double Get(const CSeq_align& align, CScope *scope) const
177  {
     // The other row of a pairwise alignment (0 <-> 1); a negative m_Row is
     // passed through unchanged.
178  int opposite_row = m_Row >= 0 ? 1 - m_Row : m_Row;
179  if (align.GetSegs().IsSpliced() &&
180  align.GetSegs().GetSpliced().GetProduct_type() ==
182  {
183  /// Protein alignment; just count frameshifts
     // Non-frameshifting count = gap openings on the opposite row minus
     // frameshifts on m_Row.
184  return m_Frameshifts ? align.GetNumFrameshifts(m_Row)
185  : align.GetNumGapOpenings(opposite_row)
186  - align.GetNumFrameshifts(m_Row);
187  }
189  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(0));
190  if ( !bsh ) {
192  "failed to retrieve sequence for " +
193  align.GetSeq_id(0).AsFastaString());
194  }
     // Only RNA row-0 sequences are handled past this point.
195  if (bsh.GetBioseqMolType() != CSeq_inst::eMol_rna) {
197  "Can't count frameshifts on a genomic alignment");
198  }
200  /// Only count frameshifts within cdregion
201  CFeat_CI feat_it(bsh,
203  .IncludeFeatType(CSeqFeatData::e_Cdregion));
     // Only the feature the iterator currently points at is used; the
     // iterator is never advanced.
204  return !feat_it ? 0 : (m_Frameshifts
205  ? align.GetNumFrameshiftsWithinRange(feat_it->GetRange(), m_Row)
206  : align.GetNumGapOpeningsWithinRange(feat_it->GetRange(), opposite_row)
207  - align.GetNumFrameshiftsWithinRange(feat_it->GetRange(), m_Row));
208  }
210  virtual void PrintHelp(CNcbiOstream& ostr) const
211  {
212  ostr << "Number of ";
213  if (!m_Frameshifts) {
214  ostr << "non-";
215  }
216  ostr << "frameshifting insertions";
217  if (m_Row == 0) {
218  ostr << " in the query";
219  } else if(m_Row == 1) {
220  ostr << " in the subject";
221  } else {
222  ostr << " or deletions";
223  }
224  }
226 private:
227  int m_Row;
229 };
231 /////////////////////////////////////////////////////////////////////////////
234 {
235 public:
236  virtual void PrintHelp(CNcbiOstream& ostr) const
237  {
238  ostr << "Length of the longest gap observed in either query or subject";
239  }
241  virtual EComplexity GetComplexity() const { return eEasy; };
243  virtual bool IsInteger() const { return true; };
245  virtual double Get(const CSeq_align& align, CScope*) const
246  {
247  return align.GapLengthRange().second;
248  }
249 };
251 /////////////////////////////////////////////////////////////////////////////
254 {
255 public:
257  virtual void PrintHelp(CNcbiOstream& ostr) const
258  {
259  ostr << "Length of unaligned sequence 3' of alignment end";
260  }
262  virtual EComplexity GetComplexity() const { return eEasy; };
264  virtual bool IsInteger() const { return true; };
266  virtual double Get(const CSeq_align& align, CScope* scope) const
267  {
268  double score_value = 0;
269  if (align.GetSegs().IsSpliced()) {
270  score_value = align.GetSegs().GetSpliced().GetProduct_length();
271  if (align.GetSegs().GetSpliced().IsSetPoly_a()) {
272  score_value = align.GetSegs().GetSpliced().GetPoly_a();
273  }
274  } else {
275  if (scope) {
276  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(0));
277  if (bsh) {
278  score_value = bsh.GetBioseqLength();
279  }
280  }
281  }
282  if (score_value) {
283  score_value -= align.GetSeqStop(0) + 1;
284  }
285  return score_value;
286  }
287 };
289 /////////////////////////////////////////////////////////////////////////////
292 {
293 public:
295  virtual void PrintHelp(CNcbiOstream& ostr) const
296  {
297  ostr << "Length of polya tail";
298  }
300  virtual EComplexity GetComplexity() const { return eEasy; };
302  virtual bool IsInteger() const { return true; };
/// Return the poly-A tail length recorded on a Spliced-seg alignment.
///
/// Returns 0 when the alignment is not a Spliced-seg, has no poly-A value,
/// or (on the second path below) when no product length can be determined.
/// NOTE(review): the second half of the strand condition is absent from
/// this listing; per the existing comment it selects minus-strand
/// alignments - confirm against the full file.
304  virtual double Get(const CSeq_align& align, CScope* scope) const
305  {
306  if (!align.GetSegs().IsSpliced() ||
307  !align.GetSegs().GetSpliced().IsSetPoly_a())
308  {
309  return 0;
310  }
311  if (align.GetSegs().GetSpliced().IsSetProduct_strand() &&
313  {
314  /// Alignment on minus strand, so poly-a score represents the actual
315  /// length of the poly-t tail
316  return align.GetSegs().GetSpliced().GetPoly_a();
317  }
     // Otherwise the tail length is (product length - poly-A value).
     // Prefer the length stored on the alignment; fall back to the row-0
     // sequence length from the scope when one is supplied.
318  double product_length = 0;
319  if (align.GetSegs().GetSpliced().IsSetProduct_length()) {
320  product_length = align.GetSegs().GetSpliced().GetProduct_length();
321  } else if (scope) {
322  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(0));
323  if (bsh) {
324  product_length = bsh.GetBioseqLength();
325  }
326  }
     // Length unknown: report 0 rather than a negative value.
327  if (product_length == 0) {
328  return 0;
329  }
330  return product_length - align.GetSegs().GetSpliced().GetPoly_a();
331  }
332 };
334 /////////////////////////////////////////////////////////////////////////////
337 {
338 public:
340  virtual void PrintHelp(CNcbiOstream& ostr) const
341  {
342  ostr << "Length of unaligned sequence contained within the aligned "
343  "range. Note that this does not count gaps; rather, it computes "
344  "the length of all missing, unaligned sequence bounded by the "
345  "aligned range";
346  }
348  virtual EComplexity GetComplexity() const { return eEasy; };
350  virtual bool IsInteger() const { return true; };
/// Sum the product-coordinate distances between consecutive exons of a
/// Spliced-seg alignment: for each adjacent pair, (start of current exon) -
/// (end of previous exon) - 1. The scope argument is not used.
///
/// Alignment types that reach the default branch raise
/// CSeqalignException (eNotImplemented).
/// NOTE(review): the case label for the visible branch and the right-hand
/// operands of the strand / product-type comparisons are absent from this
/// listing; the branch bodies show that one path walks the exons in reverse
/// and the other forward, and that positions are read either via
/// GetNucpos() or via AsSeqPos().
/// NOTE(review): each loop starts with `++it` on begin()/rbegin(), which
/// assumes the exon list is non-empty - confirm callers guarantee that.
352  virtual double Get(const CSeq_align& align, CScope* ) const
353  {
354  double score_value = 0;
355  switch (align.GetSegs().Which()) {
357  {{
358  const CSpliced_seg& seg = align.GetSegs().GetSpliced();
359  if (seg.IsSetProduct_strand() &&
     // Reverse walk: `it` runs one exon ahead of `prev`.
361  CSpliced_seg::TExons::const_reverse_iterator it =
362  seg.GetExons().rbegin();
363  CSpliced_seg::TExons::const_reverse_iterator prev =
364  seg.GetExons().rbegin();
365  CSpliced_seg::TExons::const_reverse_iterator end =
366  seg.GetExons().rend();
367  if (seg.GetProduct_type() ==
369  for (++it; it != end; ++it, ++prev) {
370  score_value += (*it)->GetProduct_start().GetNucpos() -
371  (*prev)->GetProduct_end().GetNucpos() - 1;
372  }
373  } else {
374  for (++it; it != end; ++it, ++prev) {
375  TSeqPos curr_nuc = (*it)->GetProduct_start().AsSeqPos();
376  TSeqPos last_nuc = (*prev)->GetProduct_end().AsSeqPos();
377  score_value += curr_nuc - last_nuc - 1;
378  }
379  }
380  }
381  else {
     // Forward walk: same computation in exon-list order.
382  CSpliced_seg::TExons::const_iterator it =
383  seg.GetExons().begin();
384  CSpliced_seg::TExons::const_iterator prev =
385  seg.GetExons().begin();
386  CSpliced_seg::TExons::const_iterator end =
387  seg.GetExons().end();
388  if (seg.GetProduct_type() ==
390  for (++it; it != end; ++it, ++prev) {
391  score_value += (*it)->GetProduct_start().GetNucpos() -
392  (*prev)->GetProduct_end().GetNucpos() - 1;
393  }
394  } else {
395  for (++it; it != end; ++it, ++prev) {
396  TSeqPos curr_nuc = (*it)->GetProduct_start().AsSeqPos();
397  TSeqPos last_nuc = (*prev)->GetProduct_end().AsSeqPos();
398  score_value += curr_nuc - last_nuc - 1;
399  }
400  }
401  }
402  }}
403  break;
405  default:
406  NCBI_THROW(CSeqalignException, eNotImplemented,
407  "internal_unaligned not implemented for this "
408  "type of alignment");
409  }
410  return score_value;
411  }
412 };
414 /////////////////////////////////////////////////////////////////////////////
417 {
418 public:
419  CScore_AlignStartStop(int row, bool start)
420  : m_Row(row)
421  , m_Start(start)
422  {
423  }
425  virtual EComplexity GetComplexity() const { return eEasy; };
427  virtual bool IsInteger() const { return true; };
429  virtual void PrintHelp(CNcbiOstream& ostr) const
430  {
431  if (m_Start) {
432  if (m_Row == 0) {
433  ostr << "Start of query sequence (0-based coordinates)";
434  }
435  else if (m_Row == 1) {
436  ostr << "Start of subject sequence (0-based coordinates)";
437  }
438  }
439  else {
440  if (m_Row == 0) {
441  ostr << "End of query sequence (0-based coordinates)";
442  }
443  else if (m_Row == 1) {
444  ostr << "End of subject sequence (0-based coordinates)";
445  }
446  }
447  }
449  virtual double Get(const CSeq_align& align, CScope*) const
450  {
451  if (m_Start) {
452  return align.GetSeqStart(m_Row);
453  } else {
454  return align.GetSeqStop(m_Row);
455  }
456  }
458 private:
459  int m_Row;
460  bool m_Start;
461 };
463 /////////////////////////////////////////////////////////////////////////////
466 {
467 public:
469  virtual void PrintHelp(CNcbiOstream& ostr) const
470  {
471  ostr << "Ratio of subject aligned range length to query aligned "
472  "range length";
473  }
475  virtual EComplexity GetComplexity() const { return eEasy; };
477  virtual double Get(const CSeq_align& align, CScope*) const
478  {
479  return align.AlignLengthRatio();
480  }
481 };
483 //////////////////////////////////////////////////////////////////////////////
486 {
487 public:
489  : m_Row(row) // row whose length is reported: PrintHelp() labels 0 as query and 1 as subject
490  {
491  }
493  virtual void PrintHelp(CNcbiOstream& ostr) const
494  {
495  if (m_Row == 0) {
496  ostr << "Length of query sequence";
497  }
498  else if (m_Row == 1) {
499  ostr << "Length of subject sequence";
500  }
501  }
503  virtual EComplexity GetComplexity() const { return eHard; };
505  virtual bool IsInteger() const { return true; };
507  virtual double Get(const CSeq_align& align, CScope* scope) const
508  {
509  if (m_Row == 0 && align.GetSegs().IsSpliced()) {
510  return align.GetSegs().GetSpliced().GetProduct_length();
511  } else {
512  if (scope) {
513  CBioseq_Handle bsh =
514  scope->GetBioseqHandle(align.GetSeq_id(m_Row));
515  if (bsh) {
516  return bsh.GetBioseqLength();
517  } else {
518  NCBI_THROW(CSeqalignException, eInvalidSeqId,
519  "Can't get length for sequence " +
520  align.GetSeq_id(m_Row).AsFastaString());
521  }
522  }
523  }
524  return 0;
525  }
527 private:
528  int m_Row;
529 };
531 //////////////////////////////////////////////////////////////////////////////
532 /// Return the length of the sequence behind `bsh`, expressed in
/// nucleotide units: the raw Bioseq length, tripled when the handle reports
/// an amino-acid molecule type.
/// NOTE(review): the signature line of this function is absent from this
/// listing; callers in this file refer to it as s_GetNaLength.
534 {
535  TSeqPos len = bsh.GetBioseqLength();
536  if (bsh.CanGetInst_Mol() && bsh.GetInst_Mol() == CSeq_inst::eMol_aa) {
537  /// This is an amino-acid sequence, so multiply length by 3
538  len *= 3;
539  }
540  return len;
541 }
545 {
546 public:
547  enum EType {e_Min, e_Avg};
549  : m_Type(type) // e_Min or e_Avg: which combination of the two sequence lengths Get() divides by
550  {
551  }
553  virtual void PrintHelp(CNcbiOstream& ostr) const
554  {
555  ostr <<
556  "Symmetric overlap, as a percent (0-100). This is similar to "
557  "coverage, except that it takes into account both query and "
558  "subject sequence lengths. Alignment length is divided by "
559  << (m_Type == e_Min ? "minimum" : "average")
560  << " of the two sequence lengths";
561  }
563  virtual EComplexity GetComplexity() const { return eHard; };
/// Return the ungapped alignment length as a percentage of either the
/// smaller (e_Min) or the mean (e_Avg) of the two sequence lengths, both
/// taken in nucleotide units via s_GetNaLength().
///
/// NOTE(review): the statements that open the two "failed to retrieve
/// sequence" error expressions are absent from this listing.
/// NOTE(review): scope is dereferenced without a null check.
565  virtual double Get(const CSeq_align& align, CScope* scope) const
566  {
567  TSeqPos length = align.GetAlignLength(false);
     // NOTE(review): the product is evaluated in TSeqPos arithmetic before
     // being widened to double - confirm it cannot overflow for long
     // alignments.
568  double pct_overlap = length * 100;
570  CBioseq_Handle q = scope->GetBioseqHandle(align.GetSeq_id(0));
571  if ( !q ) {
573  "failed to retrieve sequence for " +
574  align.GetSeq_id(0).AsFastaString());
575  }
576  CBioseq_Handle s = scope->GetBioseqHandle(align.GetSeq_id(1));
577  if ( !s ) {
579  "failed to retrieve sequence for " +
580  align.GetSeq_id(1).AsFastaString());
581  }
     // Both sequences are proteins: scale the numerator by 3 so it is in
     // the same units as the tripled lengths s_GetNaLength() returns.
582  if (q.IsAa() && s.IsAa()) {
583  pct_overlap *= 3;
584  }
586  switch (m_Type) {
587  case e_Min:
588  pct_overlap /= min(s_GetNaLength(q), s_GetNaLength(s));
589  break;
591  case e_Avg:
     // NOTE(review): if s_GetNaLength() returns an integer type, the
     // halving below truncates before the floating-point division -
     // confirm that is intended.
592  pct_overlap /= (s_GetNaLength(q) + s_GetNaLength(s))/2;
593  break;
594  }
595  return pct_overlap;
596  }
598 private:
600 };
602 //////////////////////////////////////////////////////////////////////////////
605 {
606 public:
607  virtual void PrintHelp(CNcbiOstream& ostr) const
608  {
609  ostr <<
610  "Length of the shortest exon. Note that this score has "
611  "meaning only for Spliced-seg alignments, as would be generated "
612  "by Splign or ProSplign.";
613  }
615  virtual EComplexity GetComplexity() const { return eEasy; };
617  virtual bool IsInteger() const { return true; };
619  virtual double Get(const CSeq_align& align, CScope*) const
620  {
621  return align.ExonLengthRange().first;
622  }
623 };
625 //////////////////////////////////////////////////////////////////////////////
628 {
629 public:
630  virtual void PrintHelp(CNcbiOstream& ostr) const
631  {
632  ostr <<
633  "Length of the longest intron. Note that this score has "
634  "meaning only for Spliced-seg alignments, as would be generated "
635  "by Splign or ProSplign.";
636  }
638  virtual EComplexity GetComplexity() const { return eEasy; };
640  virtual bool IsInteger() const { return true; };
642  virtual double Get(const CSeq_align& align, CScope*) const
643  {
644  return align.IntronLengthRange().second;
645  }
646 };
648 //////////////////////////////////////////////////////////////////////////////
651 {
652 public:
653  virtual void PrintHelp(CNcbiOstream& ostr) const
654  {
655  ostr <<
656  "Count of the number of exons. Note that this score has "
657  "meaning only for Spliced-seg alignments, as would be generated "
658  "by Splign or ProSplign.";
659  }
661  virtual EComplexity GetComplexity() const { return eEasy; };
663  virtual bool IsInteger() const { return true; };
665  virtual double Get(const CSeq_align& align, CScope*) const
666  {
667  if (align.GetSegs().IsSpliced()) {
668  const CSpliced_seg& seg = align.GetSegs().GetSpliced();
669  if (seg.IsSetExons()) {
670  return seg.GetExons().size();
671  }
672  return 0;
673  }
675  NCBI_THROW(CSeqalignException, eUnsupported,
676  "'exon_count' score is valid only for "
677  "Spliced-seg alignments");
678  }
679 };
681 //////////////////////////////////////////////////////////////////////////////
684 {
685 public:
686  virtual void PrintHelp(CNcbiOstream& ostr) const
687  {
688  ostr <<
689  "Minimum distance between an indel and a splice site. Note that "
690  "this score has meaning only for Spliced-seg alignments, as would "
691  "be generated by Splign or ProSplign.";
692  }
694  virtual EComplexity GetComplexity() const { return eEasy; };
696  virtual bool IsInteger() const { return true; };
/// Return the smallest distance, over all exons of a Spliced-seg alignment,
/// between an exon end that has a recorded splice site and the nearest
/// indel inside that exon. The scope argument is not used.
///
/// Each exon is scanned twice: forward from its first part (checked against
/// the acceptor) and backward from its last part (checked against the
/// donor). Lengths of leading parts are accumulated until a part that hits
/// the `default` branch is met, which is treated as the indel.
/// NOTE(review): the case labels of both switches and the statement that
/// opens the final error expression are absent from this listing; the
/// Get*() calls show the accumulated parts are match / mismatch / diag.
698  virtual double Get(const CSeq_align& align, CScope*) const
699  {
700  if (align.GetSegs().IsSpliced() &&
701  align.GetSegs().GetSpliced().IsSetExons())
702  {
703  const CSpliced_seg& seg = align.GetSegs().GetSpliced();
     // INT_MAX doubles as the "nothing found" sentinel throughout.
704  unsigned result = INT_MAX;
705  ITERATE (CSpliced_seg::TExons, exon_it, seg.GetExons()) {
706  const CSpliced_exon& exon = **exon_it;
707  if (!exon.IsSetParts()) {
708  continue;
709  }
710  unsigned distance_5prime = 0, distance_3prime = 0;
711  bool found_indel = false;
     // Forward scan: distance from the first part of the exon to the first
     // indel.
712  ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
713  const CSpliced_exon_chunk& part = **part_it;
714  switch (part.Which()) {
716  distance_5prime += part.GetMatch();
717  break;
719  distance_5prime += part.GetMismatch();
720  break;
722  distance_5prime += part.GetDiag();
723  break;
724  default:
725  found_indel = true;
726  break;
727  }
728  if (found_indel) {
729  break;
730  }
731  }
     // Discard this distance when the exon has no acceptor, its acceptor
     // bases equal " ", or the exon contains no indel.
732  if (!exon.IsSetAcceptor_before_exon() ||
733  exon.GetAcceptor_before_exon().GetBases() == " " ||
734  !found_indel)
735  {
736  distance_5prime = INT_MAX;
737  }
738  found_indel = false;
     // Backward scan: distance from the last part of the exon to the last
     // indel.
739  REVERSE_ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
740  const CSpliced_exon_chunk& part = **part_it;
741  switch (part.Which()) {
743  distance_3prime += part.GetMatch();
744  break;
746  distance_3prime += part.GetMismatch();
747  break;
749  distance_3prime += part.GetDiag();
750  break;
751  default:
752  found_indel = true;
753  break;
754  }
755  if (found_indel) {
756  break;
757  }
758  }
     // Same rule for the donor side.
759  if (!exon.IsSetDonor_after_exon() ||
760  exon.GetDonor_after_exon().GetBases() == " " ||
761  !found_indel)
762  {
763  distance_3prime = INT_MAX;
764  }
765  result = min(result, min(distance_5prime,distance_3prime));
766  }
     // A real distance was found for at least one exon end.
767  if (result < INT_MAX) {
768  return result;
769  }
770  }
     // Reached when the alignment is not a Spliced-seg with exons, or when
     // no qualifying indel was found.
773  "No indels found in exons with splice sites");
774  }
775 };
777 //////////////////////////////////////////////////////////////////////////////
/// Look up the genetic code of the organism that seq_id belongs to.
///
/// The sequence is resolved through scope, the organism's genetic-code id is
/// read, and the code table is searched for an entry with that id.
/// @return pointer to the matching table entry, or a null pointer when no
///         entry matches or when any CException is raised along the way.
/// NOTE(review): the declaration of `tbl` is absent from this listing.
/// NOTE(review): scope is dereferenced without a null check.
/// NOTE(review): the pointer is taken from a local CRef that is destroyed on
/// return; it stays valid only if the table keeps its own reference to the
/// entry - confirm.
779 static const CGenetic_code *s_GetGeneticCode(const CSeq_id& seq_id,
780  CScope* scope)
781 {
782  CRef<CGenetic_code> genetic_code;
783  try {
784  CBioseq_Handle bsh = scope->GetBioseqHandle(seq_id);
785  int gcode = sequence::GetOrg_ref(bsh).GetGcode();
787  ITERATE (CGenetic_code_table::Tdata, it, tbl.Get()) {
788  if ((*it)->GetId() == gcode) {
789  genetic_code = *it;
790  break;
791  }
792  }
793  }
794  catch (CException&) {
795  // use the default genetic code
796  }
798  return genetic_code.GetPointer();
799 }
802 {
803 public:
804  CScore_StartStopCodon(bool start_codon)
805  : m_StartCodon(start_codon)
806  {
807  }
809  virtual void PrintHelp(CNcbiOstream& ostr) const
810  {
811  ostr << "1 if a " << (m_StartCodon ? "start" : "stop")
812  << " codon was found, 0 otherwise. Note that this score has "
813  "meaning only for Spliced-seg alignments, as would be generated "
814  "by Splign or ProSplign.";
815  }
817  virtual EComplexity GetComplexity() const { return eEasy; };
819  virtual bool IsInteger() const { return true; };
/// Report whether the alignment includes a start codon (m_StartCodon true)
/// or a stop codon (m_StartCodon false): 1 if so, 0 otherwise.
///
/// For Spliced-seg input a flag already stored on the alignment is consulted
/// first. Otherwise the codon location on the subject (row 1) is derived by
/// mapping the coding region through a cleaned copy of the alignment, and
/// the three bases found there are checked against the translation table.
/// NOTE(review): a number of source lines are absent from this listing
/// (among them the loop header that declares `it`, the right-hand operands
/// of the product-type and topology comparisons, the first branch of the
/// `from` initialiser, and the statements that open several error
/// expressions) - consult the full file before changing any code here.
/// NOTE(review): scope is dereferenced without a null check.
821  virtual double Get(const CSeq_align& align, CScope* scope) const
822  {
823  bool is_protein = false;
824  TSeqPos product_length = 0;
     // Fast path: use the start/stop flag stored on the Spliced-seg, if any.
825  if (align.GetSegs().IsSpliced()) {
826  bool score_precalculated=false;
827  const CSpliced_seg& seg = align.GetSegs().GetSpliced();
828  is_protein = seg.GetProduct_type() ==
830  if (seg.CanGetProduct_length()) {
831  product_length = seg.GetProduct_length();
832  }
834  if (m_StartCodon
835  ? (*it)->IsStart_codon_found()
836  : (*it)->IsStop_codon_found() ) {
837  score_precalculated=true;
838  if (m_StartCodon
839  ? (*it)->GetStart_codon_found()
840  : (*it)->GetStop_codon_found())
841  {
842  return 1;
843  }
844  }
845  }
846  if (score_precalculated) {
847  /// Found the modifier, but it was set to false
848  return 0;
849  }
850  }
     // Product length not available from the alignment: take it, and the
     // protein/nucleotide distinction, from the row-0 sequence.
852  if (!product_length) {
853  CBioseq_Handle product_bsh =
854  scope->GetBioseqHandle(align.GetSeq_id(0));
855  if (!product_bsh) {
856  NCBI_THROW(CSeqalignException, eUnsupported,
857  "Can't get sequence " +
858  align.GetSeq_id(0).AsFastaString());
859  }
860  is_protein = product_bsh.IsAa();
861  product_length = product_bsh.GetBioseqLength();
862  }
     // NOTE(review): aligned_genomic is not referenced again in the visible
     // code.
864  CRef<CSeq_loc> aligned_genomic;
866  //
867  // generate the cleaned alignment
868  //
870  CFeatureGenerator generator(*scope);
871  generator.SetAllowedUnaligned(10);
872  CConstRef<CSeq_align> clean_align = generator.CleanAlignment(align);
874  // we can't call CFeatureGenerator because CFeatureGenerator depends on
875  // having certain fields set (such as Spliced-seg modifiers indicating
876  // (wait for it...) that the stop codon or start codon was found). This
877  // here function is to be called to verify that the start/stop are
878  // included, hence we have a circular logical relationship...
879  CSeq_id &query_id = const_cast<CSeq_id &>(clean_align->GetSeq_id(0));
880  CSeq_id &subject_id = const_cast<CSeq_id &>(clean_align->GetSeq_id(1));
881  CBioseq_Handle genomic_bsh = scope->GetBioseqHandle(subject_id);
882  if ( !genomic_bsh ) {
884  "failed to retrieve sequence for " +
885  subject_id.AsFastaString());
886  }
887  int genomic_len = genomic_bsh.GetBioseqLength();
     // Mapper built from the cleaned alignment; presumably maps locations
     // onto row 1 (the subject) - confirm against CSeq_loc_Mapper docs.
889  CSeq_loc_Mapper mapper(*clean_align, 1);
891  CRef<CSeq_loc> cds_loc;
892  if (is_protein) {
     // Protein product: the whole query is the coding region.
893  CSeq_loc loc;
894  loc.SetWhole().Assign(query_id);
895  cds_loc = mapper.Map(loc);
896  }
897  else {
898  CBioseq_Handle bsh = scope->GetBioseqHandle(query_id);
899  if ( !bsh ) {
901  "failed to retrieve sequence for " +
902  query_id.AsFastaString());
903  }
904  CFeat_CI feat_it(bsh,
906  .IncludeFeatType(CSeqFeatData::e_Cdregion));
     // Take the first feature the iterator yields, if there is one.
908  CMappedFeat mf;
909  for ( ; feat_it; ++feat_it) {
910  mf = *feat_it;
911  break;
912  }
914  if ( !mf ) {
915  // no CDS == no start or stop
916  return 0.0;
917  }
919  const CSeq_loc &orig_loc = mf.GetLocation();
920  ENa_strand q_strand = sequence::GetStrand(orig_loc, scope);
921  TSeqRange total_q_range = orig_loc.GetTotalRange();
922  if (!orig_loc.IsPartialStop(eExtreme_Biological)) {
923  /// Remove stop codon
924  if (q_strand == eNa_strand_minus) {
925  total_q_range.SetFrom(total_q_range.GetFrom() + 3);
926  }
927  else {
928  total_q_range.SetTo(total_q_range.GetTo() - 3);
929  }
930  }
932  /**
933  cerr << "orig loc: " << MSerial_AsnText << orig_loc;
934  cerr << "orig strand: " << s_strand << endl;
935  cerr << "orig range: " << total_s_range << endl;
936  **/
     // When the cdregion frame is greater than 1, widen the range by
     // (frame - 1) bases at its strand-relative start.
938  if (mf.GetData().GetCdregion().IsSetFrame() &&
939  mf.GetData().GetCdregion().GetFrame() > 1)
940  {
941  TSeqPos offs = mf.GetData().GetCdregion().GetFrame() - 1;
942  if (q_strand == eNa_strand_minus) {
943  total_q_range.SetTo(total_q_range.GetTo() + offs);
944  }
945  else {
946  total_q_range.SetFrom(total_q_range.GetFrom() - offs);
947  }
948  }
949  CSeq_loc adjusted_loc(
950  query_id, total_q_range.GetFrom(),
951  total_q_range.GetTo(), q_strand);
953  // map the mRNA locations to the genome
954  cds_loc = mapper.Map(adjusted_loc);
956  /**
957  if (start_codon) {
958  cerr << "start codon: " << MSerial_AsnText << *start_codon;
959  }
960  if (stop_codon) {
961  cerr << "stop codon: " << MSerial_AsnText << *stop_codon;
962  }
963  **/
964  }
     // Locate the three codon bases on the subject: `from` is the first
     // base, `to` the third, stepping in the direction of the mapped strand.
966  ENa_strand s_strand = sequence::GetStrand(*cds_loc, scope);
967  int direction = s_strand == eNa_strand_minus ? -1 : 1;
968  int from =
970  : (int)cds_loc->GetStop(eExtreme_Biological) + direction;
971  int to = from + 2 * direction;
972  CRef<CSeq_loc> codon;
973  if (to >= 0 && to < genomic_len) {
974  /// codon is simple interval
975  codon.Reset(new CSeq_loc(subject_id, min(from,to), max(from,to),
976  s_strand));
977  } else if (genomic_bsh.GetInst_Topology() ==
979  {
980  /// this is a circular genomic sequence, and codon crosses origin
     // Build the codon from two intervals, one on each side of the origin.
981  CRef<CSeq_interval> int1, int2;
982  if (s_strand == eNa_strand_minus) {
983  int1.Reset(new CSeq_interval(subject_id, 0, from,
985  int1->SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
986  int2.Reset(new CSeq_interval(subject_id, to + genomic_len,
987  genomic_len - 1, eNa_strand_minus));
988  int2->SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
989  } else {
990  int1.Reset(new CSeq_interval(subject_id, from,
991  genomic_len - 1, eNa_strand_plus));
992  int1->SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
993  int2.Reset(new CSeq_interval(subject_id, 0, to - genomic_len,
994  eNa_strand_plus));
995  int2->SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
996  }
997  codon.Reset(new CSeq_loc);
998  codon->SetPacked_int().Set().push_back(int1);
999  codon->SetPacked_int().Set().push_back(int2);
1000  }
     // The codon would fall off the end of the subject: report "not found".
1002  if ( !codon ) {
1003  return 0.0;
1004  }
1006  //
1007  // evaluate for start-stop codon as needed
1008  //
     // Genetic code 11 is used unless s_GetGeneticCode() finds one for the
     // row-1 sequence.
1010  int gcode = 11;
1011  const CGenetic_code* gc = s_GetGeneticCode(align.GetSeq_id(1), scope);
1012  if (gc) {
1013  gcode = gc->GetId();
1014  }
1015  const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
1017  CSeqVector v(*codon, *scope, CBioseq_Handle::eCoding_Iupac);
1019  /**
1020  cerr << MSerial_AsnText << *start_codon;
1021  cerr << "gcode: " << gcode << endl;
1022  cerr << "bases: "
1023  << v[0] << v[1] << v[2] << endl;
1024  **/
     // Classify the three bases with the translation table.
1026  int state = tbl.SetCodonState(v[0], v[1], v[2]);
1027  if (m_StartCodon ? tbl.IsAnyStart(state) : tbl.IsOrfStop(state)) {
1028  return 1.0;
1029  }
1031  return 0.0;
1032  }
1035 private:
1037 };
1039 //////////////////////////////////////////////////////////////////////////////
1042 {
1043 public:
1044  virtual void PrintHelp(CNcbiOstream& ostr) const
1045  {
1046  ostr <<
1047  "Count of the number of internal stop codons encountered when "
1048  "translating the aligned coding region. Note that this has meaning "
1049  "only for Spliced-seg transcript alignments with a transcript that "
1050  "has an annotated cdregion, or for Spliced-seg protein alignments.";
1051  }
1053  virtual EComplexity GetComplexity() const { return eHard; };
1055  virtual bool IsInteger() const { return true; };
/// Count internal stop codons in the coding region covered by the
/// alignment.
///
/// Spliced-seg alignments are handed to CInternalStopFinder and the number
/// of stops it reports is returned. For other alignment types a gene model
/// is generated from a cleaned copy of the alignment, the first feature of
/// the first annotation is translated, and '*' characters in the
/// translation are counted; 0 is returned when no annotated entry is
/// produced.
/// NOTE(review): the first line of the condition guarding the removal of a
/// trailing '*' is absent from this listing.
/// NOTE(review): scope is dereferenced without a null check.
1057  virtual double Get(const CSeq_align& align, CScope* scope) const
1058  {
1060  if (align.GetSegs().IsSpliced()) {
1061  CInternalStopFinder stop_finder(*scope);
1062  return stop_finder.FindStops(align).size();
1063  }
1065  double score = 0;
1067  //
1068  // complicated
1069  //
1071  // first, generate a gene model
1072  CFeatureGenerator generator(*scope);
1075  generator.SetAllowedUnaligned(10);
1077  CConstRef<CSeq_align> clean_align = generator.CleanAlignment(align);
1078  CSeq_annot annot;
1079  CBioseq_set bset;
1080  generator.ConvertAlignToAnnot(*clean_align, annot, bset);
     // Nothing usable was generated: report zero stops.
1081  if (bset.GetSeq_set().empty() ||
1082  !bset.GetSeq_set().front()->IsSetAnnot())
1083  {
1084  return score;
1085  }
     // Translate inside a private scope that holds only the generated entry.
1087  CScope transcribed_mrna_scope(*CObjectManager::GetInstance());
1088  transcribed_mrna_scope.AddTopLevelSeqEntry(*bset.GetSeq_set().front());
     // Assumes the first feature of the first annotation is the cdregion -
     // TODO confirm against ConvertAlignToAnnot.
1089  CRef<CSeq_feat> cds = bset.GetSeq_set().front()
1090  -> GetSeq().GetAnnot().front()
1091  -> GetData().GetFtable().front();
1093  if (cds) {
     // Code-breaks are cleared before translating, so any stop they would
     // have recoded shows up as '*' in the translation.
1094  cds->SetData().SetCdregion().ResetCode_break();
1095  string trans;
1096  CSeqTranslator::Translate(*cds, transcribed_mrna_scope, trans);
1098  NStr::EndsWith(trans, "*"))
1099  {
     // Drop the terminal stop so only internal ones are counted.
1100  trans.resize(trans.size() - 1);
1101  }
1103  ITERATE (string, i, trans) {
1104  score += (*i == '*');
1105  }
1107  /**
1108  cerr << "align: "
1109  << CSeq_id_Handle::GetHandle(align.GetSeq_id(0))
1110  << " x "
1111  << CSeq_id_Handle::GetHandle(align.GetSeq_id(1))
1112  << endl;
1114  if (cds->IsSetProduct()) {
1115  string seq;
1116  CSeqVector v(cds->GetProduct(), *scope, CBioseq_Handle::eCoding_Iupac);
1117  v.GetSeqData(v.begin(), v.end(), seq);
1118  cerr << "product: " << seq << endl;
1119  }
1120  cerr << "xlate: " << trans << endl;
1121  cerr << "count: " << score << endl;
1122  **/
1123  }
1125  return score;
1126  }
1127 };
1129 /////////////////////////////////////////////////////////////////////////////
1132 {
1133 public:
1137  : m_ScoreType(type) // selects which CDS-related value Get() reports (identity, coverage, start or end)
1138  {}
1140  virtual EComplexity GetComplexity() const { return eHard; };
1142  virtual bool IsInteger() const { return m_ScoreType >= eStart; };
1144  virtual void PrintHelp(CNcbiOstream& ostr) const
1145  {
1146  switch (m_ScoreType) {
1147  case ePercentIdentity:
1148  ostr <<
1149  "Percent-identity score confined to the coding region "
1150  "associated with the align transcipt. Not supported "
1151  "for standard-seg alignments.";
1152  break;
1153  case ePercentCoverage:
1154  ostr <<
1155  "Percent-coverage score confined to the coding region "
1156  "associated with the align transcipt.";
1157  break;
1158  case eStart:
1159  ostr << "Start position of product's coding region.";
1160  break;
1161  case eEnd:
1162  ostr << "End position of product's coding region.";
1163  break;
1164  }
1165  ostr << " Note that this has meaning only if product has a coding "
1166  "region annotation.";
1167  }
/// Return the CDS-related value selected by m_ScoreType for the product
/// (row 0) of the alignment.
///
/// Returns -1 for Std-seg alignments and when `cds` evaluates false.
/// Otherwise: eStart / eEnd give the positional start / stop of the CDS
/// location; the remaining types give percent identity or percent coverage
/// restricted to the CDS ranges.
/// NOTE(review): the declaration of `cds` and the statement that opens the
/// "failed to retrieve sequence" error expression are absent from this
/// listing.
/// NOTE(review): scope is dereferenced without a null check.
1169  virtual double Get(const CSeq_align& align, CScope* scope) const
1170  {
1171  double score = -1;
1172  if (align.GetSegs().IsStd()) {
1173  return score;
1174  }
1176  CBioseq_Handle product = scope->GetBioseqHandle(align.GetSeq_id(0));
1177  if ( !product ) {
1179  "failed to retrieve sequence for " +
1180  align.GetSeq_id(0).AsFastaString());
1181  }
1184  if (cds) {
1185  switch (m_ScoreType) {
1186  case eStart:
1187  score = cds->GetLocation().GetStart(eExtreme_Positional);
1188  break;
1190  case eEnd:
1191  score = cds->GetLocation().GetStop(eExtreme_Positional);
1192  break;
1194  default:
1195  {{
     // Collect every interval of the CDS location, then score the alignment
     // over just those ranges.
1196  CRangeCollection<TSeqPos> cds_ranges;
1197  for (CSeq_loc_CI it(cds->GetLocation()); it; ++it) {
1198  cds_ranges += it.GetRange();
1199  }
1200  score = m_ScoreType == ePercentIdentity
1201  ? CScoreBuilder().GetPercentIdentity(*scope, align,
1202  cds_ranges)
1203  : CScoreBuilder().GetPercentCoverage(*scope, align,
1204  cds_ranges);
1205  break;
1206  }}
1207  }
1208  }
1209  return score;
1210  }
1212 private:
1214 };
1216 //////////////////////////////////////////////////////////////////////////////
1219 {
1220 public:
1222  : m_Row(row) // row whose coverage is reported: Get() treats 0 as the query and any other value as the subject
1223  {}
1225  virtual EComplexity GetComplexity() const { return eEasy; };
1227  virtual void PrintHelp(CNcbiOstream& ostr) const
1228  {
1229  ostr << (m_Row == 0
1230  ? "Percentage of query sequence aligned to subject (0.0-100.0)"
1231  : "Percentage of subject sequence aligned to query (0.0-100.0)");
1232  }
1235  virtual double Get(const CSeq_align& align, CScope* scope) const
1236  {
1237  if (m_Row == 0) {
1238  return CScoreBuilder().GetPercentCoverage(*scope, align);
1239  }
1241  /// Calculate coverage on subject
1242  size_t covered_bases = align.GetAlignLength(false /* don't include gaps */);
1243  size_t seq_len = scope->GetSequenceLength(align.GetSeq_id(1));
1244  return covered_bases ? 100.0f * double(covered_bases) / double(seq_len)
1245  : 0.0;
1246  }
1248 private:
1249  int m_Row;
1250 };
1252 //////////////////////////////////////////////////////////////////////////////
1255 {
1256 public:
1257  CScore_Taxid(int row, const string &rank = "")
1258  : m_Row(row)
1259  , m_Rank(rank)
1260  {
1261  }
1263  virtual EComplexity GetComplexity() const { return eHard; };
1265  virtual bool IsInteger() const { return true; };
1267  virtual void PrintHelp(CNcbiOstream& ostr) const
1268  {
1269  if (m_Row == 0) {
1270  ostr << "Taxid of query sequence";
1271  }
1272  else if (m_Row == 1) {
1273  ostr << "Taxid of subject sequence";
1274  }
1275  }
1277  virtual double Get(const CSeq_align& align, CScope* scope) const
1278  {
1279  TTaxId taxid = scope->GetTaxId(align.GetSeq_id(m_Row));
1280  if (!m_Rank.empty()) {
1281  m_Taxon.Init();
1282  taxid = m_Taxon.GetAncestorByRank(taxid, m_Rank.c_str());
1283  }
1284  return TAX_ID_TO(double, taxid);
1285  }
1287 private:
1288  int m_Row;
1289  string m_Rank;
1290  mutable CTaxon1 m_Taxon;
1291 };
1293 //////////////////////////////////////////////////////////////////////////////
1296 {
1297 public:
     /// Empty constructor body.
     /// NOTE(review): the constructor signature line is not visible in this
     /// listing.
1299  {
1300  }
     /// Writes the description of the last-splice-site score.
1302  virtual void PrintHelp(CNcbiOstream& ostr) const
1303  {
1304  ostr <<
1305  "Position of last splice site. Note that this has meaning only "
1306  "for Spliced-seg transcript alignments, and only if the alignment "
1307  "has at least two exons.";
1308  }
     /// Reports the complexity class of this score as eEasy.
1310  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1312  virtual bool IsInteger() const { return true; };
     /// Return the nucleotide product end position of the exon adjacent to
     /// the last splice site.
     /// @param align  alignment to inspect; must be a Spliced-seg with more
     ///               than one exon and with product type and product
     ///               strand available
     /// @return GetProduct_end().GetNucpos() of the selected exon
     /// @throws CSeqalignException (eUnsupported) when any of the checks
     ///         fails
     /// NOTE(review): parts of the condition and of the exon selection
     /// (original lines 1321, 1323, 1326) are not visible in this listing;
     /// presumably they test the product type and strand - confirm against
     /// the full file.
1314  virtual double Get(const CSeq_align& align, CScope* ) const
1315  {
1316  if (align.GetSegs().IsSpliced())
1317  {
1318  const CSpliced_seg &seg = align.GetSegs().GetSpliced();
1319  if (seg.CanGetExons() && seg.GetExons().size() > 1 &&
1320  seg.CanGetProduct_type() &&
1322  seg.CanGetProduct_strand() &&
1324  {
     // The selected exon is the second one counted either from the front
     // (++begin()) or from the back (++rbegin()) of the exon list.
1325  const CSpliced_exon &last_spliced_exon =
1327  ? **++align.GetSegs().GetSpliced().GetExons().begin()
1328  : **++align.GetSegs().GetSpliced().GetExons().rbegin();
1329  if (last_spliced_exon.CanGetProduct_end()) {
1330  return last_spliced_exon.GetProduct_end().GetNucpos();
1331  }
1332  }
1333  }
1334  NCBI_THROW(CSeqalignException, eUnsupported,
1335  "last_splice_site score inapplicable");
     // Not reached when NCBI_THROW throws; kept after the throw in the
     // original.
1336  return 0;
1337  }
1338 };
1341 //////////////////////////////////////////////////////////////////////////////
1344 {
1345 public:
     /// @param row           alignment row on which overlap is measured
     /// @param include_gaps  true: use the whole aligned range of the row;
     ///                      false: use only the aligned bases
1346  CScore_Overlap(int row, bool include_gaps)
1347  : m_Row(row)
1348  , m_IncludeGaps(include_gaps)
1349  {
1350  }
     /// Writes the description, substituting "query"/"subject" for the row
     /// and the range type selected by m_IncludeGaps.
1352  virtual void PrintHelp(CNcbiOstream& ostr) const
1353  {
1354  string row_name = m_Row == 0 ? "query" : "subject";
1355  string range_type = m_IncludeGaps ? "total aligned range" : "aligned bases";
1356  ostr <<
1357  "size of overlap of " + range_type + " with any alignments "
1358  "over the same " + row_name + " sequence that have previously "
1359  "passed this filter. Assumes that input alignments "
1360  "are collated by " + row_name + ", and then sorted by priority for "
1361  "inclusion in the output.";
1362  }
     /// Reports the complexity class of this score as eEasy.
1364  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1366  virtual bool IsInteger() const { return true; };
     /// Size of the intersection between this alignment's range on row
     /// m_Row and the ranges accumulated by UpdateState().
     /// @return covered length of the intersection; 0 when the alignment's
     ///         Seq-id on row m_Row does not match m_CurrentSeq
1368  virtual double Get(const CSeq_align& align, CScope* ) const
1369  {
1370  CRangeCollection<TSeqPos> overlap;
1371  if (align.GetSeq_id(m_Row).Match(m_CurrentSeq)) {
     // Start from everything recorded so far and intersect with this
     // alignment's range.
1372  overlap = m_CoveredRanges;
1373  if (m_IncludeGaps) {
1374  overlap &= align.GetSeqRange(m_Row);
1375  } else {
1376  overlap &= align.GetAlignedBases(m_Row);
1377  }
1378  }
1379  return overlap.GetCoveredLength();
1380  }
     /// Record the range of 'align' on row m_Row in m_CoveredRanges.
     /// When the Seq-id differs from m_CurrentSeq, m_CurrentSeq is replaced.
     /// NOTE(review): original line 1387, inside the branch below, is not
     /// visible in this listing; presumably it resets m_CoveredRanges for
     /// the new sequence - confirm against the full file.
1382  virtual void UpdateState(const objects::CSeq_align& align)
1383  {
1384  const CSeq_id &aligned_id = align.GetSeq_id(m_Row);
1385  if (!aligned_id.Match(m_CurrentSeq)) {
1386  m_CurrentSeq.Assign(aligned_id);
1388  }
1389  if (m_IncludeGaps) {
1390  m_CoveredRanges += align.GetSeqRange(m_Row);
1391  } else {
1392  m_CoveredRanges += align.GetAlignedBases(m_Row);
1393  }
1394  }
1396 private:
     /// Alignment row on which overlap is measured.
     /// NOTE(review): the declarations of m_IncludeGaps, m_CurrentSeq and
     /// m_CoveredRanges are not visible in this listing.
1397  int m_Row;
1401 };
1404 //////////////////////////////////////////////////////////////////////////////
1407 {
1408 public:
     /// @param row           alignment row whose ranges are accumulated
     /// @param include_gaps  true: use the whole aligned range of the row;
     ///                      false: use only the aligned bases
1409  CScore_OverlapBoth(int row, bool include_gaps)
1410  : m_Row(row)
1411  , m_IncludeGaps(include_gaps)
1412  {
1413  }
     /// Writes the description, substituting "query"/"subject" for the row
     /// and the range type selected by m_IncludeGaps.
     /// NOTE(review): the text is identical to the single-row overlap help
     /// and does not mention that ranges are tracked per (query, subject)
     /// pair.
1415  virtual void PrintHelp(CNcbiOstream& ostr) const
1416  {
1417  string row_name = m_Row == 0 ? "query" : "subject";
1418  string range_type = m_IncludeGaps ? "total aligned range" : "aligned bases";
1419  ostr <<
1420  "size of overlap of " + range_type + " with any alignments "
1421  "over the same " + row_name + " sequence that have previously "
1422  "passed this filter. Assumes that input alignments "
1423  "are collated by " + row_name + ", and then sorted by priority for "
1424  "inclusion in the output.";
1425  }
     /// Reports the complexity class of this score as eEasy.
1427  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1429  virtual bool IsInteger() const { return true; };
     /// Size of the intersection between this alignment's range on row
     /// m_Row and the ranges previously recorded for the same (q, s) key.
     /// @return covered length of the intersection; 0 when no entry exists
     ///         for the key
     /// NOTE(review): the lines declaring 'q', 's' and 'it' are not visible
     /// in this listing; UpdateState() builds the key from the Seq-id
     /// handles of rows 0 and 1.
1431  virtual double Get(const CSeq_align& align, CScope* ) const
1432  {
1436  CRangeCollection<TSeqPos> overlap;
1438  m_CoveredRanges.find(make_pair(q, s));
1440  if (it != m_CoveredRanges.end()) {
1441  if (m_IncludeGaps) {
1442  overlap += align.GetSeqRange(m_Row);
1443  } else {
1444  overlap += align.GetAlignedBases(m_Row);
1445  }
     // Intersect this alignment's range with what was recorded for the pair.
1447  overlap &= it->second;
1448  }
1449  return overlap.GetCoveredLength();
1450  }
     /// Add the range of 'align' on row m_Row to the collection stored for
     /// the pair (row-0 Seq-id handle, row-1 Seq-id handle).
     /// NOTE(review): the line naming the container call that yields 'it'
     /// is not visible in this listing; '.first' on its result suggests an
     /// insert-style call - confirm against the full file.
1452  virtual void UpdateState(const objects::CSeq_align& align)
1453  {
1454  CSeq_id_Handle q = CSeq_id_Handle::GetHandle(align.GetSeq_id(0));
1455  CSeq_id_Handle s = CSeq_id_Handle::GetHandle(align.GetSeq_id(1));
1457  TData::iterator it =
1459  (make_pair(q, s),
1460  CRangeCollection<TSeqPos>())).first;
1462  if (m_IncludeGaps) {
1463  it->second += align.GetSeqRange(m_Row);
1464  } else {
1465  it->second += align.GetAlignedBases(m_Row);
1466  }
1467  }
1469 private:
     /// Alignment row whose ranges are accumulated.
     /// NOTE(review): the declarations of m_IncludeGaps, TData and
     /// m_CoveredRanges are not visible in this listing.
1470  int m_Row;
1475 };
1477 //////////////////////////////////////////////////////////////////////////////
1480 {
1481 public:
     /// Constructor: stores the index row in m_Row.
     /// NOTE(review): the constructor signature line is not visible in this
     /// listing.
1483  : m_Row(row)
1484  {
1485  }
     /// Writes the description of the ordinal-position score.
1487  virtual void PrintHelp(CNcbiOstream& ostr) const
1488  {
1489  ostr <<
1490  "restrict to the first N subjects seen for each query";
1491  }
     /// Reports the complexity class of this score as eEasy.
1493  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1495  virtual bool IsInteger() const { return true; };
     /// Return the order in which the sequence on the other row was first
     /// seen for the sequence on row m_Row.
     /// The first distinct partner gets 0, the next 1, and so on, because a
     /// new partner is stored with the current size of the per-sequence map.
     /// Calling Get() therefore updates m_Ids even though it is const.
     /// @return the ordinal stored for the (index id, partner id) pair
1497  virtual double Get(const CSeq_align& align, CScope* ) const
1498  {
1499  int index_row = m_Row;
     // Maps 0 -> 1 and 1 -> 0.
1500  int alt_row = abs(index_row - 1);
1501  CSeq_id_Handle id1 = CSeq_id_Handle::GetHandle(align.GetSeq_id(index_row));
1502  CSeq_id_Handle id2 = CSeq_id_Handle::GetHandle(align.GetSeq_id(alt_row));
     // operator[] creates the per-sequence map on first use.
1503  TOrdinalPos& ranks = m_Ids[id1];
1504  TOrdinalPos::iterator it = ranks.find(id2);
1505  if (it == ranks.end()) {
1506  it = ranks.insert(TOrdinalPos::value_type(id2, ranks.size())).first;
1508  /**
1509  LOG_POST(Error << " q=" << qid
1510  << " s=" << id2
1511  << " ord=" << it->second);
1512  **/
1513  }
1514  return it->second;
1515  }
1517 private:
     /// NOTE(review): the typedefs of TOrdinalPos and TIds are not visible
     /// in this listing.
     /// Row whose Seq-id is used as the outer key of m_Ids.
1521  int m_Row;
     /// Per index-sequence map from partner id to ordinal; mutable because
     /// Get() is const but records new partners.
1522  mutable TIds m_Ids;
1523 };
1526 //////////////////////////////////////////////////////////////////////////////
1529 {
1530 public:
     /// Empty constructor body.
     /// NOTE(review): the constructor signature and initializer lines are
     /// not visible in this listing; Get() uses a member m_ScoreLookup.
1533  {
1534  }
     /// Writes the description of the recomputed BLAST score.
1536  virtual void PrintHelp(CNcbiOstream& ostr) const
1537  {
1538  ostr <<
1539  "Recompute a raw BLAST score for an arbitrary protein-to-DNA "
1540  "alignment, using a Spliced-seg as input. Computation is "
1541  "constrained to accept only protein-to-nucleotide Spliced-seg "
1542  "alignments and is slightly different than the raw BLAST score, "
1543  "in that gap computations differ due to the lack of true "
1544  "composition based statistics. These differences are minimal.";
1545  }
     /// Reports the complexity class of this score as eHard.
1547  virtual EComplexity GetComplexity() const { return eHard; };
     /// The score value is reported as an integer.
1549  virtual bool IsInteger() const { return true; };
     /// Compute the BLAST score of a Spliced-seg alignment through
     /// m_ScoreLookup.GetBlastScore().
     /// @param align  alignment to score; must be a Spliced-seg
     /// @param scope  scope passed to GetBlastScore(); dereferenced
     ///               unconditionally
     /// @throws CSeqalignException (eUnsupported) when the alignment is not
     ///         a Spliced-seg or fails the product-type check
     /// NOTE(review): the right-hand side of the product-type comparison
     /// (original line 1562) is not visible in this listing; the message
     /// indicates a protein product is required.
1551  virtual double Get(const CSeq_align& align, CScope* scope) const
1552  {
1553  // check assumptions:
1554  //
1555  if ( !align.GetSegs().IsSpliced() ) {
1556  NCBI_THROW(CSeqalignException, eUnsupported,
1557  "CScore_TblastnScore: "
1558  "valid only for spliced-seg alignments");
1559  }
1561  if ( align.GetSegs().GetSpliced().GetProduct_type() !=
1563  NCBI_THROW(CSeqalignException, eUnsupported,
1564  "CScore_TblastnScore: "
1565  "valid only for protein spliced-seg alignments");
1566  }
1568  int score = m_ScoreLookup.GetBlastScore(*scope, align);
1570  return score;
1571  }
1573 private:
     /// NOTE(review): the declaration of m_ScoreLookup is not visible in
     /// this listing.
1575 };
1578 //////////////////////////////////////////////////////////////////////////////
1581 {
1582 public:
     /// Empty constructor body.
     /// NOTE(review): the constructor signature and initializer lines are
     /// not visible in this listing; the methods use a member
     /// m_ScoreLookup.
1585  {
1586  }
     /// Writes the description of the score ratio.
1588  virtual void PrintHelp(CNcbiOstream& ostr) const
1589  {
1590  ostr <<
1591  "Adjusted protein score (ratio of actual score to perfect score)";
1592  }
     /// Reports the complexity class of this score as eHard.
1594  virtual EComplexity GetComplexity() const { return eHard; };
     /// Ratio of the alignment's BLAST score to the larger of the two
     /// "perfect" self-alignment scores of its sequences.
     /// @param align  alignment to score
     /// @param scope  scope used for scoring and sequence lookups;
     ///               dereferenced unconditionally
     /// @return score / max(q_perfect, s_perfect), or 0 when that maximum
     ///         is 0
     /// NOTE(review): the line defining 'idh' is not visible in this
     /// listing; the second call uses the Seq-id of row 1, so 'idh' is
     /// presumably the handle for row 0 - confirm against the full file.
1596  virtual double Get(const CSeq_align& align, CScope* scope) const
1597  {
1600  //
1601  // compute the BLAST score
1602  //
1603  int score = m_ScoreLookup.GetBlastScore(*scope, align);
1605  //
1606  // compute the BLAST score for a degenerate perfect alignment for
1607  // the two sequences
1608  //
1609  double q_perfect = x_GetPerfectScore(*scope, idh);
1610  double s_perfect = x_GetPerfectScore
1611  (*scope, CSeq_id_Handle::GetHandle(align.GetSeq_id(1)));
1613  double perfect_score = max(q_perfect, s_perfect);
1614  return perfect_score ? score / perfect_score : 0;
1615  }
1617 private:
     /// BLAST score of a sequence aligned to itself over its full length.
     /// Builds a one-segment Dense-seg with the same Seq-id in both rows,
     /// both starts at 0 and length equal to the Bioseq length, then scores
     /// it with m_ScoreLookup.GetBlastScore().
     /// @param scope  scope used to fetch the Bioseq and to score
     /// @param idh    handle of the sequence to self-align
     /// NOTE(review): the opening line of the statement inside the
     /// '!bsh' branch is not visible in this listing; the remaining text is
     /// an error message, so it presumably throws - confirm.
1620  double x_GetPerfectScore(CScope& scope, const CSeq_id_Handle& idh) const
1621  {
1622  CBioseq_Handle bsh = scope.GetBioseqHandle(idh);
1623  if ( !bsh ) {
1625  "failed to retrieve sequence for " +
1626  idh.AsString());
1627  }
1629  CSeq_align perfect_align;
1630  CDense_seg& seg = perfect_align.SetSegs().SetDenseg();
1631  CRef<CSeq_id> id(new CSeq_id);
1632  id->Assign(*idh.GetSeqId());
     // The same id object is used for both rows of the Dense-seg.
1633  seg.SetIds().push_back(id);
1634  seg.SetIds().push_back(id);
1635  seg.SetNumseg(1);
1636  seg.SetStarts().push_back(0);
1637  seg.SetStarts().push_back(0);
1638  seg.SetLens().push_back(bsh.GetBioseqLength());
1640  return m_ScoreLookup.GetBlastScore(scope, perfect_align);
1641  }
1642 };
1646 {
1647 public:
     /// Constructor: stores which edge exon (m_Edge) and which kind of
     /// information (m_InfoType) this score reports.
     /// NOTE(review): the enum declarations and the constructor signature
     /// line are not visible in this listing; the code below uses the
     /// values e5Prime and eLength.
1653  : m_Edge(edge), m_InfoType(type)
1654  {}
     /// Writes the description, choosing "Length"/"Identity percentage" and
     /// "5'"/"3'" from the configured members.
1656  virtual void PrintHelp(CNcbiOstream& ostr) const
1657  {
1658  ostr << (m_InfoType == eLength ? "Length" : "Identity percentage")
1659  << " of the " << (m_Edge == e5Prime ? "5'" : "3'")
1660  << " exon. Note that this score has "
1661  "meaning only for Spliced-seg alignments, as would be generated "
1662  "by Splign or ProSplign, and only if it has at least one intron.";
1663  }
     /// Reports the complexity class of this score as eEasy.
1665  virtual EComplexity GetComplexity() const { return eEasy; };
     /// Integer only when the length is reported; identity is fractional.
1667  virtual bool IsInteger() const { return m_InfoType == eLength; };
     /// Report the length or percent identity of the first or last exon.
     /// @param align  alignment to inspect; must be a Spliced-seg with at
     ///               least two exons (i.e. at least one intron)
     /// @param scope  scope used only when identity has to be computed;
     ///               dereferenced in that case
     /// @return for eLength, genomic end - genomic start + 1 of the exon;
     ///         otherwise the stored "idty" exon score times 100, or, when
     ///         no such score is stored, CScoreBuilder::GetPercentIdentity()
     ///         over the exon's product span
     /// @throws CSeqalignException (eUnsupported) when the alignment is not
     ///         a Spliced-seg or has fewer than two exons
1669  virtual double Get(const CSeq_align& align, CScope* scope) const
1670  {
     // Reject fewer than two exons, not just exactly one: with an empty
     // exon list the front()/back() calls below would be undefined
     // behaviour, and no exons also means no intron.
1671  if (!align.GetSegs().IsSpliced() ||
1672  align.GetSegs().GetSpliced().GetExons().size() < 2)
1673  {
1674  NCBI_THROW(CSeqalignException, eUnsupported,
1675  "CScore_EdgeExonInfo: "
1676  "valid only for spliced-seg alignments with at least one intron");
1677  }
1678  const CSpliced_seg::TExons &exons =
1679  align.GetSegs().GetSpliced().GetExons();
1680  CConstRef<CSpliced_exon> exon = m_Edge == e5Prime ? exons.front()
1681  : exons.back();
1682  if (m_InfoType == eLength) {
1683  return exon->GetGenomic_end() - exon->GetGenomic_start() + 1;
1684  } else {
     // Prefer an identity value already stored on the exon.
1685  if (exon->IsSetScores()) {
1686  ITERATE (CScore_set::Tdata, score_it, exon->GetScores().Get()) {
1687  if ((*score_it)->CanGetId() && (*score_it)->GetId().IsStr()
1688  && (*score_it)->GetId().GetStr() == "idty")
1689  {
1690  return (*score_it)->GetValue().GetReal() * 100;
1691  }
1692  }
1693  }
1694  /// Exon percent identity not stored; calculate it
1695  TSeqRange product_span;
1696  product_span.Set(exon->GetProduct_start().AsSeqPos(),
1697  exon->GetProduct_end().AsSeqPos());
1698  return CScoreBuilder().GetPercentIdentity(*scope, align,
1699  product_span);
1700  }
1701  }
1703 private:
     /// NOTE(review): the declarations of m_Edge and m_InfoType are not
     /// visible in this listing.
1706 };
1709 {
     // Body of the gene-id lookup for a Bioseq handle 'bsh'.
     // NOTE(review): the function signature line is not visible in this
     // listing; elsewhere in the file the call is CScoreLookup::GetGeneId(bsh).
     //
     // Requires exactly one gene feature on the sequence. Returns the
     // integer tag of its "GeneID" dbxref, or, failing that, the integer
     // tag of a "LocusID" entry in the gene's Db list. Throws CException
     // (eUnknown) when there is no gene feature, more than one, or no
     // usable id.
1710  CFeat_CI gene_it(bsh, CSeqFeatData::e_Gene);
1711  if (!gene_it) {
1712  NCBI_THROW(CException, eUnknown, "No gene feature");
1713  }
1715  CMappedFeat gene = *gene_it;
     // Advancing the iterator once more detects a second gene feature.
1716  if (++gene_it) {
1717  NCBI_THROW(CException, eUnknown, "Multiple gene features");
1718  }
     // NOTE(review): unlike the LocusID path below, the tag is not checked
     // with IsId() before GetId() is called.
1720  if (gene.GetNamedDbxref("GeneID")) {
1721  return gene.GetNamedDbxref("GeneID")->GetTag().GetId();
1722  }
1724  /// Fallback; use LocusID
1725  if (gene.GetData().GetGene().IsSetDb()) {
1726  for (const CRef<CDbtag> &db : gene.GetData().GetGene().GetDb()) {
1727  if (db->GetDb() == "LocusID" && db->GetTag().IsId()) {
1728  return db->GetTag().GetId();
1729  }
1730  }
1731  }
1733  NCBI_THROW(CException, eUnknown, "Gene id not set");
1734 }
1737 {
1738 public:
     /// Constructor: stores the alignment row whose gene id is reported.
     /// NOTE(review): the constructor signature line is not visible in this
     /// listing.
1740  : m_Row(row)
1741  {
1742  }
     /// Writes "Gene ID of query" for row 0, "Gene ID of subject" otherwise.
1744  virtual void PrintHelp(CNcbiOstream& ostr) const
1745  {
1746  ostr << "Gene ID of " << (m_Row == 0 ? "query" : "subject");
1747  }
     /// Reports the complexity class of this score as eEasy.
1749  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1751  virtual bool IsInteger() const { return true; };
     /// Fetch the Bioseq for row m_Row and return its gene id via
     /// CScoreLookup::GetGeneId().
     /// @param align  alignment supplying the Seq-id
     /// @param scope  scope used to fetch the Bioseq; dereferenced
     ///               unconditionally
     /// NOTE(review): the opening line of the statement inside the
     /// '!bsh' branch is not visible in this listing; the remaining text is
     /// an error message, so it presumably throws - confirm.
1753  virtual double Get(const CSeq_align& align, CScope* scope) const
1754  {
1755  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(m_Row));
1756  if ( !bsh ) {
1758  "failed to retrieve sequence for " +
1759  align.GetSeq_id(m_Row).AsFastaString());
1760  }
1761  return CScoreLookup::GetGeneId(bsh);
1762  }
1764 private:
     /// Alignment row whose gene id is reported.
1765  int m_Row;
1766 };
1768 /////////////////////////////////////////////////////////////////////////////
1771 {
1772 public:
     /// Writes the description of the tiebreaker score.
     /// (Spelling of "structural" corrected in the emitted text.)
1774  virtual void PrintHelp(CNcbiOstream& ostr) const
1775  {
1776  ostr << "CRC of the structural parts of the alignment";
1777  }
     /// Reports the complexity class of this score as eEasy.
1779  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1781  virtual bool IsInteger() const { return true; };
     /// @param align  alignment to evaluate; the scope argument is unused
     /// @return the value of CScoreBuilder::ComputeTieBreaker(align)
1783  virtual double Get(const CSeq_align& align, CScope*) const
1784  {
1785  CScoreBuilder Builder;
1786  return Builder.ComputeTieBreaker(align);
1787  }
1788 };
1790 //////////////////////////////////////////////////////////////////////////////
1793 {
1794 public:
     /// Writes the description of the partial-RNA flag score.
1795  virtual void PrintHelp(CNcbiOstream& ostr) const
1796  {
1797  ostr <<
1798  "1 if rna Seq-feat based on this alignment is partial; "
1799  "0 if it is complete";
1800  }
     /// Reports the complexity class of this score as eHard.
1802  virtual EComplexity GetComplexity() const { return eHard; };
     /// The score value is reported as an integer.
1804  virtual bool IsInteger() const { return true; };
     /// Generate features from the alignment and report whether the first
     /// RNA feature found is flagged partial.
     /// @param align  alignment to convert
     /// @param scope  scope given to CFeatureGenerator; dereferenced
     ///               unconditionally
     /// @return 1 when the first RNA feature has its partial flag set and
     ///         true, 0 otherwise
     /// NOTE(review): the opening lines of the final statement (original
     /// lines 1820-1821) are not visible in this listing; the remaining
     /// text is an error message, so it presumably throws when no RNA
     /// feature is produced - confirm.
1806  virtual double Get(const CSeq_align& align, CScope* scope) const
1807  {
1808  CFeatureGenerator generator(*scope);
1809  generator.SetAllowedUnaligned(10);
1811  CConstRef<CSeq_align> clean_align = generator.CleanAlignment(align);
1812  CSeq_annot annot;
1813  CBioseq_set bset;
1814  generator.ConvertAlignToAnnot(*clean_align, annot, bset);
1815  for (const CRef<CSeq_feat> &feat : annot.GetData().GetFtable()) {
1816  if (feat->GetData().IsRna()) {
1817  return feat->IsSetPartial() && feat->GetPartial();
1818  }
1819  }
1822  "Can't generate rna sequence from alignment");
1823  }
1824 };
1826 //////////////////////////////////////////////////////////////////////////////
1829 {
1830 public:
     /// Writes the description of the ribosomal-slippage flag score.
1831  virtual void PrintHelp(CNcbiOstream& ostr) const
1832  {
1833  ostr <<
1834  "1 if query is a mRNA and its coding region has ribosomal "
1835  "slippage; 0 otherwise";
1836  }
     /// Reports the complexity class of this score as eEasy.
1838  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score value is reported as an integer.
1840  virtual bool IsInteger() const { return true; };
     /// Check the first coding-region feature on the row-0 sequence for the
     /// text "ribosomal slippage" in its exception text.
     /// @param align  alignment supplying the row-0 Seq-id
     /// @param scope  scope used to fetch the Bioseq; dereferenced
     ///               unconditionally
     /// @return 1 when a Cdregion feature exists, has exception text, and
     ///         that text contains "ribosomal slippage"; 0 otherwise
     /// NOTE(review): the opening line of the statement inside the
     /// '!bsh' branch is not visible in this listing; the remaining text is
     /// an error message, so it presumably throws - confirm.
1842  virtual double Get(const CSeq_align& align, CScope* scope) const
1843  {
1844  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(0));
1845  if ( !bsh ) {
1847  "failed to retrieve sequence for " +
1848  align.GetSeq_id(0).AsFastaString());
1849  }
     // Only the feature the iterator initially points at is examined.
1851  CFeat_CI feat_it(bsh, CSeqFeatData::e_Cdregion);
1852  return feat_it && feat_it->IsSetExcept_text() &&
1853  feat_it->GetExcept_text().find("ribosomal slippage") != string::npos;
1854  }
1855 };
1857 //////////////////////////////////////////////////////////////////////////////
1860 {
1861 public:
     /// Constructor: stores the alignment row to analyse.
     /// NOTE(review): the constructor signature line is not visible in this
     /// listing.
1863  : m_Row(row)
1864  {
1865  }
     /// Writes the description, naming the query for row 0 and the subject
     /// otherwise.
1867  virtual void PrintHelp(CNcbiOstream& ostr) const
1868  {
1869  ostr <<
1870  "Computes the percent of residues in the aligned "
1871  << (m_Row == 0 ? "query" : "subject")
1872  << " region that would be filtered by 'seg'";
1873  }
     /// Reports the complexity class of this score as eEasy.
1875  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score is a percentage, not an integer.
1877  virtual bool IsInteger() const { return false; };
     /// Percentage of residues in the aligned range of row m_Row that fall
     /// inside the locations returned by SeqBufferSeg().
     /// @param align  alignment supplying the Seq-id and range of the row
     /// @param scope  scope used to fetch the Bioseq; dereferenced
     ///               unconditionally
     /// @return 100 * (marked residues) / (length of the fetched sequence)
     /// NOTE(review): several lines of this method are not visible in this
     /// listing (the openers of the two error statements, the declarations
     /// of 'vec' and 'sp', and the buffer argument of SeqBufferSeg); the
     /// visible lines are kept as they are.
1879  virtual double Get(const CSeq_align& align, CScope* scope) const
1880  {
1881  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(m_Row));
1882  if ( !bsh ) {
     // Report the id of the row that was actually looked up (m_Row), not
     // always row 0.
1884  "failed to retrieve sequence for " +
1885  align.GetSeq_id(m_Row).AsFastaString());
1886  }
1888  if ( !bsh.IsProtein() ) {
1890  "alignment filter requires that the requested "
1891  "sequence be a protein");
1892  }
1894  TSeqRange r = align.GetSeqRange(m_Row);
1895  string seq;
1897  vec.GetSeqData(r.GetFrom(), r.GetTo(), seq);
     // seq_iupac is referenced only by the commented-out debug output at
     // the end of this method.
1899  string seq_iupac;
1901  vec.GetSeqData(r.GetFrom(), r.GetTo(), seq_iupac);
1903  //
1904  // this uses lower-level calls in BLAST to run 'seg' on the covered
1905  // sequence
1906  //
1909  BlastSeqLoc* seq_locs = NULL;
1910  SeqBufferSeg((unsigned char *), seq.size(), 0, sp, &seq_locs);
1911  SegParametersFree(sp);
1913  // now, count how many masked residues we have
     // One slot per residue, so positions covered by more than one returned
     // location are counted once.
1914  vector<size_t> counts(seq.size(), 0);
1915  for (BlastSeqLoc *itr = seq_locs; itr; itr = itr->next) {
1916  for (int i = itr->ssr->left; i <= itr->ssr->right; ++ i) {
1917  counts[i] = 1;
1918  }
1919  //cerr << " seg range: [" << itr->ssr->left << ".." << itr->ssr->right << "]: " << itr->ssr->right - itr->ssr->left + 1 << " / " << pos.size() << " total" << endl;
1920  }
1921  BlastSeqLocFree(seq_locs);
1923  // report the number of masked residues
1924  size_t count_x = 0;
1925  for (const auto& i : counts) {
1926  count_x += i;
1927  }
     // NOTE(review): seq.size() is not checked for zero before dividing.
1928  double val = count_x * 100.0 / seq.size();
1930  /**
1931  CSeq_id_Handle idh = sequence::GetId(bsh, sequence::eGetId_Best);
1932  cerr
1933  << idh << "(" << r << "): seg-pct = " << val
1934  << ", seq = " << seq_iupac
1935  << endl;
1936  **/
1938  return val;
1939  }
1941 private:
     /// Alignment row to analyse.
1942  size_t m_Row;
1943 };
1945 //////////////////////////////////////////////////////////////////////////////
1948 {
1949 public:
     /// Constructor: stores the alignment row to analyse.
     /// NOTE(review): the constructor signature line is not visible in this
     /// listing.
1951  : m_Row(row)
1952  {
1953  }
     /// Writes the description, naming the query for row 0 and the subject
     /// otherwise.
1955  virtual void PrintHelp(CNcbiOstream& ostr) const
1956  {
1957  ostr <<
1958  "Computes the value of Shannon's entropy for the specified "
1959  "aligned "
1960  << (m_Row == 0 ? "query" : "subject") << " region";
1961  }
     /// Reports the complexity class of this score as eEasy.
1963  virtual EComplexity GetComplexity() const { return eEasy; };
     /// The score is not an integer.
1965  virtual bool IsInteger() const { return false; };
     /// Entropy of the aligned range of row m_Row, as returned by
     /// ComputeNormalizedProteinEntropy().
     /// @param align  alignment supplying the Seq-id and range of the row
     /// @param scope  scope used to fetch the Bioseq; dereferenced
     ///               unconditionally
     /// @return the entropy value computed with word size 1 for protein
     ///         sequences and 4 otherwise
     /// NOTE(review): the opener of the error statement and the declaration
     /// of 'vec' are not visible in this listing; the visible lines are
     /// kept as they are.
1967  virtual double Get(const CSeq_align& align, CScope* scope) const
1968  {
1969  CBioseq_Handle bsh = scope->GetBioseqHandle(align.GetSeq_id(m_Row));
1970  if ( !bsh ) {
     // Report the id of the row that was actually looked up (m_Row), not
     // always row 0.
1972  "failed to retrieve sequence for " +
1973  align.GetSeq_id(m_Row).AsFastaString());
1974  }
1975  TSeqRange r = align.GetSeqRange(m_Row);
1976  string seq;
1978  vec.GetSeqData(r.GetFrom(), r.GetTo(), seq);
     // Words of 4 residues by default, single residues for proteins.
1980  int word_size = 4;
1981  if (bsh.IsProtein()) {
1982  word_size = 1;
1983  }
1984  double val = ComputeNormalizedProteinEntropy(seq, word_size);
1986  /**
1987  CSeq_id_Handle idh = sequence::GetId(bsh, sequence::eGetId_Best);
1988  cerr
1989  << idh << "(" << r << "): entropy = " << val
1990  << ", seq = " << seq
1991  << endl;
1992  **/
1994  return val;
1995  }
1997 private:
     /// Alignment row to analyse.
1998  size_t m_Row;
1999 };
2001 /////////////////////////////////////////////////////////////////////////////
2005 {
     // Registration of the computed score tokens: each visible pair below is
     // a score name together with the IScore object that implements it.
     // NOTE(review): the function signature and the opening line of every
     // registration statement are not visible in this listing, and for many
     // entries the implementing object is missing as well; presumably each
     // statement inserts a (name, CIRef<IScore>) pair into m_Scores -
     // confirm against the full file.

     // --- alignment length and gap scores ---
2008  ("align_length_ungap",
2009  CIRef<IScore>(new CScore_AlignLength(false /* include gaps */))));
2012  ("gap_count",
2013  CIRef<IScore>(new CScore_GapCount(false))));
2016  ("gap_basecount",
2017  CIRef<IScore>(new CScore_GapCount(true))));
2020  ("query_gap_length",
2021  CIRef<IScore>(new CScore_GapCount(true, 0))));
2024  ("subject_gap_length",
2025  CIRef<IScore>(new CScore_GapCount(true, 1))));
2028  ("product_gap_length",
2029  CIRef<IScore>(new CScore_GapCount(true, 0, true))));
2032  ("genomic_gap_length",
2033  CIRef<IScore>(new CScore_GapCount(true, 1, true))));
     // --- frameshift / indel scores ---
2036  ("frame",
2040  ("qframe",
2044  ("sframe",
2048  ("nonframe_indel",
2049  CIRef<IScore>(new CScore_FrameShifts(-1, false))));
2052  ("qnonframe_indel",
2053  CIRef<IScore>(new CScore_FrameShifts(0, false))));
2056  ("snonframe_indel",
2057  CIRef<IScore>(new CScore_FrameShifts(1, false))));
2060  ("symmetric_overlap",
2065  ("symmetric_overlap_min",
2070  ("3prime_unaligned",
2075  ("polya", CIRef<IScore>(new CScore_Polya)));
2079  ("min_exon_len",
2084  ("max_intron_len",
2089  ("longest_gap",
     // "query_start" and "5prime_unaligned" share one score object.
2092  {{
2093  CIRef<IScore> score(new CScore_AlignStartStop(0, true));
2096  ("query_start", score));
2099  ("5prime_unaligned", score));
2102  ("query_end",
2103  CIRef<IScore>(new CScore_AlignStartStop(0, false))));
2104  }}
2108  ("internal_unaligned",
     // --- coding-region scores ---
2113  ("cds_internal_stops",
2117  ("cds_start",
2121  ("cds_end",
2125  ("cds_pct_identity",
2129  ("cds_pct_coverage",
     // --- coverage, position and length scores ---
2134  ("query_coverage",
2135  CIRef<IScore>(new CScore_Coverage(0))));
2139  ("subject_coverage",
2140  CIRef<IScore>(new CScore_Coverage(1))));
2144  ("align_length_ratio",
2149  ("subject_start",
2150  CIRef<IScore>(new CScore_AlignStartStop(1, true))));
2153  ("subject_end",
2154  CIRef<IScore>(new CScore_AlignStartStop(1, false))));
     // "query_length" and "product_length" share one score object.
2156  {{
2157  CIRef<IScore> score(new CScore_SequenceLength(0));
2160  ("query_length", score));
2163  ("product_length", score));
2166  ("subject_length",
2168  }}
     // --- taxonomy scores ---
2172  ("query_taxid",
2173  CIRef<IScore>(new CScore_Taxid(0))));
2176  ("subject_taxid",
2177  CIRef<IScore>(new CScore_Taxid(1))));
2180  ("query_species",
2181  CIRef<IScore>(new CScore_Taxid(0, "species"))));
2184  ("subject_species",
2185  CIRef<IScore>(new CScore_Taxid(1, "species"))));
2189  ("last_splice_site",
2194  ("exon_count",
     // --- overlap scores (use state accumulated through UpdateState) ---
2199  ("query_overlap",
2200  CIRef<IScore>(new CScore_Overlap(0, true))));
2204  ("query_overlap_nogaps",
2205  CIRef<IScore>(new CScore_Overlap(0, false))));
2209  ("subject_overlap",
2210  CIRef<IScore>(new CScore_Overlap(1, true))));
2214  ("subject_overlap_nogaps",
2215  CIRef<IScore>(new CScore_Overlap(1, false))));
2219  ("query_subject_overlap",
2220  CIRef<IScore>(new CScore_OverlapBoth(1, true))));
2224  ("query_subject_overlap_nogaps",
2225  CIRef<IScore>(new CScore_OverlapBoth(1, false))));
     // The ordinal scores index on the opposite row: "subject_ordinal_pos"
     // is built with row 0 and "query_ordinal_pos" with row 1.
2229  ("subject_ordinal_pos",
2230  CIRef<IScore>(new CScore_OrdinalPos(0))));
2234  ("query_ordinal_pos",
2235  CIRef<IScore>(new CScore_OrdinalPos(1))));
     // These two scores receive a reference to this CScoreLookup.
2239  ("prosplign_tblastn_score",
2240  CIRef<IScore>(new CScore_TblastnScore(*this))));
2244  ("blast_score_ratio",
2245  CIRef<IScore>(new CScore_BlastRatio(*this))));
2249  ("start_codon",
2250  CIRef<IScore>(new CScore_StartStopCodon(true))));
2254  ("stop_codon",
2255  CIRef<IScore>(new CScore_StartStopCodon(false))));
     // --- edge-exon scores ---
2259  ("5prime_exon_len",
2266  ("3prime_exon_len",
2273  ("5prime_exon_pct_identity",
2280  ("3prime_exon_pct_identity",
2287  ("query_geneid",
2288  CIRef<IScore>(new CScore_GeneID(0))));
2291  ("subject_geneid",
2292  CIRef<IScore>(new CScore_GeneID(1))));
     // --- sequence-complexity scores ---
2296  ("query_entropy",
2297  CIRef<IScore>(new CScore_Entropy(0))));
2300  ("subject_entropy",
2301  CIRef<IScore>(new CScore_Entropy(1))));
2305  ("query_seg_pct",
2306  CIRef<IScore>(new CScore_SegPct(0))));
2309  ("subject_seg_pct",
2310  CIRef<IScore>(new CScore_SegPct(1))));
2314  ("min_indel_to_splice",
2319  ("partial",
2320  CIRef<IScore>(new CScore_Partial())));
2324  ("ribosomal_slippage",
2329  ("tiebreaker",
2331 }
/// Forward 'align' to the UpdateState() method of score objects stored in
/// m_Scores, so that stateful scores can record it.
/// NOTE(review): the loop header (original line 2336) is not visible in this
/// listing, so the set of names iterated over cannot be confirmed here.
2334 void CScoreLookup::UpdateState(const objects::CSeq_align& align)
2335 {
2337  m_Scores[*it]->UpdateState(align);
2338  }
2339 }
     /// Print one dictionary entry: a " * <name>" line followed by the help
     /// text for that name, wrapped by NStr::Wrap() at width 72, one
     /// fragment per line.
     /// NOTE(review): the first line of the signature is not visible in this
     /// listing; the body uses an output stream named 'ostr'.
2342  const string &score_name)
2343 {
2344  ostr << " * " << score_name << endl;
2346  list<string> tmp;
2347  NStr::Wrap(HelpText(score_name), 72, tmp);
2348  ITERATE (list<string>, i, tmp) {
2349  ostr << " " << *i << endl;
2350  }
2351 }
2354 {
     // Prints two sections to 'ostr': the built-in score names, then the
     // computed tokens, each entry via x_PrintDictionaryEntry().
     // NOTE(review): the function signature and both loop headers are not
     // visible in this listing.
     // The heading below previously read "Build-in"; the spelling is
     // corrected to "Built-in".
2355  ostr << "Built-in score names: " << endl;
2357  x_PrintDictionaryEntry(ostr, it->first);
2358  }
2359  ostr << endl;
2361  ostr << "Computed tokens: " << endl;
2363  x_PrintDictionaryEntry(ostr, it->first);
2364  }
2365 }
/// Return the help text for a score name.
/// Lookup order: names in CSeq_align::ScoreNameMap() (text from
/// CSeq_align::HelpText()), then tokens in m_Scores (text from the token's
/// PrintHelp()), then a generic fallback string.
/// A name found in m_Scores is also recorded in m_ScoresUsed.
/// @param score_name  name to describe
/// NOTE(review): the line declaring 'score_it' is not visible in this
/// listing.
2367 string CScoreLookup::HelpText(const string &score_name)
2368 {
2370  CSeq_align::ScoreNameMap().find(score_name);
2371  if (score_it != CSeq_align::ScoreNameMap().end()) {
2372  return CSeq_align::HelpText(score_it->second);
2373  }
2375  TScoreDictionary::const_iterator token_it = m_Scores.find(score_name);
2376  if (token_it != m_Scores.end()) {
2377  m_ScoresUsed.insert(score_name);
2378  CNcbiOstrstream os;
2379  token_it->second->PrintHelp(os);
2380  return string(CNcbiOstrstreamToString(os));
2381  }
2383  return "assumed to be a score on the Seq-align";
2384 }
/// Return the complexity class of a score name.
/// Names in CSeq_align::ScoreNameMap() are eEasy; tokens in m_Scores report
/// their own GetComplexity().
/// @param score_name  name to classify
/// @throws CAlgoAlignUtilException (eScoreNotFound) for any other name
/// NOTE(review): the return-type line of the signature and the line
/// declaring 'score_it' are not visible in this listing.
2387 Complexity(const string &score_name)
2388 {
2390  CSeq_align::ScoreNameMap().find(score_name);
2391  if (score_it != CSeq_align::ScoreNameMap().end()) {
2392  return IScore::eEasy;
2393  }
2395  TScoreDictionary::const_iterator token_it = m_Scores.find(score_name);
2396  if (token_it != m_Scores.end()) {
2397  return token_it->second->GetComplexity();
2398  }
2400  NCBI_THROW(CAlgoAlignUtilException, eScoreNotFound, score_name);
2401 }
/// Tell whether a score name denotes an integer-valued score.
/// Lookup order: names in CSeq_align::ScoreNameMap(), then tokens in
/// m_Scores, then a score stored on the alignment under that string id.
/// @param align       alignment whose stored scores are consulted last
/// @param score_name  name to classify
/// @return the integer flag from the first source that knows the name;
///         false when none does
/// NOTE(review): the line declaring 'score_it' is not visible in this
/// listing.
2403 bool CScoreLookup::IsIntegerScore(const objects::CSeq_align& align,
2404  const string &score_name)
2405 {
2407  CSeq_align::ScoreNameMap().find(score_name);
2408  if (score_it != CSeq_align::ScoreNameMap().end()) {
2409  return CSeq_align::IsIntegerScore(score_it->second);
2410  }
2412  TScoreDictionary::const_iterator token_it = m_Scores.find(score_name);
2413  if (token_it != m_Scores.end()) {
2414  return token_it->second->IsInteger();
2415  }
     // Fall back to the type of a value stored on the alignment itself.
2417  ITERATE (CSeq_align::TScore, stored_score_it, align.GetScore()) {
2418  if ((*stored_score_it)->CanGetValue() &&
2419  (*stored_score_it)->CanGetId() &&
2420  (*stored_score_it)->GetId().IsStr() &&
2421  (*stored_score_it)->GetId().GetStr() == score_name)
2422  {
2423  return (*stored_score_it)->GetValue().IsInt();
2424  }
2425  }
2426  return false;
2427 }
/// Return the value of a named score for an alignment.
/// Lookup order: a score already stored on the alignment, then a built-in
/// score computed with ComputeScore(), then a computed token from m_Scores
/// (which is also recorded in m_ScoresUsed).
/// @param align       alignment to evaluate
/// @param score_name  name of the score
/// @throws CAlgoAlignUtilException (eScoreNotFound) when the name is
///         unknown to all three sources
/// NOTE(review): the line that creates m_Scope when it is null and the line
/// declaring 'score_it' are not visible in this listing.
2429 double CScoreLookup::GetScore(const objects::CSeq_align& align,
2430  const string &score_name)
2431 {
2432  double score;
2433  if (align.GetNamedScore(score_name, score)) {
2434  return score;
2435  }
     // A scope is only set up once a score actually has to be computed.
2437  if (m_Scope.IsNull()) {
2439  m_Scope->AddDefaults();
2440  }
2442  /// Score not found in alignment; look for it among built-in scores
2444  CSeq_align::ScoreNameMap().find(score_name);
2445  if (score_it != CSeq_align::ScoreNameMap().end()) {
2446  return ComputeScore(*m_Scope, align, score_it->second);
2447  }
2449  /// Not a built-in score; look for it among computed tokens
2450  TScoreDictionary::const_iterator token_it = m_Scores.find(score_name);
2451  if (token_it != m_Scores.end()) {
2452  m_ScoresUsed.insert(score_name);
2453  return token_it->second->Get(align, &*m_Scope);
2454  }
2456  NCBI_THROW(CAlgoAlignUtilException, eScoreNotFound, score_name);
2457 }
