NCBI C++ ToolKit
select_alignments_alt.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: select_alignments_alt.cpp 68538 2015-08-11 20:18:41Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
35 #include <algo/gnomon/annot.hpp>
36 
37 #include <map>
38 #include <list>
39 #include <sstream>
40 
41 
43 BEGIN_SCOPE(gnomon)
44 
45 bool CModelCompare::CanBeConnectedIntoOne(const CGeneModel& a, const CGeneModel& b)
46 {
47  if (a.Strand() != b.Strand())
48  return false;
49  if (Precede(a.Limits(),b.Limits())) {
50  if(a.OpenRightEnd() && b.OpenLeftEnd())
51  return true;
52  } else if (Precede(b.Limits(),a.Limits())) {
53  if(b.OpenRightEnd() && a.OpenLeftEnd())
54  return true;
55  }
56  return false;
57 }
58 
59 
61  size_t commonspl = 0;
62  if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return commonspl;
63  for(size_t i = 1; i < a.Exons().size(); ++i) {
64  for(size_t j = 1; j < b.Exons().size(); ++j) {
65  if(a.Exons()[i-1].GetTo() == b.Exons()[j-1].GetTo())
66  ++commonspl;
67  if(a.Exons()[i].GetFrom() == b.Exons()[j].GetFrom() )
68  ++commonspl;
69  }
70  }
71 
72  return commonspl;
73 }
74 
75 
76 bool CModelCompare::AreSimilar(const CGeneModel& a, const CGeneModel& b, int tolerance)
77 {
78  if(a.Strand() != b.Strand())
79  return false;
80 
81  if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().NotEmpty()) {
82  if(!a.ReadingFrame().IntersectingWith(b.ReadingFrame()) || a.GetCdsInfo().PStops() != b.GetCdsInfo().PStops())
83  return false;
84 
85  if(a.Exons().size() == 1 && b.Exons().size()==1) { // both coding; reading frames intersecting
86  TSignedSeqRange acds = a.GetCdsInfo().Cds();
87  TSignedSeqRange bcds = b.GetCdsInfo().Cds();
88  int common_point = (acds & bcds).GetFrom();
89  if(a.FShiftedLen(acds.GetFrom(),common_point,false)%3 != b.FShiftedLen(bcds.GetFrom(),common_point,false)%3) // different frames
90  return false;
91  }
92  }
93 
94  TSignedSeqRange intersection = a.Limits() & b.Limits();
95  TSignedSeqPos mutual_min = intersection.GetFrom();
96  TSignedSeqPos mutual_max = intersection.GetTo();
97 
98  int amin = 0;
99  while(amin < (int)a.Exons().size() && a.Exons()[amin].GetTo() < mutual_min) ++amin;
100  if(amin == (int)a.Exons().size()) return false;
101 
102  int amax = (int)a.Exons().size()-1;
103  while(amax >=0 && a.Exons()[amax].GetFrom() > mutual_max) --amax;
104  if(amax < 0) return false;
105 
106  int bmin = 0;
107  while(bmin < (int)b.Exons().size() && b.Exons()[bmin].GetTo() < mutual_min) ++bmin;
108  if(bmin == (int)b.Exons().size()) return false;
109 
110  int bmax = (int)b.Exons().size()-1;
111  while(bmax >=0 && b.Exons()[bmax].GetFrom() > mutual_max) --bmax;
112  if(bmax < 0) return false;
113 
114  if(amax-amin != bmax-bmin) return false;
115 
116 // head-to-tail overlap
117  if (amin != 0 || size_t(amax) != a.Exons().size()-1 || bmin != 0 || size_t(bmax) != b.Exons().size()-1)
118  return false;
119 
120  for( ; amin <= amax; ++amin, ++bmin) {
121  if(abs(max(mutual_min,a.Exons()[amin].GetFrom())-max(mutual_min,b.Exons()[bmin].GetFrom())) >= tolerance)
122  return false;
123  if(abs(min(mutual_max,a.Exons()[amin].GetTo())-min(mutual_max,b.Exons()[bmin].GetTo())) >= tolerance)
124  return false;
125  }
126 
127  return true;
128 }
129 
130 
131 bool CModelCompare::BadOverlapTest(const CGeneModel& a, const CGeneModel& b) { // returns true for bad overlap
132  if((!a.TrustedmRNA().empty() || !a.TrustedProt().empty()) && (!b.TrustedmRNA().empty() || !b.TrustedProt().empty()))
133  return false;
134  // else if(a.Limits().IntersectingWith(b.Limits()) && (a.Exons().size() == 1 || b.Exons().size() == 1))
135  // return true;
136  else
137  return CountCommonSplices(a,b) > 0;
138 }
139 
140 
141 bool CModelCompare::RangeNestedInIntron(TSignedSeqRange r, const CGeneModel& algn, bool check_in_holes) {
142  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
143  if(check_in_holes || (algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice)) {
144  TSignedSeqRange intron(algn.Exons()[i-1].GetTo()+1,algn.Exons()[i].GetFrom()-1);
145  if(Include(intron, r))
146  return true;
147  }
148  }
149  return false;
150 }
151 
152 
154  if(a.Strand() != b.Strand() || !a.IntersectingWith(b)) return false;
155 
156  for(unsigned int i = 0; i < a.Exons().size(); ++i) {
157  for(unsigned int j = 0; j < b.Exons().size(); ++j) {
158  if(a.Exons()[i] == b.Exons()[j])
159  return true;
160  }
161  }
162 
163  for(unsigned int i = 1; i < a.Exons().size(); ++i) {
164  TSignedSeqRange introna(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
165  for(unsigned int j = 1; j < b.Exons().size(); ++j) {
166  TSignedSeqRange intronb(b.Exons()[j-1].GetTo()+1,b.Exons()[j].GetFrom()-1);
167  if(introna == intronb)
168  return true;
169  }
170  }
171 
172  return false;
173 }
174 
175 
177  TGeneModelList& dest)
178 {
179  NON_CONST_ITERATE(TGeneModelList, im, chains) {
180  CGeneModel& model = *im;
181  CCDSInfo cds_info = model.GetCdsInfo();
182  if(cds_info.ReadingFrame().Empty())
183  continue;
184 
185  if(cds_info.IsMappedToGenome())
186  cds_info = cds_info.MapFromOrigToEdited(model.GetAlignMap());
187  TSignedSeqRange cds = cds_info.Cds();
188 
189  bool gapfilled = false;
190  int genome_cds = 0;
191  for(int ie = 0; ie < (int)model.Exons().size(); ++ie) {
192  if(model.Exons()[ie].Limits().Empty())
193  gapfilled = true;
194  else
195  genome_cds += (cds&model.TranscriptExon(ie)).GetLength();
196  }
197 
198  if(gapfilled && genome_cds < 45) {
199  model.Status() |= CGeneModel::eSkipped;
200  model.AddComment("Most CDS in genomic gap");
201  }
202  }
203 
204  ITERATE(TGeneModelList, it, chains) {
205  if(it->Status()&CGeneModel::eSkipped) {
206  bad_aligns.push_back(*it);
207  } else {
208  dest.push_back(*it);
209  }
210  }
211 }
212 
213 
215 {
216  TGeneModelList models;
217  FilterGenes(chains, bad_aligns, models);
218  return models;
219 }
220 
221 END_SCOPE(gnomon)
CCDSInfo MapFromOrigToEdited(const CAlignMap &amap) const
bool IsMappedToGenome() const
TSignedSeqRange Cds() const
TSignedSeqRange ReadingFrame() const
unsigned int & Status()
const TExons & Exons() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange TranscriptExon(int i) const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
void FilterGenes(TGeneModelList &chains, TGeneModelList &bad_aligns, TGeneModelList &dest)
Filters genes, excluding genes flagged to be skipped.
static bool HaveCommonExonOrIntron(const CGeneModel &a, const CGeneModel &b)
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
static size_t CountCommonSplices(const CGeneModel &a, const CGeneModel &b)
static bool AreSimilar(const CGeneModel &a, const CGeneModel &b, int tolerance)
static bool BadOverlapTest(const CGeneModel &a, const CGeneModel &b)
#define bool
Definition: bool.h:34
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define const
Definition: zconf.h:232
Modified on Sat Apr 13 11:46:18 2024 by modify_doxy.py rev. 669887