NCBI C++ ToolKit
feature_match.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #include <ncbi_pch.hpp>
2 
3 #include <corelib/ncbistd.hpp>
4 #include <corelib/ncbiobj.hpp>
5 
7 #include <objmgr/mapped_feat.hpp>
8 
9 #include <objects/seq/Bioseq.hpp>
10 
11 //#include <objmgr/bioseq_handle.hpp>
12 
14 #include <objmgr/util/sequence.hpp>
15 #include <objmgr/util/feature.hpp>
16 
18 
19 
22 
23 //using namespace sequence;
24 //LCOV_EXCL_START
25 //not currently using
26 bool CMatchFeat::operator < (const CMatchFeat& o) const
27 {
28  const CMatchFeat& l = *this;
29  const CMatchFeat& r = o;
30 
31  const CSeq_feat& lf = l.GetFeat();
32  const CSeq_feat& rf = r.GetFeat();
33 
34  if (l.m_pos_start != r.m_pos_start)
35  return l.m_pos_start < r.m_pos_start;
36 
37  if (l.m_pos_stop != r.m_pos_stop)
38  return l.m_pos_stop < r.m_pos_stop;
39 
40  if (lf.Compare(rf) < 0)
41  return true;
42 
43  return false;
44 }
45 
47 {
48  template<class _T>
49  bool operator()(const CRef<_T>& l, const CRef<_T>& r) const
50  {
51  return *l < *r;
52  }
53 };
54 
55 template<typename TFeatList>
57  vector < CRef<CMatchCDS> > & cds_list,
58  vector < CRef<CMatchmRNA> > & mrna_list,
59  const TFeatList & feat_list, CScope* scope, ENoteCDS eNoteCDS)
60 {
61  // predict the size of the final arrays
62  cds_list.reserve(feat_list.size() / 2);
63  mrna_list.reserve(feat_list.size() / 2);
64 
65  // fill in cds_list, mrna_list
66  for (auto& feat_it : feat_list) {
67  const CSeq_feat& feature = feat_it.GetOriginalFeature();
68  if (feature.IsSetData()) {
69  if (feature.GetData().IsCdregion()) {
70  cds_list.push_back(Ref(new CMatchCDS(feat_it)));
71  }
72  else
73  if (feature.GetData().IsRna() &&
75  mrna_list.push_back(Ref(new CMatchmRNA(feat_it)));
76  }
77  }
78  }
79 
80  // sort them
81  sort(cds_list.begin(), cds_list.end(), feat_loc_compare());
82  sort(mrna_list.begin(), mrna_list.end(), feat_loc_compare());
83 
84  if (cds_list.empty() || mrna_list.empty())
85  return;
86 
87  auto mrna_begin = mrna_list.begin();
88 
89  size_t compare_attempts = 0;
90  for (auto& cds_it : cds_list)
91  {
92  CMatchCDS& cds_match = *cds_it;
93  const CSeq_feat& cds = cds_match.GetFeat();
94 
96  if (cds.IsSetExcept_text()
97  && (NStr::FindNoCase(cds.GetExcept_text(), "ribosomal slippage") != string::npos
98  || NStr::FindNoCase(cds.GetExcept_text(), "trans-splicing") != string::npos)) {
99  overlap_type = sequence::eOverlap_SubsetRev;
100  }
101 
102 
103  while (mrna_begin != mrna_list.end() &&
104  (**mrna_begin).maxpos() < cds_match.minpos())
105  mrna_begin++;
106 
107  for (auto mrna_it = mrna_begin;
108  mrna_it != mrna_list.end() &&
109  (**mrna_it).minpos() <= cds_match.maxpos() &&
110  cds_match.minpos() <= (**mrna_it).maxpos();
111  ++mrna_it)
112  {
113  CMatchmRNA& mrna_match = **mrna_it;
114  const CSeq_feat& mrna = mrna_match.GetFeat();
115  if (!mrna_match.IsAccountedFor()) {
116  compare_attempts++;
117  if (TestForOverlapEx(cds.GetLocation(), mrna.GetLocation(), overlap_type, scope) >= 0) {
118  cds_match.AddmRNA(mrna_match);
119  if (eNoteCDS == eNoteCDS_No) {
120  mrna_match.AddCDS(cds_match);
121  }
122  if (validator::s_IdXrefsAreReciprocal(cds, mrna)) {
123  cds_match.SetXrefMatch(mrna_match);
124  if (eNoteCDS == eNoteCDS_No) {
125  mrna_match.SetCDS(cds);
126  }
127  else {
128  mrna_match.SetAccountedFor(true);
129  }
130  mrna_match.SetCDS(cds);
131  }
132  }
133  else
134  {
135  //cout << "problem" << endl;
136  }
137  }
138  }
139 
140  if (!cds_match.HasmRNA()) {
141  cds_match.AssignSinglemRNA();
142  }
143  }
144  // cout << "Totals CDS, mRNAs, compare attempts:" << cds_list.size() << " " << mrna_list.size() << " " << compare_attempts << endl;
145 }
146 
147 CMatchFeat::CMatchFeat(const CMappedFeat &feat) : m_feat(feat.GetSeq_feat())
148 {
151 }
152 
153 
155 {
156  bool rval = false;
157 
158  if (m_Cds) {
159  return true;
160  }
161  else {
162  // iterate through underlying cdss that aren't accounted for
163  vector < CRef<CMatchCDS> >::iterator cds_it =
164  m_UnderlyingCDSs.begin();
165  for (; cds_it != m_UnderlyingCDSs.end() && !rval; ++cds_it) {
166  if (!(*cds_it)->HasmRNA()) {
167  return true;
168  }
169  }
170  }
171  return false;
172 }
173 
174 
175 bool CMatchmRNA::MatchesUnderlyingCDS(unsigned int partial_type) const
176 {
177  bool rval = false;
178 
181 
182  if (m_Cds) {
183  if (partial_type == sequence::eSeqlocPartial_Nostart) {
184  if (m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
185  rval = true;
186  }
187  else {
188  rval = false;
189  }
190  }
191  else if (partial_type == sequence::eSeqlocPartial_Nostop) {
192  if (m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
193  rval = true;
194  }
195  else {
196  rval = false;
197  }
198  }
199 #if 0
200  }
201  else {
202  // iterate through underlying cdss that aren't accounted for
203  vector < CRef<CMatchCDS> >::iterator cds_it =
204  m_UnderlyingCDSs.begin();
205  while (cds_it != m_UnderlyingCDSs.end() && !rval) {
206  if (!(*cds_it)->HasmRNA()) {
207  if (partial_type == eSeqlocPartial_Nostart) {
208  if ((*cds_it)->m_Cds->GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
209  rval = true;
210  }
211  else {
212  rval = false;
213  }
214  }
215  else if (partial_type == eSeqlocPartial_Nostop) {
216  if ((*cds_it)->m_Cds->GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
217  rval = true;
218  }
219  else {
220  rval = false;
221  }
222  }
223  }
224  ++cds_it;
225  }
226 #endif
227  }
228  return rval;
229 }
230 
231 
232 bool CMatchmRNA::MatchAnyUnderlyingCDS(unsigned int partial_type) const
233 {
234  bool rval = false;
235 
238 
239  // iterate through underlying cdss that aren't accounted for
240  vector < CRef<CMatchCDS> >::const_iterator cds_it = m_UnderlyingCDSs.begin();
241  while (cds_it != m_UnderlyingCDSs.end() && !rval) {
242  if (partial_type == sequence::eSeqlocPartial_Nostart) {
243  if ((*cds_it)->GetFeat().GetLocation().GetStart(eExtreme_Biological) == mrna_start) {
244  rval = true;
245  }
246  else {
247  rval = false;
248  }
249  }
250  else if (partial_type == sequence::eSeqlocPartial_Nostop) {
251  if ((*cds_it)->GetFeat().GetLocation().GetStop(eExtreme_Biological) == mrna_stop) {
252  rval = true;
253  }
254  else {
255  rval = false;
256  }
257  }
258  ++cds_it;
259  }
260  return rval;
261 }
262 
263 
264 // only assign if the coding region has only one overlapping unaccounted for mRNA
266 {
268 
269  vector < CRef<CMatchmRNA> >::iterator mrna_it =
270  m_OverlappingmRNAs.begin();
271  for (; mrna_it != m_OverlappingmRNAs.end(); ++mrna_it) {
272  if (!(*mrna_it)->IsAccountedFor()) {
273  if (!match) {
274  match.Reset(*mrna_it);
275  }
276  else {
277  // found more than one, can't assign either
278  match.Reset();
279  break;
280  }
281  }
282 
283  }
284  if (match) {
286  match->SetCDS(*m_feat);
287  }
288 }
289 
290 
291 int CMatchCDS::GetNummRNA(bool &loc_unique)
292 {
293  int num = 0;
294  loc_unique = true;
295 
296  if (HasmRNA()) {
297  // count the one assigned
298  num++;
299  }
300  vector < CRef<CMatchmRNA> >::iterator mrna_it =
301  m_OverlappingmRNAs.begin();
302  vector < string > product_list;
303  while (mrna_it != m_OverlappingmRNAs.end()) {
304  if (!(*mrna_it)->IsAccountedFor()) {
305  // count overlapping unassigned mRNAS
306  if ((*mrna_it)->GetFeat().IsSetProduct()) {
307  string label;
308  (*mrna_it)->GetFeat().GetProduct().GetLabel(&label);
309  product_list.push_back(label);
310  }
311  num++;
312  }
313  ++mrna_it;
314  }
315  if (product_list.size() > 1) {
316  stable_sort(product_list.begin(), product_list.end());
317  vector < string >::iterator s1 = product_list.begin();
318  vector < string >::iterator s2 = s1;
319  s2++;
320  while (s2 != product_list.end()) {
321  if (NStr::Equal(*s1, *s2)) {
322  loc_unique = false;
323  break;
324  }
325  ++s1;
326  ++s2;
327  }
328  }
329  return num;
330 }
331 
333 {
334  // nothing needed yet
335 }
336 
337 
339 {
340  // nothing needed yet
341 }
342 
343 
344 void CmRNAAndCDSIndex::SetBioseq(const std::vector<CMappedFeat> *feat_list, CScope* scope)
345 {
346  m_CdsList.clear();
347  m_mRNAList.clear();
348 
349  if (!feat_list) {
350  return;
351  }
352 
354  m_CdsList, m_mRNAList, *feat_list, scope, eNoteCDS_Yes);
355 }
356 
357 
359 {
360  vector < CRef<CMatchmRNA> >::iterator mrna_it = m_mRNAList.begin();
361  for (; mrna_it != m_mRNAList.end(); ++mrna_it) {
362  if (mrna.GetOriginalFeature().Equals((*mrna_it)->GetFeat())) {
363  return (*mrna_it);
364  }
365  }
366  return CRef<CMatchmRNA>();
367 }
368 
369 
370 bool CmRNAAndCDSIndex::MatchmRNAToCDSEnd(const CMappedFeat& mrna, unsigned int partial_type)
371 {
372  bool rval = false;
373 
374  CRef<CMatchmRNA> match_mrna = FindMatchmRNA(mrna);
375  if (match_mrna && match_mrna->MatchesUnderlyingCDS(partial_type)) {
376  rval = true;
377  }
378  return rval;
379 }
380 
381 //LCOV_EXCL_STOP
382 
383 
384 
385 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
#define bool
Definition: bool.h:34
CMappedFeat –.
Definition: mapped_feat.hpp:59
bool HasmRNA() const
vector< CRef< CMatchmRNA > > m_OverlappingmRNAs
void AddmRNA(CMatchmRNA &mrna)
CRef< CMatchmRNA > m_AssignedMrna
void SetXrefMatch(CMatchmRNA &mrna)
int GetNummRNA(bool &loc_unique)
void AssignSinglemRNA()
TSeqPos m_pos_start
const CSeq_feat & GetFeat() const
TSeqPos maxpos() const
TSeqPos minpos() const
CMatchFeat(const CMappedFeat &feat)
CConstRef< CSeq_feat > m_feat
TSeqPos m_pos_stop
vector< CRef< CMatchCDS > > m_UnderlyingCDSs
bool MatchesUnderlyingCDS(unsigned int partial_type) const
void SetAccountedFor(bool val)
void SetCDS(const CSeq_feat &cds)
bool IsAccountedFor() const
CConstRef< CSeq_feat > m_Cds
bool MatchAnyUnderlyingCDS(unsigned int partial_type) const
bool HasCDSMatch()
void AddCDS(CMatchCDS &cds)
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
int Compare(const CSeq_feat &f2) const
Compare relative order of this feature and feature f2, ordering first by features' coordinates,...
Definition: Seq_feat.hpp:242
vector< CRef< CMatchCDS > > m_CdsList
CRef< CMatchmRNA > FindMatchmRNA(const CMappedFeat &mrna)
bool MatchmRNAToCDSEnd(const CMappedFeat &mrna, unsigned int partial_type)
void SetBioseq(const std::vector< CMappedFeat > *feat_list, CScope *scope)
vector< CRef< CMatchmRNA > > m_mRNAList
Include a standard set of the NCBI C++ Toolkit most basic headers.
void s_SetUpXrefPairs(vector< CRef< CMatchCDS > > &cds_list, vector< CRef< CMatchmRNA > > &mrna_list, const TFeatList &feat_list, CScope *scope, ENoteCDS eNoteCDS)
ENoteCDS
@ eNoteCDS_No
@ eNoteCDS_Yes
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
EOverlapType
Int8 TestForOverlapEx(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, CScope *scope=0, TOverlapFlags flags=fOverlap_Default)
Updated version of TestForOverlap64().
@ eSeqlocPartial_Nostart
@ eSeqlocPartial_Nostop
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_CheckIntRev
1st is a subset of 2nd with matching boundaries
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static const char label[]
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
bool IsRna(void) const
Check if variant Rna is selected.
constexpr auto sort(_Init &&init)
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
bool s_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
Utility macros and typedefs for exploring NCBI objects from seqfeat.asn.
bool operator()(const CRef< _T > &l, const CRef< _T > &r) const
#define const
Definition: zconf.h:232
Modified on Thu Mar 28 17:13:27 2024 by modify_doxy.py rev. 669887