NCBI C++ ToolKit
overlaps.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: overlaps.cpp 92833 2021-02-17 20:26:48Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
32 #include <ncbi_pch.hpp>
33 #include "read_blast_result.hpp"
34 #include <strstream>
35 
36 // all things overlaps
37 // that i've done
38 
39 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
40  TSimpleSeqs& seqs, TSimpleSeqs& best_seq)
41 {
42  int nseq=0;
43  int from = ext_rna->exons[0].from;
44  int to = ext_rna->exons[ext_rna->exons.size()-1].to;
45  string ext_rna_range = printed_range(ext_rna);
46  for(;seq!=seqs.end(); seq++, nseq++)
47  {
48  int from2 = seq->exons[0].from;
49  int to2 = seq->exons[seq->exons.size()-1].to;
50  bool over_origin = seq->exons.size()>1 && to2-from2 > m_length/2;
51  string ext_rna_range2 = printed_range(seq);
52  if(PrintDetails()) NcbiCerr << "find_overlap"
53  << "[" << ext_rna_range << "]"
54  << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
55 
56  if(to2>=from || over_origin)
57  {
58  if(!over_origin) { if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl; }
59  else { if(PrintDetails()) NcbiCerr << "over_origin" << NcbiEndl; }
60  if(from2<=to || over_origin)
61  {
62  if(!over_origin) { if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;}
63  else { if(PrintDetails()) NcbiCerr << "over_origin 2" << NcbiEndl;}
64  TSimpleSeqs::iterator seq2 = seq;
65  for(;seq2!=seqs.end(); seq2++)
66  {
67  int from2 = seq2->exons[0].from;
68  // int to2 = seq2->exons[seq->exons.size()-1].to;
69  string ext_rna_range2 = printed_range(seq2);
70  if(PrintDetails()) NcbiCerr << "\tfind_overlap"
71  << "[" << ext_rna_range << "]"
72  << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
73  if(from2>to) break;// last seq
74  int this_overlap;
75  overlaps(ext_rna, seq2, this_overlap);
76  if(PrintDetails()) NcbiCerr << "\tfind_overlap"
77  << "[" << ext_rna_range << "]"
78  << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
79  if(this_overlap>0)
80  {
81 // best_seq.push_back(*seq); // this is serious.
82  best_seq.push_back(*seq2);
83  }
84  }
85  }
86  break;
87  }
88  }
89  return nseq;
90 }
91 
92 int CReadBlastApp::find_overlap(TSimpleSeqs::iterator& seq, const TSimpleSeqs::iterator& ext_rna,
93  TSimpleSeqs& seqs, int& overlap)
94 {
95  int nseq=0;
96  int from = ext_rna->exons[0].from;
97  int to = ext_rna->exons[ext_rna->exons.size()-1].to;
98  CNcbiStrstream ext_rna_range_stream; ext_rna_range_stream << from << "..." << to << '\0';
99  string ext_rna_range = ext_rna_range_stream.str();
100  TSimpleSeqs::iterator& best_seq = seq;
101  for(;seq!=seqs.end(); seq++, nseq++)
102  {
103  int from2 = seq->exons[0].from;
104  int to2 = seq->exons[seq->exons.size()-1].to;
105  CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
106  string ext_rna_range2 = ext_rna_range_stream2.str();
107  if(PrintDetails()) NcbiCerr << "find_overlap"
108  << "[" << ext_rna_range << "]"
109  << "[" << ext_rna_range2 << "]" << ", trying..." << NcbiEndl;
110 
111  if(to2>=from)
112  {
113  if(PrintDetails()) NcbiCerr << "to2>=from" << NcbiEndl;
114  if(from2<=to)
115  {
116  if(PrintDetails()) NcbiCerr << "from2<=to" << NcbiEndl;
117  TSimpleSeqs::iterator seq2 = seq;
118  for(;seq2!=seqs.end(); seq2++)
119  {
120  int from2 = seq2->exons[0].from;
121  int to2 = seq2->exons[seq->exons.size()-1].to;
122  CNcbiStrstream ext_rna_range_stream2; ext_rna_range_stream2 << from2 << "..." << to2 << '\0';
123  string ext_rna_range2 = ext_rna_range_stream2.str();
124  if(PrintDetails()) NcbiCerr << "\tfind_overlap"
125  << "[" << ext_rna_range << "]"
126  << "[" << ext_rna_range2 << "]" << ", trying 2..." << NcbiEndl;
127  if(from2>to) break;// last seq
128  int this_overlap;
129  overlaps(ext_rna, seq2, this_overlap);
130  if(PrintDetails()) NcbiCerr << "\tfind_overlap"
131  << "[" << ext_rna_range << "]"
132  << "[" << ext_rna_range2 << "]" << ", overlap = " << this_overlap << NcbiEndl;
133  if(this_overlap>overlap)
134  {
135  overlap=this_overlap;
136  best_seq = seq; // this one
137  }
138  }
139  }
140  break;
141  }
142  }
143  return nseq;
144 }
145 
146 int CReadBlastApp::overlaps(const TSimpleSeqs::iterator& seq1, const TSimpleSeqs::iterator& seq2, int& overlap)
147 {
148  overlap = 0;
149  for(TSimplePairs::const_iterator e1=seq1->exons.begin(); e1!=seq1->exons.end(); e1++)
150  {
151  for(TSimplePairs::const_iterator e2=seq2->exons.begin(); e2!=seq2->exons.end(); e2++)
152  {
153  int o = min(e2->to, e1->to)-max(e1->from, e2->from)+1;
154  if(o>0) overlap+=o;
155  }
156  }
157  return overlap;
158 }
159 
// overlaps_prot_na, annotation-list overload (the name is taken from the log
// strings below; listing line 160 with the function name is absent here).
// Runs the (seq, feats) overload of overlaps_prot_na on every annotation in
// `annots` whose data is a feature table and returns true if any of those
// calls returned true. Annotations that are not feature tables are skipped.
// NOTE(review): listing lines 168 and 177 are also absent from this listing.
161  (
162  CBioseq& seq,
163  const CBioseq::TAnnot& annots
164  )
165 {
166  bool result = false;
167  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] starts" << NcbiEndl;
169  ITERATE(CBioseq::TAnnot, gen_feature, annots)
170  {
171  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
172  bool lres;
173 
// The call is evaluated before the || so that every feature table is
// processed even after `result` has already become true.
174  lres = overlaps_prot_na(seq, (*gen_feature)->GetData().GetFtable());
175  result = lres || result;
176  }
178  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[annots] ends" << NcbiEndl;
179  return result;
180 }
// overlaps_na, annotation-list overload (the name is taken from the log
// strings below; listing line 181 with the function name is absent here).
// Runs the feature-table overload of overlaps_na on every annotation in
// `annots` whose data is a feature table and returns true if any of those
// calls returned true. Annotations that are not feature tables are skipped.
// NOTE(review): listing lines 188 and 198 are also absent from this listing.
182  (
183  const CBioseq::TAnnot& annots
184  )
185 {
186  bool result = false;
187  if(PrintDetails()) NcbiCerr << "overlaps_na[annots] starts" << NcbiEndl;
189  ITERATE(CBioseq::TAnnot, gen_feature, annots)
190  {
191  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
192  bool lres;
193 
// The call is evaluated before the || so that every feature table is
// processed even after `result` has already become true.
194  lres = overlaps_na((*gen_feature)->GetData().GetFtable());
195  result = lres || result;
196 
197  }
199  if(PrintDetails()) NcbiCerr << "overlaps_na[annots] ends" << NcbiEndl;
200  return result;
201 }
202 
// overlaps_prot_na, (seq, feats) overload (the name is taken from the log
// strings below; listing line 203 with the function name is absent here).
// Compares the genomic location of the protein `seq` with the location of
// each RNA feature visited by the loop body below. When they overlap by at
// least m_rna_overlapThreshold it records an eRnaOverlap problem under the
// RNA's locus tag in m_diag, decides from the RNA feature type and the
// sequence type which side (if any) is marked for removal, records an
// eRemoveOverlap problem for that side and calls append_misc_feature.
// Returns true if overlaps() reported an overlap for any visited RNA feature.
// NOTE(review): listing lines 210, 227, 356 and 368 are absent. Line 227 is
// presumably the loop header that declares `f1`, and line 356 presumably
// declares `seqs` (the catch message mentions get_parent_seqset) - confirm
// against the real source.
204  (
205  CBioseq& seq,
206  const CSeq_annot::C_Data::TFtable& feats
207  )
208 {
209  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] starts" << NcbiEndl;
211  bool result = false;
212  LocMap loc_map;
213  GetLocMap(loc_map, feats); // gene features for each location
214  EMyFeatureType seq_type = get_my_seq_type(seq);
215  string n2 = GetStringDescr (seq);
216  string name2 = GetProtName(seq);
217 /*
218  bool hasLoc = hasGenomicInterval(seq);
219  const CSeq_interval& seq_interval = getGenomicInterval(seq);
220 */
221  // bool hasLoc = hasGenomicLocation(seq);
222  const CSeq_loc& seq_interval = getGenomicLocation(seq);
223  TSeqPos from2, to2;
224  ENa_strand strand2;
225  getFromTo(seq_interval, from2, to2, strand2);
226 
228  {
229  int overlap;
// Only RNA features are compared with the protein location.
230  if( !(*f1)->GetData().IsRna() ) continue;
231 // bool lres=overlaps_prot_na(n2, seq_interval, **f1, overlap);
232  bool lres=overlaps(seq_interval, (*f1)->GetLocation(), overlap);
233  // CSeqFeatData::E_Choice esite = (*f1)->GetData().Which();
234  if(lres && overlap >= m_rna_overlapThreshold)
235  {
236  string n1 = GetLocusTag(**f1, loc_map);
237  string trna_type = get_trna_string(**f1);
// Prefer the tRNA string as the RNA name; fall back to GetRNAname when it is empty.
238  string name1;
239  if(trna_type.size()>0) name1 = trna_type;
240  else name1 = GetRNAname(**f1);
241  EMyFeatureType rna_feat_type = get_my_feat_type(**f1, loc_map);
242 // check overlaps with protein
243  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: FOUND OVERLAP" << NcbiEndl;
244  TSeqPos from1, to1;
245  ENa_strand strand1;
246  getFromTo((*f1)->GetLocation(), from1, to1, strand1);
247  int min1, min2, max1, max2;
248  min1 = min(from1, to1);
249  min2 = min(from2, to2);
250  max1 = max(from1, to1);
251  max2 = max(from2, to2);
252  // int mint; int maxt;
253  // mint = min(min1, min2);
254  // maxt = max(max1, max2);
255 
// NOTE(review): `report` is allocated with new and no matching delete appears
// in the visible lines of this block (the seq-level overlaps() in this file
// deletes its report after printing) - looks like a leak per reported overlap.
256  distanceReportStr *report = new distanceReportStr;
257  int left_frame = (from1-1)%3+1;
258  int right_frame = (from2-1)%3+1;
259 
// In the report the "left" side is the RNA feature and the "right" side is the protein.
260  report->left_strand = strand1;
261  report->right_strand = strand2;
262  report->q_loc_left_from = from1;
263  report->q_loc_right_from = from2;
264  report->q_loc_left_to = to1;
265  report->q_loc_right_to = to2;
266  report->q_id_left = n1;
267  report->q_id_right = n2;
268  report->q_name_left = name1;
269  report->q_name_right = name2;
270  report->space = overlap; // not used
271  report->left_frame = left_frame;
272  report->right_frame = right_frame;
273  report->loc1 = &((*f1)->GetLocation());
274  report->loc2 = &seq_interval;
275 
// The report text is rendered into a fixed, zero-filled 20480-byte buffer.
276  char bufferchar[20480]; memset(bufferchar, 0, 20480);
277  strstream buffer(bufferchar, 20480);
278  printOverlapReport(report, buffer);
279 
280  CNcbiStrstream buff_misc_feat_rna;
281  buff_misc_feat_rna
282  << "potential RNA location ("
283  << name1 << ") that overlaps protein (" << get_title(seq) << ")" << '\0';
284 
285  CNcbiStrstream buff_misc_feat_protein;
286  buff_misc_feat_protein
287  << "potential protein location ("
288  << get_title(seq) << ") that overlaps RNA (" << name1 << ")" << '\0';
289 
290 
291  CNcbiStrstream misc_feat_rna;
292  misc_feat_rna << buff_misc_feat_rna.str() << '\0';
293  CNcbiStrstream misc_feat_protein;
294  misc_feat_protein << buff_misc_feat_protein.str() << '\0';
295 
296  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created RNA buffer: " << buff_misc_feat_rna.str() << "\n";
297  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: created protein buffer: " << buff_misc_feat_protein.str() << "\n";
// The overlap itself is always recorded under the RNA's locus tag.
298  problemStr problem = {eRnaOverlap, buffer.str(), "", "", "", -1, -1, eNa_strand_unknown };
299  m_diag[n1].problems.push_back(problem);
300  bool removeit=false;
301  string removen = "";
302 // problemStr problemCOH = {eRemoveOverlap, "", misc_feat.str(), "", "", mint, maxt, eNa_strand_unknown };
// Removal policy as coded below:
//  - pseudo or atypical tRNA vs. a CDS that is not hypothetical: remove the RNA (n1);
//  - normal or atypical tRNA vs. a hypothetical CDS, or any rRNA: remove the protein (n2);
//  - anything else: nothing is removed.
303  if (rna_feat_type == eMyFeatureType_pseudo_tRNA && seq_type != eMyFeatureType_hypo_CDS)
304  {
305  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
306  << n1 << " marked for deletion (pseudo)" << "\n";
307  removen = n1;
308  removeit=true;
309  }
310  else if (rna_feat_type == eMyFeatureType_atypical_tRNA && seq_type != eMyFeatureType_hypo_CDS)
311  {
312  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: RNA location "
313  << n1 << " marked for deletion (atypical)" << "\n";
314  removen = n1;
315  removeit=true;
316  }
317  else if
318  (
319  (
320  (rna_feat_type == eMyFeatureType_normal_tRNA ||
321  rna_feat_type == eMyFeatureType_atypical_tRNA
322  )
323  && seq_type == eMyFeatureType_hypo_CDS
324  ) ||
325  rna_feat_type == eMyFeatureType_rRNA
326  )
327  {
328  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: CDS and gene "
329  << n2 << " marked for deletion (hypothetical)" << "\n";
330  removen = n2;
331  removeit=true;
332  }
333  else
334  {
335  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: no deletion\n";
336  removeit=false;
337  }
338  if(removeit)
339  {
// The removal record carries the coordinates and strand of whichever side is removed.
340  if(removen == n1) // RNA location is deleted
341  {
342  problemStr problemCOH = {eRemoveOverlap, "", misc_feat_rna.str(), "", "", min1, max1, strand1};
343  m_diag[removen].problems.push_back(problemCOH);
344  }
345  else // protein location is deleted
346  {
347  problemStr problemCOH = {eRemoveOverlap, "", misc_feat_protein.str(), "", "", min2, max2, strand2};
348  m_diag[removen].problems.push_back(problemCOH);
349  }
350  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats]: sequence "
351  << "[" << removen << "]"
352  << " is marked for removal"
353  << NcbiEndl;
// Best effort: any exception thrown in the try body is reported and swallowed.
354  try
355  {
357  if(seqs!=NULL) append_misc_feature(*seqs, removen, eRemoveOverlap);
358  }
359  catch(...)
360  {
361  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for " << removen << NcbiEndl;
362  }
363  }
364  }
// NOTE(review): by brace matching this statement sits outside the threshold
// `if` above, so `result` becomes true for any overlap reported by
// overlaps(), including ones below m_rna_overlapThreshold; the trailing
// comment on the next brace appears to label the wrong block.
365  result = lres || result;
366  } // if(lres && overlap >= m_rna_overlapThreshold)
367 
369  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[seq,feats] ends" << NcbiEndl;
370  return result;
371 }
372 
// overlaps_na, feature-table overload (the name is taken from the log strings
// below; listing line 373 with the function name is absent here).
// For every tRNA or rRNA feature visited by the loop body below, derives a
// type string (Get3type for tRNA, GetRRNAtype for rRNA) and passes the
// feature and that string to match_na. Features of other kinds, tRNAs
// without an Ext, and features with an empty type string are skipped.
// NOTE(review): listing lines 379, 383 and 409 are absent; line 383 is
// presumably the loop header that declares `f1` - confirm against the real
// source.
374  (
375  const CSeq_annot::C_Data::TFtable& feats
376  )
377 {
378  if(PrintDetails()) NcbiCerr << "overlaps_na[feats] starts" << NcbiEndl;
380  bool result = false;
// NOTE(review): loc_map is filled here but not read in any visible line of this block.
381  LocMap loc_map;
382  GetLocMap(loc_map, feats);
384  {
385  if(PrintDetails()) NcbiCerr << "overlaps_na[feats] cycling through rna_feats" << NcbiEndl;
386 // check overlaps with externally defined tRNAs
387  if ( !(*f1)->GetData().IsRna() ) continue;
388  CRNA_ref::EType rna_type = (*f1)->GetData().GetRna().GetType();
389  if(rna_type != CRNA_ref::eType_tRNA && rna_type != CRNA_ref::eType_rRNA ) continue;
390  string type1;
391  if ( rna_type == CRNA_ref::eType_tRNA )
392  {
393  if ( !(*f1)->GetData().GetRna().CanGetExt() ) continue;
// A failure in Get3type is logged and then rethrown to the caller.
394  try { type1 = Get3type((*f1)->GetData().GetRna());}
395  catch (...)
396  {
397  NcbiCerr << "overlaps_na[feats]: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
398  throw;
399  }
400  }
401  else
402  {
403  type1 = GetRRNAtype((*f1)->GetData().GetRna());
404  }
405  if(type1.size()==0) continue;
// The value returned by match_na is not used.
406  match_na(**f1, type1);
407  }
// NOTE(review): `result` is set to true unconditionally, so this function
// returns true whenever it returns normally, regardless of what was matched.
408  result = true;
410  if(PrintDetails()) NcbiCerr << "overlaps_na[feats] ends" << NcbiEndl;
411  return result;
412 }
413 
// overlaps_prot_na, (n1, i1, f2) overload (the name is taken from the log
// strings below; listing line 414 with the function name is absent here).
// Tests interval `i1` against the location of feature `f2` via overlaps().
//   n1      - label of the first item; used only in the log output.
//   i1      - interval to test.
//   f2      - feature whose location is tested.
//   overlap - out; reset to 0 here and then passed to overlaps().
// Returns the result of overlaps().
415  (
416  const string& n1,
417  const CSeq_interval& i1,
418  const CSeq_feat& f2,
419  int& overlap
420  )
421 {
422  bool result = false;
423  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] starts" << NcbiEndl;
424  overlap=0;
// n2 is only a log label; it stays "not gene" unless f2 is a gene feature.
425  string n2="not gene";
426  if(f2.GetData().IsGene()) f2.GetData().GetGene().GetLabel(&n2);
427  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2]: input: "
428  << n1
429  << ","
430  << n2
431  << NcbiEndl;
432  result = overlaps(i1, f2.GetLocation(), overlap);
433 
434  if(PrintDetails()) NcbiCerr << "overlaps_prot_na[n1,i1,f2] ends" << NcbiEndl;
435  return result;
436 
437 }
438 
439 
// overlaps_na, (f1, f2) overload (the name is taken from the log strings
// below; listing line 440 with the function name is absent here).
// Tests the locations of two features against each other via overlaps().
// Features whose gene labels are equal are treated as the same gene and
// reported as not overlapping (returns false with overlap left at 0).
//   overlap - out; reset to 0 here and then passed to overlaps().
// Returns the result of overlaps(), or false when the labels are equal.
441  (
442  const CSeq_feat& f1,
443  const CSeq_feat& f2,
444  int& overlap
445  )
446 {
447  bool result = false;
448  if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] starts" << NcbiEndl;
449  overlap=0;
450 
// NOTE(review): GetGene() is called on both features without an IsGene()
// check, unlike the (n1, i1, f2) overload of overlaps_prot_na in this file -
// confirm that callers only pass gene features.
451  string n1; f1.GetData().GetGene().GetLabel(&n1);
452  string n2; f2.GetData().GetGene().GetLabel(&n2);
453  if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2]: input: "
454  << n1
455  << ","
456  << n2
457  << NcbiEndl;
// This early return skips the "ends" log line below.
458  if(n1==n2) return result;
459 
460  result = overlaps(f1.GetLocation(), f2.GetLocation(), overlap);
461 
462  if(PrintDetails()) NcbiCerr << "overlaps_na[f1,f2] ends" << NcbiEndl;
463  return result;
464 
465 }
// Generic location-vs-location overlap test (listing line 467, which names
// the function, is absent here; the recursive-looking calls elsewhere in
// this file spell it overlaps(l1, l2, overlap)).
// Visits every CSeq_interval reachable from l1 and from l2, and for each
// pair adds the length of their intersection (inclusive coordinates) to
// `overlap`. Strand values are read but not used in the comparison.
//   l1, l2  - objects traversed with CTypeConstIterator<CSeq_interval>.
//   overlap - out; reset to 0, then receives the summed intersection length.
// Returns true if at least one pair of intervals intersects.
466 template <typename t1, typename t2> bool
468  (
469 /*
470  const CSeq_loc& l1,
471  const CSeq_loc& l2,
472 */
473  const t1& l1,
474  const t2& l2,
475  int& overlap
476  )
477 {
478  bool result = false;
479  overlap=0;
480  for (CTypeConstIterator<CSeq_interval> i1 = ncbi::ConstBegin(l1); i1; ++i1)
481  {
482  TSeqPos from1, to1, from2, to2;
483  ENa_strand strand1, strand2;
484  int min1, min2, max1, max2;
485  getFromTo( *i1, from1, to1, strand1);
// min/max normalise the interval so that from > to is handled as well.
486  min1 = min(from1, to1);
487  max1 = max(from1, to1);
488  for (CTypeConstIterator<CSeq_interval> i2 = ncbi::ConstBegin(l2); i2; ++i2)
489  {
490  getFromTo( *i2, from2, to2, strand2);
491  min2 = min(from2, to2);
492  max2 = max(from2, to2);
493  int overlap_start, overlap_end;
494  overlap_end = min(max1, max2);
495  overlap_start = max(min1, min2);
496 
// Disjoint pairs are skipped so they never contribute a non-positive length.
497  bool result2 = overlap_end >= overlap_start;
498  if(!result2) continue;
499  overlap+=overlap_end - overlap_start + 1;
500  result=true;
501  }
502  }
503  return result;
504 }
505 
507  (
508  const CSeq_loc& l1,
509  int from2, int to2,
510  int& overlap
511  )
512 {
513  bool result = false;
514  overlap=0;
515  TSeqPos from1, to1;
516  int min1, min2, max1, max2;
517  min2 = min(from2, to2);
518  max2 = max(from2, to2);
519  for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
520  {
521  ENa_strand strand1;
522  getFromTo(*i1, from1, to1, strand1);
523  min1 = min(from1, to1);
524  max1 = max(from1, to1);
525  int overlap_start, overlap_end;
526  overlap_end = min(max1, max2);
527  overlap_start = max(min1, min2);
528 
529  bool result2 = overlap_end >= overlap_start;
530  if(result2) result=true; // found one segment that overlaps with the second input segment
531  overlap+=overlap_end - overlap_start + 1;
532  }
533  return result;
534 }
535 
536 bool CReadBlastApp::complete_overlap // l1 is covered by l2
537  (
538  const CSeq_loc& l1,
539  const CSeq_loc& l2
540  )
541 {
542  bool result = false;
543  for (CTypeConstIterator<CSeq_interval> i1 = ::ConstBegin(l1); i1; ++i1)
544  {
545  TSeqPos from1, to1, from2, to2;
546  ENa_strand strand1, strand2;
547  int min1, min2, max1, max2;
548  getFromTo( *i1, from1, to1, strand1);
549  min1 = min(from1, to1);
550  max1 = max(from1, to1);
551  result = false; // assume this piece
552  for (CTypeConstIterator<CSeq_interval> i2 = ::ConstBegin(l2); i2; ++i2)
553  {
554  getFromTo( *i2, from2, to2, strand2);
555 // note that this does not take into account weird cases when two pieces of l2 are stuck together without any gap, covering i1 in combination
556  min2 = min(from2, to2);
557  max2 = max(from2, to2);
558  if(min2<=min1 && max2>=max1) // found completely covering piece
559  {
560  if(PrintDetails()) NcbiCerr << "complete_overlap: "
561  << from1 << " ... " << to1 << " "
562  << from2 << " ... " << to2 << " "
563  << NcbiEndl;
564  result=true;
565  break;
566  }
567  }
568  if(!result) return result;
569  }
570  return result;
571 }
572 
// overlaps, protein-vs-protein overload (listing line 573, which names the
// function, is absent here; the warning messages below identify it as
// CReadBlastApp::overlaps).
// Compares the stored genomic locations of two protein sequences. Returns
// false immediately when either sequence is not an amino-acid sequence or
// has no genomic location. Otherwise it returns whether the two locations
// overlap at all. When the overlap reaches m_cds_overlapThreshold it also
// records problems in m_diag under both sequences: eOverlap for a partial
// overlap, eCompleteOverlap when one location fully covers the other. For a
// complete overlap, a side whose protein name contains "hypothetical" and
// which is the covered one additionally gets an eRemoveOverlap problem and
// an append_misc_feature call.
// NOTE(review): listing lines 706 and 727 are absent; they presumably
// declare `seqs` (the catch messages mention get_parent_seqset) - confirm
// against the real source.
574 // genomic interval is already stored from the NA_annotations
575  (
576  const CBioseq& left,
577  const CBioseq& right
578  )
579 {
580  bool result=false;
581  // check if prot
582  // string qname = CSeq_id::GetStringDescr (left, CSeq_id::eFormat_FastA);
583  string qname = GetStringDescr (left);
584  string qrname = GetStringDescr (right);
585  if(PrintDetails()) NcbiCerr << "overlaps, seq level " << NcbiEndl;
586  if(PrintDetails()) NcbiCerr << "left " << GetStringDescr (left) << NcbiEndl;
587  if(PrintDetails()) NcbiCerr << "right " << GetStringDescr (right) << NcbiEndl;
588 // assuming that protein sequences come in one piece of seq-set
589  if(left.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
590  if(PrintDetails()) NcbiCerr << "left is aa" << NcbiEndl;
591  if(right.GetInst().GetMol()!=CSeq_inst::eMol_aa) return result;
592  if(PrintDetails()) NcbiCerr << "right is aa" << NcbiEndl;
593 /*
594  if(!hasGenomicInterval(left)) return result;
595  if(!hasGenomicInterval(right)) return result;
596  const CSeq_interval& left_genomic_int = getGenomicInterval(left);
597  const CSeq_interval& right_genomic_int = getGenomicInterval(right);
598 */
599  if(!hasGenomicLocation(left)) return result;
600  if(!hasGenomicLocation(right)) return result;
601  const CSeq_loc& left_genomic_int = getGenomicLocation(left);
602  const CSeq_loc& right_genomic_int = getGenomicLocation(right);
603 
604  if(PrintDetails()) NcbiCerr << "Got intervals" << NcbiEndl;
605  TSeqPos from1, to1, from2, to2;
606  ENa_strand left_strand;
607  ENa_strand right_strand;
608  getFromTo(left_genomic_int, from1, to1, left_strand);
609  getFromTo(right_genomic_int, from2, to2, right_strand);
610 
611  if(PrintDetails()) NcbiCerr << "Got strands" << NcbiEndl;
// Frames are only computed for single-interval locations; otherwise they
// keep the placeholder value -0xFF. A strand other than plus negates the frame.
612  int left_frame=-0xFF, right_frame=-0xFF;
613  if(left_genomic_int.IsInt())
614  {
615  left_frame = (from1-1)%3+1;
616  }
617  if(right_genomic_int.IsInt())
618  {
619  right_frame = (from2-1)%3+1;
620  }
621 
622  if(left_strand != eNa_strand_plus && left_genomic_int.IsInt()) left_frame=-left_frame;
623  if(right_strand != eNa_strand_plus && right_genomic_int.IsInt()) right_frame=-right_frame;
624 /*
625  Tue 10/7/2008 9:25 AM, Bill Klimke + his consultation w/ Leigh Riley
626  opposite strands overlaps should be treated exactly the same way
627 */
628  // if(left_strand != right_strand) return result;
629  if(PrintDetails()) NcbiCerr << "Matching strands" << NcbiEndl;
// `space` is the intersection of the two overall from..to ranges divided by 3.
// It is only logged and stored in the report; the threshold test below uses
// scratch_overlap instead.
630  int space =
631  (min((int)to1, (int)to2)-
632  max((int)from2, (int)from1)
633  +1
634  )/3;
635 
636  bool complete_overlaps = false;
637  int scratch_overlap;
638  result = overlaps(left_genomic_int, right_genomic_int, scratch_overlap);
639  bool left_covered_by_right=false;
640  bool right_covered_by_left=false;
// NOTE(review): because of the short-circuit ||, right_covered_by_left is
// never evaluated (and stays false) when left_covered_by_right is true. For
// two locations that cover each other, only the left-hand removal branch
// below can therefore fire - confirm this asymmetry is intended.
641  if(result) complete_overlaps = (left_covered_by_right=complete_overlap(left_genomic_int, right_genomic_int))
642  || (right_covered_by_left=complete_overlap(right_genomic_int, left_genomic_int));
643  if(PrintDetails()) NcbiCerr << "space = " << space
644  << ", complete_overlap = " << complete_overlaps
645  << ", result = " << result
646  << NcbiEndl;
647  if(result && scratch_overlap >= m_cds_overlapThreshold) // overlap
648  {
// `report` is heap-allocated and released by the delete at the end of this if-block.
649  distanceReportStr *report = new distanceReportStr;
650  report->left_strand = left_strand;
651  report->right_strand = right_strand;
652  report->q_loc_left_from = from1;
653  report->q_loc_right_from = from2;
654  report->q_loc_left_to = to1;
655  report->q_loc_right_to = to2;
656  report->q_id_left = GetStringDescr (left);
657  report->q_id_right = GetStringDescr (right);
658  report->q_name_left = GetProtName(left);
659  report->q_name_right = GetProtName(right);
660  report->space = space;
661  report->left_frame = left_frame;
662  report->right_frame = right_frame;
663 
// The report text is rendered into a fixed, zero-filled 20480-byte buffer.
664  char bufferchar[20480]; memset(bufferchar, 0, 20480);
665  strstream buffer(bufferchar, 20480);
666  printOverlapReport(report, buffer);
667 /*
668  CNcbiStrstream misc_feat;
669  misc_feat << "potential protein locations )" << GetProtName(left)
670  << ") and " << printed_range(right)
671  << " overlap by " << scratch_overlap
672  << "bp"
673  << NcbiEndl << '\0';
674 */
675  CNcbiStrstream misc_feat_left;
676  misc_feat_left
677  << "potential protein location (" << GetProtName(left)
678  << ") that overlaps protein (" << GetProtName(right) << ")" << NcbiEndl << '\0';
679 
680  CNcbiStrstream misc_feat_right;
681  misc_feat_right
682  << "potential protein location (" << GetProtName(right)
683  << ") that overlaps protein (" << GetProtName(left) << ")" << NcbiEndl << '\0';
684 
685  // problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
686  // problemStr problemO = {eOverlap, buffer.str(), misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
687  // problemStr problem;
688  // problemStr problemCOH = {eRemoveOverlap , "", misc_feat.str(), "", "", -1, -1, eNa_strand_unknown };
689 
690 
691  if(complete_overlaps)
692  {
693  // problem = problemCO;
// Left side: marked for removal only when its name contains "hypothetical"
// and it is the covered one.
694  if(report->q_name_left.find("hypothetical")!=string::npos && left_covered_by_right && !right_covered_by_left)
695  {
696  NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
697  << "[" << qname << "]"
698  << " is marked for removal because of a complete overlap"
699  << NcbiEndl;
700  problemStr problemCOH = {eRemoveOverlap , "", misc_feat_left.str(), "", "", (int)from1, (int)to1, left_strand};
701  problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
702  m_diag[qname].problems.push_back(problemCOH);
703  m_diag[qname].problems.push_back(problemCO);
// Best effort: any exception thrown in the try body is reported and swallowed.
704  try
705  {
707  if(seqs!=NULL) append_misc_feature(*seqs, qname, eRemoveOverlap);
708  }
709  catch(...)
710  {
711  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
712  << qname << NcbiEndl;
713  }
714  }
// Right side: same rule, mirrored.
715  if(report->q_name_right.find("hypothetical")!=string::npos && right_covered_by_left)
716  {
717  NcbiCerr << "CReadBlastApp::overlaps: WARNING: sequence of a hypothetical protein "
718  << "[" << qrname << "]"
719  << " is marked for removal because of a complete overlap"
720  << NcbiEndl;
721  problemStr problemCOH = {eRemoveOverlap , "", misc_feat_right.str(), "", "", (int)from2, (int)to2, right_strand};
722  problemStr problemCO = {eCompleteOverlap, buffer.str(), misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
723  m_diag[qrname].problems.push_back(problemCOH);
724  m_diag[qrname].problems.push_back(problemCO);
725  try
726  {
728  if(seqs!=NULL) append_misc_feature(*seqs, qrname, eRemoveOverlap);
729  }
730  catch(...)
731  {
732  NcbiCerr << "overlaps_prot_na[seq,feats]: WARNING: get_parent_seqset threw when trying to append misc_feature for "
733  << qrname << NcbiEndl;
734  }
735  }
// NOTE(review): this block always adds an eCompleteOverlap entry for both
// sides, so a side that went through a removal branch above ends up with two
// eCompleteOverlap entries - confirm the duplication is intended.
736  {
737  problemStr problemO_l = {eCompleteOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
738  problemStr problemO_r = {eCompleteOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
739  m_diag[qname].problems.push_back(problemO_l);
740  m_diag[qrname].problems.push_back(problemO_r);
741  }
742  }
743  else
744  {
// Partial overlap: only the left entry carries the rendered report text.
745  problemStr problemO_l = {eOverlap, buffer.str(), misc_feat_left.str(), "", "", -1, -1, eNa_strand_unknown };
746  problemStr problemO_r = {eOverlap, "", misc_feat_right.str(), "", "", -1, -1, eNa_strand_unknown };
747  m_diag[qname].problems.push_back(problemO_l);
748  m_diag[qrname].problems.push_back(problemO_r);
749  }
750  delete report;
751  }
752  return result;
753 }
754 
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static string GetProtName(const CBioseq &seq)
Definition: shortcuts.cpp:82
CConstBeginInfo ConstBegin(void)
void GetLocMap(LocMap &loc_map, const CSeq_annot::C_Data::TFtable &feats)
Definition: locations.cpp:203
static void IncreaseVerbosity(void)
bool overlaps_na(const CBioseq::TAnnot &annots)
Definition: overlaps.cpp:182
static void printOverlapReport(distanceReportStr *report, ostream &out=NcbiCout)
int find_overlap(TSimpleSeqs::iterator &seq, const TSimpleSeqs::iterator &ext_rna, TSimpleSeqs &seqs, int &overlap)
Definition: overlaps.cpp:92
bool complete_overlap(const CSeq_loc &l1, const CSeq_loc &l2)
Definition: overlaps.cpp:537
static int m_rna_overlapThreshold
static void DecreaseVerbosity(void)
int overlaps(const TSimpleSeqs::iterator &seq1, const TSimpleSeqs::iterator &seq2, int &overlap)
Definition: overlaps.cpp:146
static const CSeq_loc & getGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:120
bool overlaps_prot_na(CBioseq &seq, const CBioseq::TAnnot &annots)
Definition: overlaps.cpp:161
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
bool match_na(const CSeq_feat &f1, const string &type1)
Definition: match.cpp:38
void append_misc_feature(CBioseq_set::TSeq_set &seqs, const string &name, EProblem problem_type)
Definition: problems.cpp:921
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
@ eOverlap
CSeq_locs overlap.
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCerr
Definition: ncbistre.hpp:544
EType
type of RNA feature
Definition: RNA_ref_.hpp:95
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TGene & GetGene(void) const
Get the variant data.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
list< CRef< CSeq_entry > > TSeq_set
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
T max(T x_, T y_)
T min(T x_, T y_)
static uint8_t * buffer
Definition: pcre2test.c:1016
EMyFeatureType
@ eMyFeatureType_normal_tRNA
@ eMyFeatureType_atypical_tRNA
@ eMyFeatureType_pseudo_tRNA
@ eMyFeatureType_rRNA
@ eMyFeatureType_hypo_CDS
@ eRemoveOverlap
@ eRnaOverlap
@ eCompleteOverlap
string get_trna_string(const CSeq_feat &feat)
Definition: shortcuts.cpp:197
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
string Get3type(const CRNA_ref &rna)
Definition: shortcuts.cpp:115
EMyFeatureType get_my_feat_type(const CSeq_feat &feat, const LocMap &loc_map)
Definition: shortcuts.cpp:246
string get_title(const CBioseq &seq)
Definition: shortcuts.cpp:159
EMyFeatureType get_my_seq_type(const CBioseq &seq)
Definition: shortcuts.cpp:176
string GetLocusTag(const CSeq_feat &f, const LocMap &loc_map)
CBioseq_set::TSeq_set * get_parent_seqset(const CBioseq &seq)
string GetRNAname(const CSeq_feat &feat)
Definition: shortcuts.cpp:229
list< TSimpleSeq > TSimpleSeqs
string GetRRNAtype(const CRNA_ref &rna)
Definition: shortcuts.cpp:101
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
CRef< const CSeq_loc > loc1
CRef< const CSeq_loc > loc2
else result
Definition: token2.c:20
Modified on Wed Sep 04 15:02:41 2024 by modify_doxy.py rev. 669887