NCBI C++ ToolKit
missing.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: missing.cpp 91324 2020-10-09 14:48:11Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 *
30 * ===========================================================================
31 */
#include <ncbi_pch.hpp>
#include <stdexcept>
#include "read_blast_result.hpp"
34 
35 // internal functions
36 
// Forward declaration of the file-local helper defined further down in this file.
// It converts a feature span (range_scale) into the distance used when looking for
// neighboring annotations.
37 static int get_max_distance(const int range_scale);
38 
// CheckMissingRibosomalRNA, annotation-list overload.
// NOTE(review): the line carrying the return type and the function name is absent from
// this listing; the reference index at the end of the listing gives the signature as
// `bool CheckMissingRibosomalRNA(const CBioseq::TAnnot &annots)` -- confirm against the
// real file.
//
// Calls the feature-table overload for every annotation in `annots` that holds a feature
// table and returns true when at least one of those calls returned true.
40  (
41  const CBioseq::TAnnot& annots
42  )
43 {
44  bool result = false;
45  if(PrintDetails()) NcbiCerr << "CheckMissingRibosomalRNA[annots] starts" << NcbiEndl;
47  ITERATE(CBioseq::TAnnot, gen_feature, annots)
48  {
// Annotations whose data is not a feature table are skipped.
49  if ( !(*gen_feature)->GetData().IsFtable() ) continue;
50  bool lres = CheckMissingRibosomalRNA((*gen_feature)->GetData().GetFtable());
// Accumulate with OR: lres is evaluated first, so the call above always happens.
51  result = lres || result;
52  }
54  if(PrintDetails()) NcbiCerr << "CheckMissingRibosomalRNA[annots] ends" << NcbiEndl;
55  return result;
56 }
57 
59  (
60  const CSeq_annot::C_Data::TFtable& feats
61  )
62 {
63 // all rRNAs must be in one table!
64 
65  if(PrintDetails()) NcbiCerr << "CheckMissingRibosomalRNA[feats] starts" << NcbiEndl;
67  bool result = false;
68  bool has5S=false;
69  bool has16S=false;
70  bool has23S=false;
72  {
73  if( !(*f1)->GetData().IsRna() ) continue;
74  CRNA_ref::EType rna_type = (*f1)->GetData().GetRna().GetType();
75  if( rna_type == CRNA_ref::eType_rRNA )
76  {
77  if( !(*f1)->GetData().GetRna().CanGetExt() )
78  {
79  NcbiCerr << "CReadBlastApp::CheckMissingRibosomalRNA[feats]: FATAL: no ext feature in rRNA" << NcbiEndl;
80  throw;
81  }
82  string type = GetRRNAtype((*f1)->GetData().GetRna());
83  if(type == "5S") {has5S = true; }
84  if(type == "16S") {has16S = true; }
85  if(type == "23S") {has23S = true; }
86  }
87  }
88  if(!has5S)
89  NcbiCerr << "CReadBlastApp::CheckMissingRibosomalRNA[feats]: ERROR: 5S ribosomal RNA is missing" << NcbiEndl;
90  if(!has16S)
91  NcbiCerr << "CReadBlastApp::CheckMissingRibosomalRNA[feats]: ERROR: 16S ribosomal RNA is missing" << NcbiEndl;
92  if(!has23S)
93  NcbiCerr << "CReadBlastApp::CheckMissingRibosomalRNA[feats]: ERROR: 23S ribosomal RNA is missing" << NcbiEndl;
95  if(PrintDetails()) NcbiCerr << "CheckMissingRibosomalRNA[feats] ends" << NcbiEndl;
96  return result;
97 }
98 
99 void CReadBlastApp::ugly_simple_overlaps_call(int& n_user_neighbors, int& n_ext_neighbors,
100  TSimpleSeqs::iterator& ext_rna,
101  TSimpleSeqs::iterator& first_user_in_range, TSimpleSeqs::iterator& first_user_non_in_range,
102  TSimpleSeqs& seqs, int max_distance,
103  TSimpleSeqs::iterator& first_ext_in_range, TSimpleSeqs::iterator& first_ext_non_in_range,
104  string& bufferstr)
105 {
106  if(PrintDetails())
107  {
108  if(first_user_in_range==seqs.end())
109  {
110  NcbiCerr << "ugly_simple_overlaps_call: first_user_in_range is already at the end" << NcbiEndl;
111  }
112  else
113  {
114  NcbiCerr << "ugly_simple_overlaps_call: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl;
115  }
116  }
117 
118  n_user_neighbors = get_neighboring_sequences(ext_rna, first_user_in_range, first_user_non_in_range,
119  seqs, max_distance);
120  n_ext_neighbors = get_neighboring_sequences(ext_rna, first_ext_in_range, first_ext_non_in_range,
121  m_extRNAtable2, max_distance);
122  if(PrintDetails())
123  {
124  if(first_user_in_range==seqs.end())
125  {
126  NcbiCerr << "ugly_simple_overlaps_call: after call: first_user_in_range is already at the end" << NcbiEndl;
127  }
128  else
129  {
130  NcbiCerr << "ugly_simple_overlaps_call: after call: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl;
131  }
132  }
133 
135  addSimpleTab(buffer, "CENTER_REFERENCE", ext_rna, max_distance);
136  for(TSimpleSeqs::iterator entry = first_ext_in_range; entry!= first_ext_non_in_range; entry++)
137  {
138  if(entry==ext_rna) continue; // addSimpleTab(buffer, "CENTER_REFERENCE", entry);
139  else addSimpleTab(buffer, "REFERENCE", entry, max_distance);
140  }
141  for(TSimpleSeqs::iterator entry = first_user_in_range; entry!=first_user_non_in_range; entry++)
142  {
143  addSimpleTab(buffer, "VICINITY", entry, max_distance);
144  }
145  buffer << '\0';
146  bufferstr=buffer.str();
147 }
148 
// Body of simple_overlaps.
// NOTE(review): the signature line is absent from this listing; the reference index at the
// end of the listing gives it as `int simple_overlaps(void)` -- confirm against the real file.
//
// For each external RNA entry `ext_rna`, the candidates that find_overlap() places in
// best_seq are compared against it:
//   * a candidate that overlaps, has the same type, but a different strand is recorded as a
//     strand problem (eTRNAUndefStrand when the candidate strand is unknown, otherwise
//     eTRNABadStrand);
//   * if no candidate both overlaps and has the same type, the entry is recorded as
//     eTRNAAbsent and counted.
// Problems are appended to m_diag under the relevant names.
// Returns the number of entries counted as absent.
150 {
151  int nabsent=0;
// The threshold is saved here and restored before returning; the assignment that would
// change it in between is commented out.
152  int saved_m_verbosity_threshold = m_verbosity_threshold;
153  // m_verbosity_threshold = 300;
154  if(PrintDetails()) NcbiCerr << "simple_overlaps starts: " << NcbiEndl;
155  TSimpleSeqs& seqs=m_simple_seqs; // now calculated in CopyGenestoforgotthename
156 
// Window iterators handed to ugly_simple_overlaps_call; they are declared outside the
// per-entry loop, so their positions carry over from one ext_rna to the next.
157  TSimpleSeqs::iterator first_user_in_range = seqs.begin();
158  TSimpleSeqs::iterator first_user_non_in_range = seqs.begin();
159  TSimpleSeqs::iterator first_ext_in_range = m_extRNAtable2.begin();
160  TSimpleSeqs::iterator first_ext_non_in_range = m_extRNAtable2.begin();
161  TSimpleSeqs::iterator seq = seqs.begin();
// NOTE(review): the loop header that introduces `ext_rna` is absent from this listing; the
// comment on the matching closing brace names it as
// NON_CONST_ITERATE(TSimpleSeqs, ext_rna, m_extRNAtable2) -- confirm.
163  {
164  int from, to;
// Overall span of the entry: start of its first exon to end of its last exon.
165  from = ext_rna->exons[0].from;
166  to = ext_rna->exons[ext_rna->exons.size()-1].to;
167  ENa_strand strand = ext_rna->exons[0].strand;
168  int range_scale = to - from;
169  int max_distance = get_max_distance(range_scale);
170  string type2 = ext_rna->name;
171  string ext_rna_range = printed_range(ext_rna);
172  if(PrintDetails()) NcbiCerr << "simple_overlaps[" << type2 << "[" << ext_rna_range << "]" << "]" << NcbiEndl;
173 // find BEST overlap, not good enough here
174  TSimpleSeqs best_seq;
175  find_overlap(seq, ext_rna, seqs, best_seq); // this will slide seq along seqs
176  bool absent = true;
177  string diag_name = ext_rna->name;
178 // for buffer
// bufferstr is filled at most once per ext_rna (guarded by the size checks below) and
// shared by every problem recorded for that entry.
179  int n_user_neighbors=0; int n_ext_neighbors = 0; string bufferstr="";
180  NON_CONST_ITERATE(TSimpleSeqs, seq2, best_seq)
181  {
182  int overlap=0;
183  overlaps(ext_rna, seq2, overlap);
// NOTE(review): seq2_range_stream is never used in the visible code.
184  CNcbiStrstream seq2_range_stream;
185  string seq2_range = printed_range(seq2);
186  if(PrintDetails()) NcbiCerr << "simple_overlaps"
187  << "[" << type2
188  << "[" << ext_rna_range << "]"
189  << "[" << seq2_range << "]"
190  << "]"
191  << ". "
192  << "Overlap = " << overlap
193  << NcbiEndl;
194  if(PrintDetails()) NcbiCerr << "ext_rna->type = " << ext_rna->type << NcbiEndl;
195  if(PrintDetails()) NcbiCerr << "seq2->type = " << seq2->type << NcbiEndl;
196  if(PrintDetails()) NcbiCerr << "strand = " << int(strand) << NcbiEndl;
197  if(PrintDetails()) NcbiCerr << "seq2->exons[0].strand = " << int(seq2->exons[0].strand) << NcbiEndl;
// `absent` stays true only while every candidate so far either has zero overlap or a
// different type.
198  absent = absent && (!overlap || ext_rna->type != seq2->type); // Absent
199  bool bad_strand = (overlap>0 && ext_rna->type == seq2->type && strand != seq2->exons[0].strand); // BadStrand
200  if(!bad_strand) continue;
201  string diag_name2 = seq2->name;
202  int from2, to2;
203  from2 = seq2->exons[0].from;
204  to2 = seq2->exons[seq2->exons.size()-1].to;
205  bool undef_strand = seq2->exons[0].strand == eNa_strand_unknown;
206  if(!bufferstr.size())
207  {
208  if(PrintDetails())
209  {
210  if(first_user_in_range==seqs.end())
211  {
212  NcbiCerr << "simple_overlaps: first_user_in_range is already at the end" << NcbiEndl;
213  }
214  else
215  {
216  NcbiCerr << "simple_overlaps: first_user_in_range = " << printed_range(first_user_in_range) << NcbiEndl;
217  }
218  }
219  ugly_simple_overlaps_call(n_user_neighbors, n_ext_neighbors,
220  ext_rna, first_user_in_range, first_user_non_in_range, seqs, max_distance,
221  first_ext_in_range, first_ext_non_in_range, bufferstr);
222  }
223  CNcbiStrstream misc_feat;
// NOTE(review): this prints the range of `seq` (the iterator advanced by find_overlap), not
// of the candidate `seq2` being reported; whether `seq` can equal seqs.end() at this point is
// not visible here -- confirm which iterator was intended.
224  string seq_range = printed_range(seq);
225  EProblem trnaStrandProblem = undef_strand ? eTRNAUndefStrand : eTRNABadStrand;
226  misc_feat << "RNA does not match strand for feature located at " << seq_range << NcbiEndl;
227  misc_feat << '\0';
228 // this goes to the misc_feat, has to be original location, and name, corrected strand
229  problemStr problem = {trnaStrandProblem, bufferstr, misc_feat.str(), "", "", from2, to2, strand};
230  m_diag[diag_name2].problems.push_back(problem);
// NOTE(review): the trace below prints diag_name and the literal "eTRNABadStrand", while the
// problem just stored is keyed by diag_name2 and may be eTRNAUndefStrand.
231  if(PrintDetails()) NcbiCerr << "simple_overlaps: adding problem:" << "\t"
232  << diag_name << "\t"
233  << "eTRNABadStrand" << "\t"
234  << bufferstr << "\t"
235  << NcbiEndl;
236 // this goes to the log, has to be new
237  problemStr problem2 = {trnaStrandProblem, bufferstr, "", "", "", from, to, strand};
238  m_diag[diag_name].problems.push_back(problem2);
239 
240  } // best_Seq iteration NON_CONST_ITERATE(TSimpleSeqs, seq2, best_seq)
241 
242  if(absent)
243  {
244  if(!bufferstr.size())
245  {
246  ugly_simple_overlaps_call(n_user_neighbors, n_ext_neighbors,
247  ext_rna, first_user_in_range, first_user_non_in_range, seqs, max_distance,
248  first_ext_in_range, first_ext_non_in_range, bufferstr);
249  }
250  CNcbiStrstream misc_feat;
251  misc_feat << "no RNA in the input this type: " <<type2 << "[" << ext_rna_range << "]" << NcbiEndl;
252  misc_feat << '\0';
// Two records are stored under the same name: the first carries the message text and the
// entry's from/to, the second carries bufferstr with -1/-1 as coordinates.
253  problemStr problem = {eTRNAAbsent , "", misc_feat.str(), "", "", from, to, strand};
254  m_diag[diag_name].problems.push_back(problem);
255  if(PrintDetails()) NcbiCerr << "simple_overlaps: adding problem:" << "\t"
256  << diag_name << "\t"
257  << "eTRNAAbsent" << "\t"
258  << bufferstr << "\t"
259  << NcbiEndl;
260  problemStr problem2 = {eTRNAAbsent , bufferstr, "", "", "", -1, -1, strand};
261  m_diag[diag_name].problems.push_back(problem2);
262  nabsent++;
263  }
264 // if no neighbors were in sight we need to settle for the current seq
265  if(first_user_in_range==seqs.end()) first_user_in_range=seq;
266 
267  } // NON_CONST_ITERATE(TSimpleSeqs, ext_rna, m_extRNAtable2)
268  if(PrintDetails()) NcbiCerr << "simple_overlaps ends: " << NcbiEndl;
269  m_verbosity_threshold = saved_m_verbosity_threshold;
270 
271  return nabsent;
272 }
273 
274 
// Parameter list and body of get_neighboring_sequences.
// NOTE(review): the line with the return type and function name is absent from this
// listing; the reference index at the end of the listing gives the declaration as
// `static int get_neighboring_sequences(...)` with these same parameters -- confirm.
//
// ext_rna                  reference entry whose span (first exon start .. last exon end)
//                          defines the target range.
// first_user_in_range      in: where the scan starts; out: first entry found in range, or
//                          seqs.end() when none was found.
// first_user_non_in_range  out: the entry at which the scan stopped (seqs.end() when the
//                          scan ran off the list or nothing was in range).
// seqs                     list that is scanned.
// max_distance             distance passed to sequence_proximity.
// Returns the number of entries that sequence_proximity reported as in range.
276  const TSimpleSeqs::iterator& ext_rna,
277  TSimpleSeqs::iterator& first_user_in_range, TSimpleSeqs::iterator& first_user_non_in_range,
278  TSimpleSeqs& seqs, const int max_distance)
279 // given ext_rna, shifts the window first_user_in_range:first_user_non_in_range in the list of
280 // previously sorted seqs
281 {
282  int from = ext_rna->exons[0].from;
283  int to = ext_rna->exons[ext_rna->exons.size()-1].to;
284 
285  bool first_in_range_set = false;
286  int n=0;
287  if(PrintDetails())
288  {
289  if(first_user_in_range==seqs.end())
290  {
291  NcbiCerr << "get_neighboring_sequences: first_user_in_range is already at the end" << NcbiEndl;
292  }
293  else
294  {
295  NcbiCerr << "get_neighboring_sequences: first_user_in_range = "
296  << printed_range(first_user_in_range) << NcbiEndl;
297  }
298  }
299 
// The scan resumes from the window start left by the previous call and never moves backwards.
300  TSimpleSeqs::iterator seq = first_user_in_range;
301  for(; seq !=seqs.end(); seq++)
302  {
303  int key = seq->key;
304  int from2 = seq->exons[0].from;
305  int to2 = seq->exons[seq->exons.size()-1].to;
// Entries spanning more than 50000 are reported and skipped; they are neither counted nor
// allowed to end the scan.
306  if(to2-from2> 50000)
307  {
308  NcbiCerr << "get_neighboring_sequences: WARNING: span of annotation "
309  << seq->locus_tag << ""
310  << "[" << seq->name<< "],"
311  << "[" << seq->description<< "]"
312  << " is > 50000, probably a break in a circular molecule cutting across the annotation. This annotation will be ignored." << NcbiEndl;
313  continue;
314  }
315  if(PrintDetails())
316  {
317  NcbiCerr << "get_neighboring_sequences: seq = " << printed_range(seq) << NcbiEndl;
318  NcbiCerr << "get_neighboring_sequences: first_in_range_set = " << first_in_range_set << NcbiEndl;
319  }
// sequence_proximity: negative = entry lies before the range, 0 = in range,
// positive = entry lies after the range.
320  int proximity = sequence_proximity(from, to, from2, to2, key, max_distance);
321  if(proximity<0) continue;
322  if (proximity==0) // in the range
323  {
324  if(!first_in_range_set) { first_user_in_range = seq; first_in_range_set=true; }
325  n++;
326  }
327  else // already after the range
328  {
329  break;
330  }
331  }
332  if(PrintDetails()) NcbiCerr << "get_neighboring_sequences: after cycle first_in_range_set = "
333  << first_in_range_set << NcbiEndl;
334  first_user_non_in_range = seq;
// Nothing in range: both ends of the window are parked at seqs.end().
335  if(!first_in_range_set) {first_user_in_range = first_user_non_in_range = seqs.end();}
// The two `n++; n--;` pairs below leave n unchanged.
336  if(first_user_non_in_range==seqs.end())
337  {
338  n++; n--;
339  }
340  if(first_user_in_range==seqs.end())
341  {
342  n++; n--;
343  }
344  if(PrintDetails()) NcbiCerr << "get_neighboring_sequences: returning: " << n << NcbiEndl;
345  return n;
346 }
347 
348 int CReadBlastApp::sequence_proximity(const int target_from, const int target_to,
349  const int from, const int to, const int key)
350 {
351  // int proximity = 0;
352  int range_scale = target_to - target_from;
353  int max_distance = get_max_distance(range_scale);
354  return sequence_proximity(target_from, target_to, from, to, key, max_distance);
355 }
356 
// Converts a feature span into the distance used when searching for neighbors:
// ten times the span, clamped to the interval [0, 5000].
//
// range_scale  span of the reference feature (end minus start); may be negative.
// returns      the clamped distance.
//
// The product is formed in a wider type: multiplying a large span by 10 directly in
// `int` overflows, which is undefined behaviour and in practice could wrap to a
// negative value that was then clamped to 0 instead of 5000.
int get_max_distance(const int range_scale)
{
    const int neighbor_factor = 10;
    const int min_range = 0; // 3000;
    const int max_range = 5000; // 3000;

    const long long scaled = static_cast<long long>(range_scale) * neighbor_factor;
    if(scaled < min_range) return min_range;
    if(scaled > max_range) return max_range;
    return static_cast<int>(scaled);
}
367 
368 int CReadBlastApp::sequence_proximity(const int target_from, const int target_to,
369  const int from, const int to, const int key, const int max_distance)
370 {
371  if(to < target_from - max_distance ) return -1;
372  if(from > target_to + max_distance ) return +1;
373  return 0;
374 }
375 
376 void CReadBlastApp::addSimpleTab(CNcbiStrstream& buffer, const string tag, const TSimpleSeqs::iterator& ext_rna,
377  const int max_distance)
378 {
379  ITERATE(TSimplePairs, e, ext_rna->exons)
380  {
381  string strandt = e->strand == eNa_strand_minus ? "-" : "+";
382  buffer << tag << "\t"
383  << max_distance << "\t"
384  << ext_rna->type << "\t"
385  << ext_rna->name << "(" << ext_rna->locus_tag << ")" << "\t"
386  << ext_rna->exons[0].from << "\t"
387  << ext_rna->description << "\t"
388  << e->from << "\t"
389  << e->to << "\t"
390  << strandt;
391  buffer << NcbiEndl;
392  }
393 }
394 
static int m_verbosity_threshold
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static int sequence_proximity(const int target_from, const int target_to, const int from, const int to, const int key)
Definition: missing.cpp:348
static void IncreaseVerbosity(void)
static int get_neighboring_sequences(const TSimpleSeqs::iterator &ext_rna, TSimpleSeqs::iterator &first_user_in_range, TSimpleSeqs::iterator &first_user_non_in_range, TSimpleSeqs &seqs, const int max_distance)
Definition: missing.cpp:275
TSimpleSeqs m_simple_seqs
int find_overlap(TSimpleSeqs::iterator &seq, const TSimpleSeqs::iterator &ext_rna, TSimpleSeqs &seqs, int &overlap)
Definition: overlaps.cpp:92
static void addSimpleTab(CNcbiStrstream &buffer, const string tag, const TSimpleSeqs::iterator &ext_rna, const int max_distance)
Definition: missing.cpp:376
int simple_overlaps(void)
Definition: missing.cpp:149
static void DecreaseVerbosity(void)
int overlaps(const TSimpleSeqs::iterator &seq1, const TSimpleSeqs::iterator &seq2, int &overlap)
Definition: overlaps.cpp:146
TSimpleSeqs m_extRNAtable2
bool CheckMissingRibosomalRNA(const CBioseq::TAnnot &annots)
Definition: missing.cpp:40
void ugly_simple_overlaps_call(int &n_user_neighbors, int &n_ext_neighbors, TSimpleSeqs::iterator &ext_rna, TSimpleSeqs::iterator &first_user_in_range, TSimpleSeqs::iterator &first_user_non_in_range, TSimpleSeqs &seqs, int max_distance, TSimpleSeqs::iterator &first_ext_in_range, TSimpleSeqs::iterator &first_ext_non_in_range, string &bufferstr)
Definition: missing.cpp:99
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCerr
Definition: ncbistre.hpp:544
EType
type of RNA feature
Definition: RNA_ref_.hpp:95
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
yy_size_t n
static int get_max_distance(const int range_scale)
Definition: missing.cpp:357
const struct ncbi::grid::netcache::search::fields::KEY key
const char * tag
static pcre_uint8 * buffer
Definition: pcretest.c:1051
@ eTRNABadStrand
@ eTRNAUndefStrand
@ eTRNAAbsent
vector< TSimplePair > TSimplePairs
list< TSimpleSeq > TSimpleSeqs
string GetRRNAtype(const CRNA_ref &rna)
Definition: shortcuts.cpp:101
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
Definition: type.c:6
else result
Definition: token2.c:20
Modified on Fri Dec 08 08:21:56 2023 by modify_doxy.py rev. 669887