NCBI C++ ToolKit
collect_simple.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: collect_simple.cpp 46133 2010-06-15 16:13:03Z badrazat $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Azat Badretdin
27 *
28 * File Description:
29 * Collects sequence information in NCBI Toolkit format and stores it in a
30 * simple sequences format in memory
31 *
32 * ===========================================================================
33 */
34 #include <ncbi_pch.hpp>
35 #include "read_blast_result.hpp"
36 
37 static int gene_feat_fit(TSimpleSeqs::iterator& seq, int from, int to);
38 
40 {
41 // collect stuff from proteins
42  for(CTypeIterator< CSeq_entry > s = Begin(); s; ++s)
43  {
44  if(s->IsSet()) continue;
45  if(!is_prot_entry(s->GetSeq())) continue;
46  TSimpleSeq seq;
47  seq.description = GetProtName(s->GetSeq());
48  seq.name = GetStringDescr (s->GetSeq());
49  seq.type = "CDS";
50  seq.seq = CRef<CBioseq>(&(s->SetSeq()));
51  const CSeq_loc& loc = getGenomicLocation(s->GetSeq());
52  addLoctoSimpleSeq(seq, loc);
53  seqs.push_back(seq);
54  if(PrintDetails())
55  {
56  NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to CDS: "
57  << "(" << seq.name << ")"
58  << "(" << printed_range(loc) << ")"
59  << "(" << seq.key << ":" << printed_range(seq) << ")"
60  << NcbiEndl;
61  }
62  }
63 // collect features from RNAs and genes
64  string name;
65  TSimpleSeqs genes;
66  for(CTypeIterator< CSeq_feat > f = Begin(); f; ++f)
67  {
68  const CSeq_loc& loc = f->GetLocation();
69  if(f->GetData().IsGene())
70  {
71  name = "Bad or no locus tag";
72  if (f->GetData().GetGene().CanGetLocus_tag())
73  name = f->GetData().GetGene().GetLocus_tag();
74 // I am assuming that each RNA feature is preceded by a gene
75  TSimpleSeq gene;
76  gene.type = "gene";
77  gene.locus_tag = name;
78  addLoctoSimpleSeq(gene, loc);
79  genes.push_back(gene);
80  if(PrintDetails())
81  {
82  NcbiCerr << "DEBUG: CollectSimpleSeqs(): added loc to gene: "
83  << "(" << name << ")"
84  << "(" << printed_range(loc) << ")"
85  << "(" << gene.key << ":" << printed_range(gene) << ")"
86  << NcbiEndl;
87  }
88  continue;
89  }
90  else if(!f->GetData().IsRna()) continue;
91  CRNA_ref::EType rna_type = f->GetData().GetRna().GetType();
92  string description="Bad or no descriptioin";
93  if ( rna_type == CRNA_ref::eType_tRNA )
94  {
95  if ( f->GetData().GetRna().CanGetExt() )
96  {
97  string type1;
98  try { type1 = Get3type(f->GetData().GetRna());}
99  catch (...)
100  {
101  NcbiCerr << "simple_overlaps: FATAL: cannot get aminoacid type for one trna feats" << NcbiEndl;
102  throw;
103  }
104  description = "tRNA:" + type1;
105  }
106  } // if tRNA
107  else
108  {
109  if(f->GetData().GetRna().CanGetExt() &&
110  f->GetData().GetRna().GetExt().IsName())
111  description = f->GetData().GetRna().GetExt().GetName();
112  }
113  TSimpleSeq seq;
114  if ( rna_type == CRNA_ref::eType_tRNA ) { seq.type = "tRNA"; }
115  else if ( rna_type == CRNA_ref::eType_rRNA ) { seq.type = GetRRNAtype(f->GetData().GetRna());}
116  else if ( rna_type == CRNA_ref::eType_premsg ) { seq.type = "premsg"; }
117  else if ( rna_type == CRNA_ref::eType_mRNA ) { seq.type = "mRNA"; }
118  else if ( rna_type == CRNA_ref::eType_snRNA ) { seq.type = "snRNA"; }
119  else if ( rna_type == CRNA_ref::eType_scRNA ) { seq.type = "scRNA"; }
120  else if ( rna_type == CRNA_ref::eType_snoRNA ) { seq.type = "snoRNA"; }
121  else if ( rna_type == CRNA_ref::eType_other ) { seq.type = "other RNA"; }
122  else { seq.type = "unknown RNA"; }
123  seq.name = name;
124  seq.description = description;
125  addLoctoSimpleSeq(seq, loc);
126  seqs.push_back(seq);
127  } // features
128 
129 // need to tidy up before doing what is next
130  seqs.sort(less_simple_seq);
131  genes.sort(less_simple_seq);
132 
133 // now go over all gene features and match them to seqs features;
134 // first of all do all exact locations
135  TSimpleSeqs::iterator seq = seqs.begin();
136  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
137  {
138  string gene_range = printed_range(gene);
139  int seq_from=0, seq_to=0;
140  int gene_from = gene->exons[0].from;
141  int gene_to = gene->exons[0].to;
142  for(;seq!=seqs.end(); seq++)
143  {
144  string seq_range = printed_range(seq);
145  seq_from = seq->exons[0].from;
146  seq_to = seq->exons[0].to;
147  if(PrintDetails())
148  {
149  NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") to reach gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
150  }
151  if(gene->key<=seq->key) break;
152  }
153  if(seq==seqs.end()) break;
154 
155  seq_from = seq->exons[0].from;
156  seq_to = seq->exons[0].to;
157  string seq_range = printed_range(seq);
158  if(PrintDetails())
159  {
160  NcbiCerr << "DEBUG: CollectSimpleSeqs(): sliding seq " << seq_range << "(key: " << seq->key << ") reached gene " << gene_range << "(key: " << gene->key << "), locus=" << gene->locus_tag << NcbiEndl;
161  }
162  seq_to = seq->exons[seq->exons.size()-1].to;
163  if(seq->exons[0].strand != eNa_strand_plus) // JIRA-PR-147
164  {
165  seq_to = seq->exons[0].to;
166  seq_from= seq->exons[seq->exons.size()-1].from;
167  }
168  gene_to = gene->exons[gene->exons.size()-1].to;
169  if(seq_to==gene_to && seq_from==gene_from) // match
170  {
171  seq->locus_tag = gene->locus_tag;
172  gene=genes.erase(gene++);
173  }
174  else gene++;
175 
176  }
177 /////////////////////////////
178 // now try to assign non-exact gene-CDS matches
179 /////////////////////////////
180  seq=seqs.begin();
181  for(TSimpleSeqs::iterator gene = genes.begin(); gene!=genes.end(); )
182  {
183  string gene_printed_range = printed_range(gene);
184  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches: gene: " << gene_printed_range << NcbiEndl;
185  int gene_from = gene->exons[0].from;
186 // find first sequence that could match a gene
187  TSimpleSeqs::iterator seq_start=seq;
188  for(;seq_start!=seqs.end(); seq_start++)
189  {
190  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying seq_start: "
191  << printed_range(seq_start) << NcbiEndl;
192  if(seq_start->locus_tag != "")
193  {
194  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: "
195  << seq->locus_tag << ", continue..."<<NcbiEndl;
196  continue; // this is done
197  }
198 
199  int seq_from = seq_start->exons[0].strand == eNa_strand_plus ? seq_start->exons[0].from : seq_start->exons[seq_start->exons.size()-1].from;
200  if(gene_from<=seq_from) break; // in case there are cross-origin seqs, they will be in the end of seqs list, so they will be tested the last, thus this incorrect sliding should be fine
201  }
202  if(seq_start==seqs.end()) break; // done with seqs
203 // now check if other ends fit
204  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_start: " << printed_range(seq_start) << NcbiEndl;
205  int seq_to = seq_start->exons[0].strand == eNa_strand_plus
206  ? seq_start->exons[seq_start->exons.size()-1].to
207  : seq_start->exons[0].to;
208  int gene_to = gene->exons[gene->exons.size()-1].to;
209  if ( gene->exons[0].strand != eNa_strand_plus ) gene_to = gene->exons[0].to;
210  if (seq_to > gene_to)
211  {
212  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: sequences jumped over this gene, this gene does not fit any sequence, will be flagged later" << NcbiEndl;
213 // sequences jumped over this gene, this gene does not fit any sequence, will be flagged later
214  gene++;
215  continue;
216  }
217 // end find first sequence that could match a gene
218 // find first sequence that does not match a gene
219  TSimpleSeqs::iterator seq_end = seq_start;
220  int nmatches=0;
221  for(;seq_end!=seqs.end() &&
222  gene_to >= (seq_end->exons[0].strand == eNa_strand_plus
223  ? seq_end->exons[seq_end->exons.size()-1].to
224  : seq_end->exons[0].to);
225  seq_end++)
226  {
227  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: trying to find: current_seq_end "
228  << printed_range(seq_end)
229  << ", gene_to = " << gene_to
230  << ", seq_end.to = " << (seq_end->exons[0].strand == eNa_strand_plus
231  ? seq_end->exons[seq_end->exons.size()-1].to
232  : seq_end->exons[0].to)
233  << NcbiEndl;
234 
235  if(seq_end->type == "CDS" && seq_end->locus_tag == "" ) nmatches++;
236  }
237  if(seq_end!=seqs.end() ) seq_end++;
238  if(PrintDetails())
239  {
240  if(seq_end!=seqs.end() )
241  NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: " << printed_range(seq_start) << NcbiEndl;
242  else
243  NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: found seq_end: end()" << NcbiEndl;
244  }
245 // end find first sequence that does not match a gene
246  if(PrintDetails())
247  {
248  if(seq_end!=seqs.end() )
249  NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: " << printed_range(seq_end) << NcbiEndl;
250  else
251  NcbiCerr << "non-exact gene-CDS matches(" << nmatches << "): seq_end: end()" << NcbiEndl;
252  }
253  if(nmatches>1)
254  {
255  string range = printed_range(gene);
256  NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene["<<gene_printed_range<<"] matches several (" << nmatches << ") CDS features: "
257  << "locus = " << gene->locus_tag << ", "
258  << "[" << range << "]" << NcbiEndl;
259  }
260 
261 // look at all found fits
262  bool gene_used=false;
263 // find best fit and assign locus tag only for that feature
264  TSimpleSeqs::iterator best_seq=seqs.end();
265  int best_gene_feat_fit = 0x0FFFFFFF; // intentionally less than the const in gene_feat_fit function
266  for(seq=seq_start; seq!=seq_end; seq++)
267  {
268  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq) << NcbiEndl;
269  if(seq->locus_tag != "") continue; // this is done already
270  if(PrintDetails()) NcbiCerr << "non-exact gene-CDS matches["<<gene_printed_range<<"]: match: " << printed_range(seq)
271  << " does not have a locus tag yet"
272  << NcbiEndl;
273 /*
274  if(seq->type != "CDS" )
275  {
276  string range = printed_range(seq);
277  NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: non-CDS sequence does not have a gene with exactly the same boundaries: "
278  << "type = " << seq->type << ", "
279  << "name = " << seq->name << ", "
280  << "[" << range << "]" << NcbiEndl;
281  }
282  else
283 */
284  {
285  int fit=gene_feat_fit(seq, gene_from, gene_to);
286  if(fit <= best_gene_feat_fit )
287  {
288  best_seq=seq; best_gene_feat_fit = fit;
289  }
290  }
291  } // for(seq=seq_start; seq!=seq_end; seq++)
292 // found suitable seqs
293  if(best_seq!=seqs.end())
294  {
295  best_seq->locus_tag = gene->locus_tag;
296  gene_used = true;
297  }
298 // go to next gene
299  if(gene_used) gene=genes.erase(gene);
300  else gene++;
301  }
302 
303 // swipe over seqs flag those that do not have locus tag
304  NON_CONST_ITERATE(TSimpleSeqs,seq, seqs)
305  {
306  if(seq->locus_tag != "")
307  {
308  if(seq->type == "CDS")
309  {
310  for(CTypeIterator<CSeq_feat> feat=::Begin(*(seq->seq)); feat; ++feat)
311  {
312  if(feat->CanGetComment() && feat->GetComment().find("Genomic Location: ") != string::npos)
313  {
314  string comment = "Genomic Location: " + seq->locus_tag;
315  feat->SetComment(comment);
316  }
317  }
318  }
319  continue;
320  }
321  string range = printed_range(seq);
322  NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: ERROR: feature does not have a matching gene: "
323  << "type = " << seq->type << ", "
324  << "name = " << seq->name << ", "
325  << "[" << range << "]" << NcbiEndl;
326  }
327 // swipe over genes and flag those that are not used
328  NON_CONST_ITERATE(TSimpleSeqs,gene, genes)
329  {
330  string range = printed_range(gene);
331  NcbiCerr << "CReadBlastApp::CollectSimpleSeqs: WARNING: gene does not match any feature: "
332  << "locus = " << gene->locus_tag << ", "
333  << "[" << range << "]" << NcbiEndl;
334  }
335 
336 // simple seqs collected
337 
338  return 1;
339 
340 }
341 
342 int gene_feat_fit(TSimpleSeqs::iterator& seq, int from, int to)
343 {
344  int r=0xFFFFFFFF;
345  int from2 = seq->exons[0].from;
346  int to2 = seq->exons[seq->exons.size()-1].to;
347  if(seq->exons[0].strand != eNa_strand_plus)
348  {
349  from2 = seq->exons[seq->exons.size()-1].from;
350  to2 = seq->exons[0].to;
351  }
352 // feature seq should be within gene
353  if(from2<from) return r; // no fit at all
354  if(to2>to ) return r; // no fit at all
355 
356  return from2-from+to-to2;
357 }
358 
360 {
361  seq.exons.clear();
362  seq.key=kMax_Int ;
364  inter; ++inter)
365  {
366  TSeqPos from, to;
367  ENa_strand strand;
368  getFromTo(*inter, from, to, strand);
369  TSimplePair exon; exon.from=from; exon.to=to; exon.strand=strand;
370  exon.fuzzy_from = inter->IsPartialStart(eExtreme_Positional);
371  exon.fuzzy_to = inter->IsPartialStop (eExtreme_Positional);
372  if(seq.key>(int)from)
373  {
374  seq.key = (int)from;
375  }
376  if(PrintDetails())
377  {
378  NcbiCerr << "addLoctoSimpleSeq(): exon ("<< printed_range(exon) << ")" << NcbiEndl;
379  }
380  seq.exons.push_back(exon);
381  }
382  TSeqPos from, to;
383  ENa_strand strand;
384  getFromTo(loc, from, to, strand);
385  if((int)seq.exons.size()>1 &&
386  (int)to-(int)from > (int)m_length/2)
387 // over the origin annotation
388  {
389  int i=0;
391  inter; ++inter, ++i)
392  {
393  TSeqPos from, to;
394  ENa_strand strand;
395  getFromTo(*inter, from, to, strand);
396  if(i==0) seq.key = (int)from; // initialize
397  else
398  {
399  if((int)from-seq.key > m_length/2) seq.key = (int)from; // large gap, make it from here
400  }
401  }
402 
403  }
404 }
405 
406 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
static bool PrintDetails(int current_verbosity=m_current_verbosity)
static string GetProtName(const CBioseq &seq)
Definition: shortcuts.cpp:82
void addLoctoSimpleSeq(TSimpleSeq &seq, const CSeq_loc &loc)
int CollectSimpleSeqs(TSimpleSeqs &seqs)
static bool is_prot_entry(const CBioseq &seq)
static bool less_simple_seq(const TSimpleSeq &first, const TSimpleSeq &second)
static const CSeq_loc & getGenomicLocation(const CBioseq &seq)
Definition: locations.cpp:120
static void getFromTo(const CSeq_loc &loc, TSeqPos &from, TSeqPos &to, ENa_strand &strand)
Definition: locations.cpp:34
CBeginInfo Begin(void)
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
static int gene_feat_fit(TSimpleSeqs::iterator &seq, int from, int to)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
#define kMax_Int
Definition: ncbi_limits.h:184
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCerr
Definition: ncbistre.hpp:544
EType
type of RNA feature
Definition: RNA_ref_.hpp:95
@ eType_scRNA
will become ncRNA, with RNA-gen.class = scRNA
Definition: RNA_ref_.hpp:102
@ eType_snoRNA
will become ncRNA, with RNA-gen.class = snoRNA
Definition: RNA_ref_.hpp:103
@ eType_snRNA
will become ncRNA, with RNA-gen.class = snRNA
Definition: RNA_ref_.hpp:101
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
range(_Ty, _Ty) -> range< _Ty >
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
string Get3type(const CRNA_ref &rna)
Definition: shortcuts.cpp:115
list< TSimpleSeq > TSimpleSeqs
string GetRRNAtype(const CRNA_ref &rna)
Definition: shortcuts.cpp:101
string printed_range(const TSeqPos from2, const TSeqPos to2)
Definition: shortcuts.cpp:320
CRef< CBioseq > seq
TSimplePairs exons
Modified on Sun Apr 21 03:44:01 2024 by modify_doxy.py rev. 669887