NCBI C++ ToolKit
glimmer_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: glimmer_reader.cpp 91383 2020-10-21 16:07:12Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
34 #include <objtools/error_codes.hpp>
40 #include <objects/seq/Seq_data.hpp>
41 #include <objmgr/util/sequence.hpp>
42 
43 #include <corelib/ncbiutil.hpp>
44 
45 
46 #define NCBI_USE_ERRCODE_X Objtools_Rd_Glimmer
47 
50 
51 
53 {
54 }
55 
56 
58  int genetic_code_idx)
59 {
60  CRef<CSeq_entry> entry(new CSeq_entry);
61  CRef<CSeq_annot> annot(new CSeq_annot);
62  entry->SetSet().SetSeq_set();
63  entry->SetSet().SetAnnot().push_back(annot);
64 
65  /// parse line by line
66  /// we will be lax and skip empty lines; we will also permit a few comments
67  string line;
68  string defline;
69  CSeq_id_Handle idh;
70  TSeqPos seq_length = 0;
71 
72  size_t errs = 0;
73  size_t count = 0;
74  while (NcbiGetlineEOL(istr, line)) {
75  ++count;
76  if (line.empty() || line[0] == '#' ||
77  (line.size() >= 2 && line[0] == '/' && line[1] == '/')) {
78  continue;
79  }
80 
81  if (defline.empty()) {
82  if (line[0] == '>') {
83  defline = line;
84  string s = defline;
85  string::size_type pos = s.find_first_of(" ");
86  if (pos != string::npos) {
87  s.erase(pos);
88  }
89  s.erase(0, 1);
90 
91  CBioseq::TId ids;
92  CSeq_id::ParseFastaIds(ids, s);
93 
95  idh = CSeq_id_Handle::GetHandle(*best);
96 
97  CBioseq_Handle bsh = scope.GetBioseqHandle(idh);
98  if ( !bsh ) {
100  "Failed to find sequence: " + s);
101  }
102  seq_length = bsh.GetBioseqLength();
103  } else {
104  CNcbiOstrstream ostr;
105  ostr << "CGlimmerReader::ReadAnnot(): line "
106  << count << ": failed to identify defline: " << line;
107  string msg = string(CNcbiOstrstreamToString(ostr));
108  ERR_POST_X(1, Error << msg);
110  }
111  } else {
112  list<string> toks;
113  NStr::Split(line, " \t", toks, NStr::fSplit_Tokenize);
114  if (toks.size() != 5) {
115  CNcbiOstrstream ostr;
116  ostr << "CGlimmerReader::ReadAnnot(): line "
117  << count << ": invalid number of tokens: "
118  << "found " << toks.size() << ", expected 5: " << line;
119  string msg = string(CNcbiOstrstreamToString(ostr));
120  ERR_POST_X(2, Error << msg);
121  ++errs;
122  if (errs > 5) {
124  }
125  }
126 
127  list<string>::iterator it = toks.begin();
128 
129  /// token 1: ORF identifier
130  string orf_name = *it++;
131 
132  /// token 2: start position
133  TSeqPos start_pos = 0;
134  try {
135  start_pos = NStr::StringToInt(*it++);
136  start_pos -= 1;
137  }
138  catch (CException&) {
139  CNcbiOstrstream ostr;
140  ostr << "CGlimmerReader::ReadAnnot(): line "
141  << count << ": failed to identify start pos: " << line;
142  string msg = string(CNcbiOstrstreamToString(ostr));
143  ERR_POST_X(3, Error << msg);
144 
145  ++errs;
146  if (errs > 5) {
148  } else {
149  continue;
150  }
151  }
152 
153  /// token 3: stop position
154  TSeqPos stop_pos = 0;
155  try {
156  stop_pos = NStr::StringToInt(*it++);
157  stop_pos -= 1;
158  }
159  catch (CException&) {
160  CNcbiOstrstream ostr;
161  ostr << "CGlimmerReader::ReadAnnot(): line "
162  << count << ": failed to identify stop pos: " << line;
163  string msg = string(CNcbiOstrstreamToString(ostr));
164  ERR_POST_X(4, Error << msg);
165 
166  ++errs;
167  if (errs > 5) {
169  } else {
170  continue;
171  }
172  }
173 
174  /// stop may be less than start!
175 
176  /// token 4: frame + strand
177  ENa_strand strand = eNa_strand_plus;
178  try {
179  int frame = NStr::StringToInt(*it++);
180  if (frame > 3 || frame < -3) {
181  NCBI_THROW(CException, eUnknown, "frame out of range");
182  }
183 
184  if (frame < 0) {
185  strand = eNa_strand_minus;
186  }
187  }
188  catch (CException&) {
189  CNcbiOstrstream ostr;
190  ostr << "CGlimmerReader::ReadAnnot(): line "
191  << count << ": failed to identify frame: " << line;
192  string msg = string(CNcbiOstrstreamToString(ostr));
193  ERR_POST_X(5, Error << msg);
194 
195  ++errs;
196  if (errs > 5) {
198  } else {
199  continue;
200  }
201  }
202 
203  /// token 5: score
204  //double score = 0;
205  try {
206  /*score =*/ NStr::StringToDouble(*it++);
207  }
208  catch (CException&) {
209  CNcbiOstrstream ostr;
210  ostr << "CGlimmerReader::ReadAnnot(): line "
211  << count << ": failed to identify score: " << line;
212  string msg = string(CNcbiOstrstreamToString(ostr));
213  ERR_POST_X(6, Error << msg);
214 
215  ++errs;
216  if (errs > 5) {
218  } else {
219  continue;
220  }
221  }
222 
223  ///
224  /// build our features
225  ///
226 
227  /// CDS feat
228  CRef<CSeq_feat> cds_feat(new CSeq_feat());
229  if (strand == eNa_strand_plus && start_pos > stop_pos) {
230  /// circular cds_feature; make two intervals
231  CRef<CSeq_interval> ival;
232 
233  ival.Reset(new CSeq_interval);
234  ival->SetFrom(start_pos);
235  ival->SetTo (seq_length - 1);
236  cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
237 
238  ival.Reset(new CSeq_interval);
239  ival->SetFrom(0);
240  ival->SetTo (stop_pos);
241  cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
242 
243  } else if (strand == eNa_strand_minus && start_pos < stop_pos) {
244  /// circular cds_feature; make two intervals
245  CRef<CSeq_interval> ival;
246 
247  ival.Reset(new CSeq_interval);
248  ival->SetFrom(0);
249  ival->SetTo (start_pos);
250  cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
251 
252  ival.Reset(new CSeq_interval);
253  ival->SetFrom(stop_pos);
254  ival->SetTo (seq_length - 1);
255  cds_feat->SetLocation().SetPacked_int().Set().push_back(ival);
256 
257  } else {
258  cds_feat->SetLocation().SetInt().SetFrom(min(start_pos, stop_pos));
259  cds_feat->SetLocation().SetInt().SetTo (max(start_pos, stop_pos));
260  }
261  cds_feat->SetLocation().SetStrand(strand);
262  cds_feat->SetLocation().SetId(*idh.GetSeqId());
263 
264  CCdregion& cdr = cds_feat->SetData().SetCdregion();
265  if (genetic_code_idx) {
267  d->SetId(genetic_code_idx);
268  cdr.SetCode().Set().push_back(d);
269  }
270 
271  CRef<CSeq_feat> gene_feat(new CSeq_feat);
272  gene_feat->SetData().SetGene().SetLocus(orf_name);
273  gene_feat->SetLocation().Assign(cds_feat->GetLocation());
274 
275  annot->SetData().SetFtable().push_back(gene_feat);
276  annot->SetData().SetFtable().push_back(cds_feat);
277  }
278  }
279  LOG_POST_X(7, Info << "CGlimmerReader::Read(): parsed " << count << " lines, " << errs << " errors");
280 
281  string prefix("lcl|prot");
282  count = 0;
283  NON_CONST_ITERATE (CSeq_annot::TData::TFtable, it, annot->SetData().SetFtable()) {
284  CSeq_feat& feat = **it;
286  continue;
287  }
288 
289  CRef<CSeq_entry> sub_entry(new CSeq_entry);
290  CBioseq& bioseq = sub_entry->SetSeq();
291 
292  /// establish our inst
293  CSeq_inst& inst = bioseq.SetInst();
294  CSeqTranslator::Translate(**it, scope,
295  inst.SetSeq_data().SetIupacaa().Set(),
296  false /* trim trailing stop */);
299  inst.SetLength(inst.SetSeq_data().SetIupacaa().Set().size());
300 
301  /// create a readable seq-id
302  CNcbiOstrstream ostr;
303  ostr << prefix << setw(7) << setfill('0') << ++count;
304  string id_str = string(CNcbiOstrstreamToString(ostr));
305 
306  CRef<CSeq_id> id(new CSeq_id(id_str));
307  bioseq.SetId().push_back(id);
308 
309  /// set the product on the feature
310  feat.SetProduct().SetWhole().Assign(*id);
311 
312  /// save our bioseq
313  /// this is done last to preserve our data in a serializable form
314  entry->SetSet().SetSeq_set().push_back(sub_entry);
315  }
316 
317  return entry;
318 }
319 
320 
User-defined methods of the data storage class.
CBioseq_Handle –.
CCdregion –.
Definition: Cdregion.hpp:66
CRef< objects::CSeq_entry > Read(CNcbiIstream &istr, objects::CScope &scope, int genetic_code_idx=11)
Read in and create a seq-annot for the glimmer input; we also optionally create proteins for the CDSs ...
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
USING_SCOPE(objects)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:687
#define LOG_POST_X(err_subcode, message)
Definition: ncbidiag.hpp:553
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
CConstRef< CSeq_id > GetSeqId(void) const
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetTo(TTo value)
Assign a value to To data member.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
void SetFrom(TFrom value)
Assign a value to From data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
Definition of all error codes used in objtools libraries.
Useful/utility classes and methods.
T max(T x_, T y_)
T min(T x_, T y_)
static const char * prefix[]
Definition: pcregrep.c:405
Modified on Wed Apr 17 13:09:52 2024 by modify_doxy.py rev. 669887