NCBI C++ ToolKit
rna_edit.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: rna_edit.cpp 95788 2021-12-23 13:29:04Z stakhovv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Igor Filippov
27 *
28 * File Description:
29 * functions for parsing FindITS output
30 */
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbistd.hpp>
36 
40 
41 CFindITSParser::CFindITSParser(const char *input, CSeq_entry_Handle tse) : m_istr(input), m_tse(tse)
42 {
43  m_lr.Reset(ILineReader::New(m_istr));
44  if (m_lr.Empty())
45  {
46  NCBI_THROW(CException, eUnknown, "Unable to read Label RNA|ITS results");
47  }
48 }
49 
51 {
52  const CTempString& line = *++*m_lr;
53  return x_ParseLine(line, m_tse, m_bsh, m_negative, m_msg);
54 }
55 
56 // Dear Future self: https://xkcd.com/1421/
58 {
59  CRef <CSeq_feat> null_mrna;
60  vector<string> arr;
61  NStr::Split(line,"\t",arr);
62  if (arr.size() != 9)
63  {
64  if (arr.size() == 1)
65  msg = "No features found for: " + line;
66  else if (!arr.empty())
67  msg = "Malformed line: " + line;
68  return null_mrna;
69  }
70  string accession = arr[0];
71  string ssu = arr[2];
72  string its1 = arr[3];
73  string r58S = arr[4];
74  string its2 = arr[5];
75  string lsu = arr[6];
76  string error = arr[7];
77  string strand = arr[8];
78 
79  bsh = x_GetBioseqHandleFromIdGuesser(accession,tse);
80  if (!bsh)
81  {
82  msg = "No bioseq found for: " + accession;
83  return null_mrna;
84  }
85 
86  arr.clear();
88  if (!error.empty() && error != "Broken or partial sequence, no 5.8S!" && error != "Broken or partial sequence, only partial 5.8S!")
89  {
90  msg = "Error returned for: "+accession+" "+error;
91  return null_mrna;
92  }
93 
94  NStr::Split(ssu,":",arr);
95  ssu = arr.back();
97  arr.clear();
98 
99  NStr::Split(its1,":",arr);
100  its1 = arr.back();
102  arr.clear();
103 
104  NStr::Split(r58S,":",arr);
105  r58S = arr.back();
107  arr.clear();
108 
109  NStr::Split(its2,":",arr);
110  its2 = arr.back();
112  arr.clear();
113 
114  NStr::Split(lsu,":",arr);
115  lsu = arr.back();
117  arr.clear();
118 
119  bool ssu_present(false);
120  bool lsu_present(false);
121  bool ssu_too_large(false);
122  bool lsu_too_large(false);
123  bool r58S_too_large(false);
124  bool its1_span(false);
125  bool its2_span(false);
126 
127  vector<int> starts;
128  vector<int> stops;
129  vector<bool> spans;
130  int bioseq_length = bsh.GetBioseqLength();
131  GetSpan(ssu, starts, stops, spans);
132  GetSpan(its1, starts, stops, spans);
133  GetSpan(r58S, starts, stops, spans);
134  GetSpan(its2, starts, stops, spans);
135  GetSpan(lsu, starts, stops, spans);
136 
137  its1_span = spans[1];
138  its2_span = spans[3];
139 
140  vector<string> comments;
141  if (ssu != "Not found")
142  {
143  comments.push_back("small subunit ribosomal RNA");
144  ssu_present = true;
145  ssu_too_large = IsLengthTooLarge(ssu, 2200, 0, starts, stops, spans, bioseq_length);
146  }
147  if (its1 != "Not found")
148  {
149  comments.push_back("internal transcribed spacer 1");
150  }
151  if (r58S != "Not found")
152  {
153  comments.push_back("5.8S ribosomal RNA");
154  r58S_too_large = IsLengthTooLarge(r58S, 200, 2, starts, stops, spans, bioseq_length);
155  }
156  if (its2 != "Not found")
157  {
158  comments.push_back("internal transcribed spacer 2");
159  }
160  if (lsu != "Not found")
161  {
162  comments.push_back("large subunit ribosomal RNA");
163  lsu_present = true;
164  lsu_too_large = IsLengthTooLarge(lsu, 5100, 4, starts, stops, spans, bioseq_length);
165  }
166 
167  if (its1_span && its2_span && (r58S == "Not found" || r58S == "No end" || r58S == "No start"))
168  {
169  msg = "5.8S is not found while ITS1 and ITS2 spans exist in: "+accession;
170  return null_mrna;
171  }
172  if (ssu_too_large)
173  {
174  msg = "SSU too large in: "+accession;
175  return null_mrna;
176  }
177  if (lsu_too_large)
178  {
179  msg = "LSU too large in: "+accession;
180  return null_mrna;
181  }
182  if (r58S_too_large)
183  {
184  msg = "5.8S too large in: "+accession;
185  return null_mrna;
186  }
187 
188  string comment;
189  switch(comments.size())
190  {
191  case 0 : comment = "does not contain rna label";break;
192  case 1 :
193  {
194  if (!ssu_present && !lsu_present)
195  {
196  comment = "contains "+comments.front();
197  }
198  }
199  break;
200  case 2 : comment = "contains " + comments[0]+" and "+comments[1];break;
201  default : comment = "contains "+comments[0]; for (unsigned int j=1; j<comments.size()-1;j++) comment += ", "+comments[j]; comment += ", and "+comments.back();break;
202  }
203  negative = strand == "1";
204  if (comments.size() == 1 && (ssu_present || lsu_present))
205  return x_CreateRRna(comments.front(), bsh);
206  return x_CreateMiscRna(comment,bsh);
207 }
208 
209 void CFindITSParser :: GetSpan(const string& str, vector<int>& starts, vector<int>& stops, vector<bool>& spans)
210 {
211  int start, stop;
212  bool span(false);
213  vector<string> arr;
214  NStr::Split(str,"-",arr);
215  if (arr.size() == 2)
216  {
217  span = true;
218  start = NStr::StringToInt(arr.front(), NStr::fConvErr_NoThrow);
220  }
221  starts.push_back(start);
222  stops.push_back(stop);
223  spans.push_back(span);
224 }
225 
226 bool CFindITSParser :: IsLengthTooLarge(const string& str, int max_length,
227  int i,
228  const vector<int>& starts,
229  const vector<int>& stops,
230  const vector<bool>& spans,
231  int bioseq_length)
232 {
233  if (spans[i])
234  {
235  int start = starts[i];
236  int end = stops[i];
237  int length = end - start + 1;
238  return length > max_length;
239  }
240  if (str == "No end")
241  {
242  int start = 1;
243  for (int j = i - 1; j >= 0; j--)
244  {
245  if (spans[j])
246  {
247  start = stops[j] + 1;
248  break;
249  }
250  }
251  int end = bioseq_length;
252  int length = end - start + 1;
253  return length > max_length;
254  }
255  if (str == "No start")
256  {
257  int start = 1;
258  int end = bioseq_length;
259  for (int j = i + 1; j < spans.size(); j++)
260  {
261  if (spans[j])
262  {
263  end = starts[j] - 1;
264  break;
265  }
266  }
267  int length = end - start + 1;
268  return length > max_length;
269  }
270  return false;
271 }
272 
274 {
275  CRef <CSeq_feat> new_mrna (new CSeq_feat());
276  new_mrna->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
277  new_mrna->SetComment(comment);
278 
279  CRef<CSeq_loc> loc(new CSeq_loc());
280  loc->SetInt().SetFrom(0);
281  loc->SetInt().SetTo(bsh.GetBioseqLength()-1);
282  loc->SetInt().SetStrand(eNa_strand_plus);
285  loc->SetId(*bsh.GetSeqId());
286  new_mrna->SetLocation(*loc);
287 
288  new_mrna->SetPartial(true);
289  return new_mrna;
290 }
291 
293 {
294  CRef <CSeq_feat> new_rrna (new CSeq_feat());
295  new_rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
296  string remainder;
297  new_rrna->SetData().SetRna().SetRnaProductName(comment, remainder);
298 
299  CRef<CSeq_loc> loc(new CSeq_loc());
300  loc->SetInt().SetFrom(0);
301  loc->SetInt().SetTo(bsh.GetBioseqLength()-1);
302  loc->SetInt().SetStrand(eNa_strand_plus);
305  loc->SetId(*bsh.GetSeqId());
306  new_rrna->SetLocation(*loc);
307 
308  new_rrna->SetPartial(true);
309  return new_rrna;
310 }
311 
313 {
314  CRef<edit::CStringConstraint> constraint(new edit::CStringConstraint(id_str, edit::CStringConstraint::eMatchType_Equals));
316  while (bi)
317  {
318  if (edit::CSeqIdGuesser::DoesSeqMatchConstraint(*bi,constraint))
319  return *bi;
320  ++bi;
321  }
322 
323  return CBioseq_Handle();
324 }
325 
326 
330 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
string m_msg
Definition: rna_edit.hpp:74
CBioseq_Handle x_GetBioseqHandleFromIdGuesser(const string &id_str, objects::CSeq_entry_Handle tse)
Definition: rna_edit.cpp:312
CRef< CSeq_feat > x_CreateMiscRna(const string &comment, CBioseq_Handle bsh)
Definition: rna_edit.cpp:273
void GetSpan(const string &str, vector< int > &starts, vector< int > &stops, vector< bool > &spans)
Definition: rna_edit.cpp:209
CBioseq_Handle m_bsh
Definition: rna_edit.hpp:72
CRef< CSeq_feat > ParseLine()
Definition: rna_edit.cpp:50
CRef< ILineReader > m_lr
Definition: rna_edit.hpp:70
CRef< CSeq_feat > x_ParseLine(const CTempString &line, CSeq_entry_Handle tse, CBioseq_Handle &bsh, bool &negative, string &msg)
Definition: rna_edit.cpp:57
CRef< CSeq_feat > x_CreateRRna(const string &comment, CBioseq_Handle bsh)
Definition: rna_edit.cpp:292
CSeq_entry_Handle m_tse
Definition: rna_edit.hpp:71
bool IsLengthTooLarge(const string &str, int max_length, int i, const vector< int > &starts, const vector< int > &stops, const vector< bool > &spans, int bioseq_length)
Definition: rna_edit.cpp:226
CSeq_entry_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void SetPartial(TPartial value)
Assign a value to Partial data member.
Definition: Seq_feat_.hpp:971
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static int input()
int i
Definition: fix_pub.hpp:45
T negative(T x_)
static const char * str(char *buf, int n)
Definition: stats.c:84
#define const
Definition: zconf.h:230
Modified on Thu Feb 29 12:17:17 2024 by modify_doxy.py rev. 669887