NCBI C++ ToolKit
fcs_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: fcs_reader.cpp 93365 2021-04-06 11:27:32Z stakhovv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig, Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Reader for selected data file formats
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
38 
40 
41 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_data.hpp>
44 #include <objects/seq/Seq_ext.hpp>
51 
56 
57 //#include <objtools/readers/error_container.hpp>
58 
59 //#include "multireader.hpp"
60 //#include "table2asn_context.hpp"
61 
62 #include "fcs_reader.hpp"
63 #include "table2asn_context.hpp"
64 
65 
66 #include <common/test_assert.h> /* This header must go last */
67 
68 // exclude from code coverage per RW-589
69 //LCOV_EXCL_START
72 
// File-local helper, presumably xGetCoding (its signature line is not visible
// in this listing). The call site passes the result of CSeq_data::Which() and
// stores the return value as a CSeqUtil::TCoding, so this maps a Seq-data
// choice `ch` to the corresponding CSeqUtil coding.
73 namespace
74 {
76 {
77  switch (ch)
78  {
// The two probability encodings below are commented out, so those choices
// end up in the default branch.
84  //case CSeq_data::e_Ncbipna: return CSeqUtil::e_Ncbipna;
87  //case CSeq_data::e_Ncbipaa: return CSeqUtil::e_Ncbipaa;
// Any choice without an explicit mapping is reported as "not set".
89  default:
90  return CSeqUtil::e_not_set;
91  }
92 }
93 }
94 
// Constructor (signature line not visible in this listing): keeps a reference
// to the caller's table2asn context in m_context. Nothing else is initialised
// here.
96  m_context(context)
97 {
98 }
99 
// Empty body; presumably the destructor (its signature line is not visible in
// this listing) -- TODO confirm.
101 {
102 }
103 
// LoadFile: reads the screening report line by line from `reader` and stores
// one record per data row in m_data, keyed by "lcl|" + the first column.
// Only rows that split into exactly five tab-separated values are used; any
// other row is ignored without a diagnostic.
105 {
106  vector<CTempString> values;
107  values.reserve(5);
108  while (!reader.AtEOF())
109  {
110  reader.ReadLine();
111  // The first line holds the column definitions
112  CTempString current = reader.GetCurrentLine();
// Blank lines and lines starting with '#' carry no data and are skipped.
113  if (current.empty())
114  continue;
115  if (*current.begin() == '#')
116  continue;
117 
118  // Every line except the first is a set of values; the first column is a sequence id
119  values.clear();
// fSplit_Tokenize merges adjacent delimiters, so every token is non-empty.
120  NStr::Split(current, "\t", values, NStr::fSplit_Tokenize);
121  if (values.size() == 5)
122  {
// Rows that share an id address the same m_data entry (assuming map-like
// operator[] semantics -- TODO confirm against the header).
123  TColumns& new_cols = m_data["lcl|" + values[0]];
124  new_cols.name = values[0];
125  new_cols.length = NStr::StringToInt(values[1]);
// Third column: "-" means no range; otherwise it is "<from>..<to>".
126  if (values[2] != "-")
127  {
128  CTempString s1, s2;
129  NStr::SplitInTwo(values[2], "..", s1, s2, NStr::fSplit_ByPattern);
// The range is stored as start -> length, with start = from - 1 and
// length = to - start (i.e. to - from + 1).
130  int start = NStr::StringToInt(s1) - 1;
131  int len = NStr::StringToInt(s2) - start;
132  new_cols.locs[start] = len;
133  }
// Only the first character of the fourth column is kept as the mode.
134  new_cols.mode = values[3][0];
135  new_cols.source = values[4];
136  }
137  }
138 }
139 
// xTrimData: removes every range listed in `locs` (start -> length) from the
// raw sequence string of `inst` and updates the instance length to match.
// Does nothing when `inst` has no Seq_data.
141 {
142  if (!inst.IsSetSeq_data())
143  return;
// Number of characters erased so far; later starts are shifted left by it.
// Assumes `locs` iterates in ascending start order and that the ranges do not
// overlap -- TODO confirm.
144  int removed = 0;
145 
// NOTE(review): the data is read with GetIupacna() and no check of the actual
// encoding is visible here -- confirm the caller guarantees Iupacna.
146  string strdata(inst.GetSeq_data().GetIupacna().Get());
147  ITERATE(Tlocs, it, locs)
148  {
149  strdata.erase(it->first - removed, it->second);
150  removed += it->second;
151  }
152 
// Write the trimmed string back and make the length agree with it.
153  inst.SetSeq_data().SetIupacna().Set(strdata);
154  inst.SetLength(strdata.length());
155 }
156 
// xTrimLiteral: removes positions start..stop (inclusive, stop-start+1
// residues) from the literal `lit`. When the literal carries non-gap sequence
// data, the data is decoded, cut, and re-encoded in its original coding; in
// all cases that reach the end of the function the literal's length is
// reduced by the same amount.
158 {
159  if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap())
160  {
// Exactly one of these is set by the switch below, depending on whether the
// selected encoding is stored as a string or as a vector<char>.
161  string* encoded_str = nullptr;
162  vector< char >* encoded_vec = nullptr;
163  CSeqUtil::TCoding src_coding = xGetCoding(lit.GetSeq_data().Which());
// The case labels of this switch are not visible in this listing; each branch
// takes the address of the mutable storage of one encoding.
164  switch (lit.GetSeq_data().Which())
165  {
167  encoded_str = &lit.SetSeq_data().SetIupacna().Set();
168  break;
170  encoded_str = &lit.SetSeq_data().SetIupacaa().Set();
171  break;
173  encoded_vec = &lit.SetSeq_data().SetNcbi2na().Set();
174  break;
176  encoded_vec = &lit.SetSeq_data().SetNcbi4na().Set();
177  break;
179  encoded_vec = &lit.SetSeq_data().SetNcbi8na().Set();
180  break;
182  encoded_vec = &lit.SetSeq_data().SetNcbipna().Set();
183  break;
185  encoded_vec = &lit.SetSeq_data().SetNcbi8aa().Set();
186  break;
188  encoded_str = &lit.SetSeq_data().SetNcbieaa().Set();
189  break;
191  encoded_vec = &lit.SetSeq_data().SetNcbipaa().Set();
192  break;
194  encoded_vec = &lit.SetSeq_data().SetNcbistdaa().Set();
195  break;
// Unhandled encodings leave the literal untouched; note that this returns
// before the length adjustment at the end of the function.
196  default:
197  return;
198  }
// Decode the whole literal into a one-character-per-residue string so the
// range can be cut with a plain string erase.
// NOTE(review): the intermediate coding is always e_Iupacna, including for
// the amino-acid branches above -- confirm this is intended.
199  string decoded;
200  if (encoded_vec)
201  {
202  CSeqConvert::Convert(*encoded_vec, src_coding, 0, lit.GetLength(), decoded, CSeqUtil::e_Iupacna);
203  }
204  else
205  if (encoded_str)
206  {
207  CSeqConvert::Convert(*encoded_str, src_coding, 0, lit.GetLength(), decoded, CSeqUtil::e_Iupacna);
208  }
209  decoded.erase(start, stop-start+1);
// Re-encode the shortened string into the storage it came from.
210  if (encoded_vec)
211  {
212  CSeqConvert::Convert(decoded, CSeqUtil::e_Iupacna, 0, decoded.length(), *encoded_vec, src_coding);
213  }
214  else
215  if (encoded_str)
216  {
217  CSeqConvert::Convert(decoded, CSeqUtil::e_Iupacna, 0, decoded.length(), *encoded_str, src_coding);
218  }
219  }
// Also reached for gap literals and literals without data: only the length
// changes for those.
220  lit.SetLength() -= (stop-start+1);
221 }
222 
// xTrimExt: applies the ranges in `locs` (start -> length) to a delta
// sequence. Each delta element is checked against every range; an element
// that is covered from its first to its last position is erased from the
// list, otherwise the overlapping part is cut out of the literal via
// xTrimLiteral. The instance length is reduced by what was removed.
// Does nothing unless `inst` has a Delta extension.
224 {
225  if (!(inst.IsSetExt() && inst.GetExt().IsDelta()))
226  return;
227 
228  CDelta_ext::Tdata& data = inst.SetExt().SetDelta().Set();
// Offset of the current element's first position, accumulated from the
// original (untrimmed) lengths of the preceding elements.
229  int current_abs = 0;
230  for (CDelta_ext::Tdata::iterator it = data.begin(); it != data.end();)
231  {
232  CDelta_seq& seq_data = **it;
// Advance `it` now so it stays usable if `removable` is erased below.
233  CDelta_ext::Tdata::iterator removable = it++;
234 
// NOTE(review): GetLiteral() is called without an IsLiteral() check --
// confirm that every element reaching this point is a literal.
235  const int orig_lit_len = seq_data.GetLiteral().GetLength();
236  int local_removed = 0;
237 
238  ITERATE(Tlocs, it_locs, locs)
239  {
240  const int start = it_locs->first;
// Overlap test between the range and this element.
// NOTE(review): the upper bound subtracts local_removed, mixing original and
// already-trimmed coordinates -- confirm this is intended.
241  if (start < current_abs+orig_lit_len-local_removed &&
242  current_abs < start+it_locs->second)
243  {
244  // found, lets trim
// Convert the range to element-local inclusive positions, clamped to the
// element's current bounds.
245  int local_start = max(start-current_abs, 0);
246  int local_end = start+it_locs->second-local_removed-current_abs-1;
247  local_end = min(local_end, (int)seq_data.GetLiteral().GetLength()-1);
248 
249  local_removed += (local_end-local_start+1);
250 
251  if (local_start == 0 && local_end == seq_data.GetLiteral().GetLength()-1)
252  {
// The whole element goes away; stop scanning ranges because `seq_data` is no
// longer part of the list.
253  data.erase(removable);
254  break;
255  }
256  else
257  {
258  xTrimLiteral(seq_data.SetLiteral(), local_start, local_end);
259  //cout << "Removing fragment:" << start+1 << ":" << start+it_locs->second << endl;
260  }
261 
262  }
263  }
264  inst.SetLength() -= local_removed;
265  current_abs += orig_lit_len;
266  }
267 
268 }
269 
// xCheckLen: decides whether a (possibly trimmed) Bioseq should be dropped;
// the caller in this file removes the entry when this returns true.
// Two statements of this function are not visible in this listing (the
// condition inside the length check and the loop header over the delta
// elements), so the notes below are limited to what can be seen.
271 {
272  if (seq.IsSetLength())
273  {
// Hidden condition; presumably compares the length with a minimum threshold
// (cf. "-min-threshold" in the sample command line further down) -- TODO
// confirm.
275  {
276  return true;
277  }
278  }
279  if (seq.IsSetInst() && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta())
280  {
// Hidden loop header; `it` presumably iterates over the delta elements.
282  {
283  const CDelta_seq& delta_seq = **it;
// Any element that is a location, or a literal that carries sequence data,
// means the sequence is kept.
284  if (delta_seq.IsLoc())
285  return false;
286  if (delta_seq.IsLiteral() && delta_seq.GetLiteral().IsSetSeq_data())
287  return false;
288  }
// No element with real sequence content was found.
289  return true;
290  }
291 
292  return false;
293 }
294 
295 // -r C:\Users\gotvyans\Desktop\cplusplus\results -i C:\Users\gotvyans\Desktop\cplusplus\data\test\x.fsa -s -fcs-file C:\Users\gotvyans\Desktop\cplusplus\data\test\FCSresults -fcs-trim -min-threshold 200 -a r10k
// AnnotateOrRemove: applies the loaded report to one entry.
// Returns true when the entry should be removed by the caller (PostProcess
// erases the entry on true) and false when it is kept, possibly after being
// trimmed or annotated.
// Three statements are not visible in this listing: the one that fills
// `label`, a length condition, and the m_data lookup that defines `it`.
297 {
298  string label;
// `label` is presumably filled on the hidden line via entry.GetLabel() --
// TODO confirm. An entry without a label is removed.
300  if (label.empty())
301  return true;
302 
303  if (entry.IsSeq())
304  {
// A Bioseq without any id is removed.
305  if (entry.GetSeq().GetId().empty())
306  {
307  return true;
308  }
309  if (entry.GetSeq().IsSetLength())
310  {
// Hidden condition; presumably a minimum-length test -- TODO confirm.
312  {
313  return true;
314  }
315  }
316  }
317 
// `it` is presumably the result of looking `label` up in m_data on the hidden
// line -- TODO confirm. Entries not mentioned in the report are kept as-is.
319 
320  if (it == m_data.end())
321  {
322  return false;
323  }
324 
325  {
326  const TColumns& data = it->second;
327  if (data.mode == 'X')
328  {
329  // remove it
330  return true;
331  }
332  else
333  if (data.mode == 'M')
334  {
// With trimming enabled (and an Inst present) the reported ranges are cut out
// of the sequence; the entry is removed if xCheckLen says so afterwards.
335  if (m_context.m_fcs_trim && entry.GetSeq().IsSetInst())
336  {
337  if (entry.GetSeq().GetInst().IsSetExt())
338  xTrimExt(entry.SetSeq().SetInst(), data.locs);
339  else
340  xTrimData(entry.SetSeq().SetInst(), data.locs);
341 
342  if (xCheckLen(entry.GetSeq()))
343  return true;
344  }
// This `else` pairs with the trim condition above: without trimming, each
// reported range becomes a "misc_feature" with the comment
// "possible contamination", placed in its own Seq-annot on the Bioseq.
345  else
346  ITERATE(Tlocs, it_loc, it->second.locs)
347  {
348  CRef<CSeq_feat> feat(new CSeq_feat());
349  feat->SetLocation().SetInt().SetId().Assign(*entry.SetSeq().SetId().front());
// NOTE(review): LoadFile stores starts as (from - 1); here 1 is added back,
// so From/To repeat the numbers written in the report. Confirm which
// coordinate base the Seq-interval is expected to use.
350  feat->SetLocation().SetInt().SetFrom(it_loc->first + 1);
351  feat->SetLocation().SetInt().SetTo(it_loc->first + it_loc->second);
352  feat->SetData().SetImp().SetKey("misc_feature");
353  feat->SetComment("possible contamination");
354 
355  CRef<CSeq_annot> set_annot(new CSeq_annot);
356  set_annot->SetData().SetFtable().push_back(feat);
357  entry.SetSeq().SetAnnot().push_back(set_annot);
358  }
359  }
// NOTE(review): `it` is not read again after this increment.
360  ++it;
361  }
362 
363  return false;
364 }
365 
// PostProcess: runs AnnotateOrRemove on every Bioseq that is a direct member
// of the set `entry` and erases the members for which it returns true.
// Members that are themselves sets are skipped (no recursion), and an `entry`
// that is not a set is left untouched.
367 {
368  if (entry.IsSet())
369  {
370  CSeq_entry::TSet::TSeq_set::iterator it = entry.SetSet().SetSeq_set().begin();
371  while (it != entry.SetSet().SetSeq_set().end())
372  {
373  if ((**it).IsSeq())
374  {
375  if (AnnotateOrRemove(**it))
376  {
// Post-increment hands the old position to erase() while `it` already points
// at the next member; `continue` skips the ++it below.
377  entry.SetSet().SetSeq_set().erase(it++);
378  continue;
379  }
380 
381  }
382  ++it;
383  }
384  }
385 }
386 
387 //LCOV_EXCL_STOP
388 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
CDelta_seq –.
Definition: Delta_seq.hpp:66
bool xCheckLen(const objects::CBioseq &inst) const
Definition: fcs_reader.cpp:270
void xTrimData(objects::CSeq_inst &inst, const Tlocs &col) const
Definition: fcs_reader.cpp:140
void PostProcess(objects::CSeq_entry &entry)
Definition: fcs_reader.cpp:366
void LoadFile(ILineReader &linereader)
Definition: fcs_reader.cpp:104
void xTrimLiteral(objects::CSeq_literal &lit, int start, int end) const
Definition: fcs_reader.cpp:157
void xTrimExt(objects::CSeq_inst &inst, const Tlocs &col) const
Definition: fcs_reader.cpp:223
CForeignContaminationScreenReportReader(const CTable2AsnContext &context)
Definition: fcs_reader.cpp:95
const CTable2AsnContext & m_context
Definition: fcs_reader.hpp:54
bool AnnotateOrRemove(objects::CSeq_entry &entry) const
Definition: fcs_reader.cpp:296
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_not_set
Definition: sequtil.hpp:44
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na
Definition: sequtil.hpp:48
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
Definition: Seq_entry.hpp:56
@ eContent
Definition: Seq_entry.hpp:93
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Operators to edit gaps in sequences.
USING_SCOPE(objects)
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TPrim & Get(void) const
Definition: serialbase.hpp:347
CTempString GetCurrentLine(void) const
void ReadLine(void)
Definition: line_reader.hpp:88
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ fSplit_ByPattern
Require full delimiter strings.
Definition: ncbistr.hpp:2502
static const char label[]
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
TLiteral & SetLiteral(void)
Select the variant.
Definition: Delta_seq_.cpp:130
void SetLength(TLength value)
Assign a value to Length data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
bool IsSetSeq_data(void) const
the sequence Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
const TLiteral & GetLiteral(void) const
Get the variant data.
Definition: Delta_seq_.cpp:124
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
bool IsLoc(void) const
Check if variant Loc is selected.
Definition: Delta_seq_.hpp:257
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
TLength GetLength(void) const
Get the Length member data.
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
bool IsLiteral(void) const
Check if variant Literal is selected.
Definition: Delta_seq_.hpp:263
bool IsSetSeq_data(void) const
may have the data Check if a value has been assigned to Seq_data data member.
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ e_Ncbipna
nucleic acid probabilities
Definition: Seq_data_.hpp:109
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbipaa
amino acid probabilities
Definition: Seq_data_.hpp:112
@ e_Ncbi8na
8 bit extended nucleic acid code
Definition: Seq_data_.hpp:108
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
@ e_Ncbi8aa
8 bit extended amino acid codes
Definition: Seq_data_.hpp:110
int len
T max(T x_, T y_)
T min(T x_, T y_)
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Wed Apr 24 14:11:41 2024 by modify_doxy.py rev. 669887