1 /* $Id: fcs_reader.cpp 93365 2021-04-06 11:27:32Z stakhovv $
2 * ===========================================================================
3 *
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig, Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Reader for selected data file formats
30 *
31 * ===========================================================================
32 */
34 #include <ncbi_pch.hpp>
41 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_data.hpp>
44 #include <objects/seq/Seq_ext.hpp>
57 //#include <objtools/readers/error_container.hpp>
59 //#include "multireader.hpp"
60 //#include "table2asn_context.hpp"
62 #include "fcs_reader.hpp"
63 #include "table2asn_context.hpp"
66 #include <common/test_assert.h> /* This header must go last */
68 // exclude from code coverage per RW-589
// File-local helper kept in an anonymous namespace.
// NOTE(review): the helper's signature line and its active `case` lines are
// not visible in this view. Judging by the commented-out cases and the default
// return, it maps a CSeq_data choice value `ch` to a CSeqUtil coding value;
// presumably this is the xGetCoding() used elsewhere in this file -- confirm.
73 namespace
74 {
76 {
  // Dispatch on the incoming choice value.
77  switch (ch)
78  {
  // The Ncbipna and Ncbipaa mappings are disabled (commented out).
84  //case CSeq_data::e_Ncbipna: return CSeqUtil::e_Ncbipna;
87  //case CSeq_data::e_Ncbipaa: return CSeqUtil::e_Ncbipaa;
  // Any value without an explicit case yields e_not_set.
89  default:
90  return CSeqUtil::e_not_set;
91  }
92 }
93 }
// Member initializer: stores the `context` argument in m_context.
// NOTE(review): the constructor's signature line is not visible in this view.
96  m_context(context)
97 {
  // Nothing else to do; the body is intentionally empty.
98 }
101 {
102 }
// Reads a tab-separated report from `reader`, one line at a time, and stores
// one TColumns record per data row in m_data, keyed by "lcl|" + first field.
// NOTE(review): the signature line of this function is not visible in this view.
105 {
  // Scratch vector reused for every row; rows are expected to have 5 fields.
106  vector<CTempString> values;
107  values.reserve(5);
108  while (!reader.AtEOF())
109  {
110  reader.ReadLine();
111  // First line is the column definitions
112  CTempString current = reader.GetCurrentLine();
  // Skip blank lines.
113  if (current.empty())
114  continue;
  // Skip lines that start with '#'.
115  if (*current.begin() == '#')
116  continue;
118  // Each line except the first is a set of values; the first column is a sequence id
119  values.clear();
120  NStr::Split(current, "\t", values, NStr::fSplit_Tokenize);
  // Rows that do not split into exactly five fields are silently ignored.
121  if (values.size() == 5)
122  {
  // Record for this sequence id, looked up (or created) under "lcl|<id>".
123  TColumns& new_cols = m_data["lcl|" + values[0]];
  // NOTE(review): the left-hand side of the next assignment is not visible in this view.
124 = values[0];
125  new_cols.length = NStr::StringToInt(values[1]);
  // Third field is either "-" (no range) or a range written as "<a>..<b>".
126  if (values[2] != "-")
127  {
128  CTempString s1, s2;
129  NStr::SplitInTwo(values[2], "..", s1, s2, NStr::fSplit_ByPattern);
  // start = a - 1; len = b - start, i.e. b - a + 1 (both ends of a..b counted).
130  int start = NStr::StringToInt(s1) - 1;
131  int len = NStr::StringToInt(s2) - start;
  // Ranges are stored as start -> length.
132  new_cols.locs[start] = len;
133  }
  // mode is the first character of the fourth field; source is the fifth field.
134  new_cols.mode = values[3][0];
135  new_cols.source = values[4];
136  }
137  }
138 }
// Erases every (start -> length) range listed in `locs` from the IUPACna
// sequence string of `inst`, then writes the string back and updates the length.
// Does nothing when `inst` has no sequence data.
// NOTE(review): the signature line is not visible in this view; presumably this
// is the body of xTrimData(inst, locs) called elsewhere in this file -- confirm.
141 {
142  if (!inst.IsSetSeq_data())
143  return;
  // Number of characters already erased; used to shift later start positions.
144  int removed = 0;
  // Work on a copy of the sequence text.
  // NOTE(review): GetIupacna() is called without checking the data's coding.
146  string strdata(inst.GetSeq_data().GetIupacna().Get());
147  ITERATE(Tlocs, it, locs)
148  {
  // it->first is the start in original coordinates, it->second the length.
  // NOTE(review): the shift by `removed` assumes ranges are visited in
  // ascending start order and do not overlap -- confirm against Tlocs.
149  strdata.erase(it->first - removed, it->second);
150  removed += it->second;
151  }
  // Store the trimmed text and make the recorded length match it.
153  inst.SetSeq_data().SetIupacna().Set(strdata);
154  inst.SetLength(strdata.length());
155 }
// Removes the inclusive range [start, stop] from the literal `lit`.
// When the literal carries non-gap sequence data, the data is converted to
// text, the range is erased, and the result is converted back into the same
// buffer using the original coding. In every non-early-return path the
// literal's length is reduced by (stop - start + 1).
// NOTE(review): the signature line and the `case` labels of the switch are not
// visible in this view; presumably this is xTrimLiteral(lit, start, stop).
158 {
159  if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap())
160  {
  // Exactly one of these ends up pointing at the literal's underlying buffer,
  // depending on whether the coding is stored as a string or as a byte vector.
161  string* encoded_str = nullptr;
162  vector< char >* encoded_vec = nullptr;
163  CSeqUtil::TCoding src_coding = xGetCoding(lit.GetSeq_data().Which());
164  switch (lit.GetSeq_data().Which())
165  {
167  encoded_str = &lit.SetSeq_data().SetIupacna().Set();
168  break;
170  encoded_str = &lit.SetSeq_data().SetIupacaa().Set();
171  break;
173  encoded_vec = &lit.SetSeq_data().SetNcbi2na().Set();
174  break;
176  encoded_vec = &lit.SetSeq_data().SetNcbi4na().Set();
177  break;
179  encoded_vec = &lit.SetSeq_data().SetNcbi8na().Set();
180  break;
182  encoded_vec = &lit.SetSeq_data().SetNcbipna().Set();
183  break;
185  encoded_vec = &lit.SetSeq_data().SetNcbi8aa().Set();
186  break;
188  encoded_str = &lit.SetSeq_data().SetNcbieaa().Set();
189  break;
191  encoded_vec = &lit.SetSeq_data().SetNcbipaa().Set();
192  break;
194  encoded_vec = &lit.SetSeq_data().SetNcbistdaa().Set();
195  break;
  // Unhandled coding: leave the literal completely untouched (length included).
196  default:
197  return;
198  }
  // Decode the whole literal into text.
  // NOTE(review): the intermediate coding is always e_Iupacna, including the
  // amino-acid branches above -- confirm this is intended.
199  string decoded;
200  if (encoded_vec)
201  {
202  CSeqConvert::Convert(*encoded_vec, src_coding, 0, lit.GetLength(), decoded, CSeqUtil::e_Iupacna);
203  }
204  else
205  if (encoded_str)
206  {
207  CSeqConvert::Convert(*encoded_str, src_coding, 0, lit.GetLength(), decoded, CSeqUtil::e_Iupacna);
208  }
  // Drop the inclusive range [start, stop] from the decoded text.
209  decoded.erase(start, stop-start+1);
  // Re-encode the remaining text back into the original buffer and coding.
210  if (encoded_vec)
211  {
212  CSeqConvert::Convert(decoded, CSeqUtil::e_Iupacna, 0, decoded.length(), *encoded_vec, src_coding);
213  }
214  else
215  if (encoded_str)
216  {
217  CSeqConvert::Convert(decoded, CSeqUtil::e_Iupacna, 0, decoded.length(), *encoded_str, src_coding);
218  }
219  }
  // Also reached when the literal has no data or is a gap: only the length shrinks.
220  lit.SetLength() -= (stop-start+1);
221 }
// Applies the (start -> length) ranges in `locs` to a delta-ext sequence:
// each delta segment that intersects a range is either trimmed via
// xTrimLiteral() or, when the range covers it entirely, erased from the list.
// inst's length is reduced by the amount removed from each segment.
// Does nothing unless `inst` has an Ext that is a Delta.
// NOTE(review): the signature line is not visible in this view; presumably this
// is the body of xTrimExt(inst, locs) called elsewhere in this file -- confirm.
224 {
225  if (!(inst.IsSetExt() && inst.GetExt().IsDelta()))
226  return;
228  CDelta_ext::Tdata& data = inst.SetExt().SetDelta().Set();
  // Running sum of the ORIGINAL lengths of the segments already visited.
229  int current_abs = 0;
  // No increment in the for header: the iterator is advanced inside the body.
230  for (CDelta_ext::Tdata::iterator it = data.begin(); it != data.end();)
231  {
232  CDelta_seq& seq_data = **it;
  // Remember the current position and advance `it` first, so that erasing
  // `removable` below does not touch the loop iterator.
233  CDelta_ext::Tdata::iterator removable = it++;
  // NOTE(review): GetLiteral() is called without an IsLiteral() check here.
235  const int orig_lit_len = seq_data.GetLiteral().GetLength();
  // Number of positions removed from this segment so far.
236  int local_removed = 0;
238  ITERATE(Tlocs, it_locs, locs)
239  {
240  const int start = it_locs->first;
  // Overlap test between the range and what is left of this segment.
241  if (start < current_abs+orig_lit_len-local_removed &&
242  current_abs < start+it_locs->second)
243  {
244  // found, lets trim
  // Clamp the range to this segment, expressed in segment-local coordinates.
245  int local_start = max(start-current_abs, 0);
246  int local_end = start+it_locs->second-local_removed-current_abs-1;
247  local_end = min(local_end, (int)seq_data.GetLiteral().GetLength()-1);
249  local_removed += (local_end-local_start+1);
  // The range spans the whole (current) literal: drop the segment itself
  // and stop looking at further ranges for it.
251  if (local_start == 0 && local_end == seq_data.GetLiteral().GetLength()-1)
252  {
253  data.erase(removable);
254  break;
255  }
256  else
257  {
258  xTrimLiteral(seq_data.SetLiteral(), local_start, local_end);
259  //cout << "Removing fragment:" << start+1 << ":" << start+it_locs->second << endl;
260  }
262  }
263  }
  // Account for what was removed from this segment, then move the absolute
  // offset forward by the segment's original (untrimmed) length.
264  inst.SetLength() -= local_removed;
265  current_abs += orig_lit_len;
266  }
268 }
// Returns a bool computed from `seq`'s length and delta contents.
// NOTE(review): the signature line and two inner header lines (a condition and
// a loop) are not visible in this view; presumably this is the body of
// xCheckLen(seq), whose `true` result makes the caller in this file return
// true as well -- confirm.
271 {
272  if (seq.IsSetLength())
273  {
  // NOTE(review): the condition guarding this `return true` is not visible here.
275  {
276  return true;
277  }
278  }
279  if (seq.IsSetInst() && seq.GetInst().IsSetExt() && seq.GetInst().GetExt().IsDelta())
280  {
  // NOTE(review): loop header not visible; `it` walks the delta segments.
282  {
283  const CDelta_seq& delta_seq = **it;
  // Any segment that is a location, or a literal that carries sequence data,
  // makes the result false.
284  if (delta_seq.IsLoc())
285  return false;
286  if (delta_seq.IsLiteral() && delta_seq.GetLiteral().IsSetSeq_data())
287  return false;
288  }
  // No segment matched either test above.
289  return true;
290  }
  // Not a delta sequence and the length test above did not fire.
292  return false;
293 }
295 // -r C:\Users\gotvyans\Desktop\cplusplus\results -i C:\Users\gotvyans\Desktop\cplusplus\data\test\x.fsa -s -fcs-file C:\Users\gotvyans\Desktop\cplusplus\data\test\FCSresults -fcs-trim -min-threshold 200 -a r10k
// Decides what to do with one sequence entry based on its record in m_data.
// Returns true when the entry should be removed (the caller in this file
// erases the entry when this returns true) and false when it is kept, possibly
// after being trimmed or annotated.
// NOTE(review): the signature line and several interior lines (label
// computation, m_data lookup, loop/condition headers) are not visible in this
// view; presumably this is AnnotateOrRemove(entry) -- confirm.
297 {
  // NOTE(review): `label` is presumably filled by a line not visible here.
298  string label;
  // An empty label leads straight to "remove".
300  if (label.empty())
301  return true;
303  if (entry.IsSeq())
304  {
  // A sequence without any id is removed.
305  if (entry.GetSeq().GetId().empty())
306  {
307  return true;
308  }
309  if (entry.GetSeq().IsSetLength())
310  {
  // NOTE(review): the condition guarding this `return true` is not visible here.
312  {
313  return true;
314  }
315  }
316  }
  // NOTE(review): `it` is declared on a line not visible here; no record in
  // m_data means the entry is kept unchanged.
320  if (it == m_data.end())
321  {
322  return false;
323  }
  // NOTE(review): the header of this block (loop or condition) is not visible here.
325  {
326  const TColumns& data = it->second;
327  if (data.mode == 'X')
328  {
329  // remove it
330  return true;
331  }
332  else
333  if (data.mode == 'M')
334  {
  // With trimming enabled in the context, cut the listed ranges out of the
  // sequence: delta-ext and plain data are handled by different helpers.
335  if (m_context.m_fcs_trim && entry.GetSeq().IsSetInst())
336  {
337  if (entry.GetSeq().GetInst().IsSetExt())
338  xTrimExt(entry.SetSeq().SetInst(), data.locs);
339  else
340  xTrimData(entry.SetSeq().SetInst(), data.locs);
  // After trimming, a true result from xCheckLen() means "remove".
342  if (xCheckLen(entry.GetSeq()))
343  return true;
344  }
345  else
  // Otherwise leave the sequence intact and attach one "misc_feature"
  // import feature per range, each in its own new annotation.
346  ITERATE(Tlocs, it_loc, it->second.locs)
347  {
348  CRef<CSeq_feat> feat(new CSeq_feat());
  // The feature location uses the sequence's first id.
349  feat->SetLocation().SetInt().SetId().Assign(*entry.SetSeq().SetId().front());
  // NOTE(review): locs stores (report start - 1) -> length, so these values
  // equal the start/end numbers as written in the report -- confirm this is
  // the coordinate base the interval setters expect.
350  feat->SetLocation().SetInt().SetFrom(it_loc->first + 1);
351  feat->SetLocation().SetInt().SetTo(it_loc->first + it_loc->second);
352  feat->SetData().SetImp().SetKey("misc_feature");
353  feat->SetComment("possible contamination");
355  CRef<CSeq_annot> set_annot(new CSeq_annot);
356  set_annot->SetData().SetFtable().push_back(feat);
357  entry.SetSeq().SetAnnot().push_back(set_annot);
358  }
359  }
360  ++it;
361  }
  // Entry is kept.
363  return false;
364 }
// For a set-type `entry`, runs AnnotateOrRemove() on every direct member that
// is a sequence and erases the members for which it returns true.
// Members that are not sequences are skipped; a non-set `entry` is left as is.
// NOTE(review): the signature line (and therefore this function's name) is not
// visible in this view.
367 {
368  if (entry.IsSet())
369  {
370  CSeq_entry::TSet::TSeq_set::iterator it = entry.SetSet().SetSeq_set().begin();
371  while (it != entry.SetSet().SetSeq_set().end())
372  {
373  if ((**it).IsSeq())
374  {
375  if (AnnotateOrRemove(**it))
376  {
  // Post-increment hands the old position to erase() while `it` already
  // points at the next member; `continue` skips the ++it below.
377  entry.SetSet().SetSeq_set().erase(it++);
378  continue;
379  }
381  }
382  ++it;
383  }
384  }
385 }
