NCBI C++ ToolKit
gaps_edit.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gaps_edit.cpp 100708 2023-08-31 20:11:59Z lavr $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Converts runs of 'N' or 'n' into a gap
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <corelib/ncbiutil.hpp>
37 
38 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_gap.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seq_ext.hpp>
48 
50 
52 
55 
56 namespace
57 {
58 
59 bool Make_Iupacna(const CSeq_data& data, string& decoded, TSeqPos len)
60 {
63  CTempString src;
64  switch (data.Which()) {
65 #define CODING_CASE(x) \
66  case CSeq_data::e_##x: \
67  src.assign(&data.Get##x().Get()[0], data.Get##x().Get().size()); \
68  src_coding = CSeqUtil::e_##x; \
69  CSeqConvert::Convert(src, src_coding, 0, len, decoded, dst_coding); \
70  break;
71  CODING_CASE(Ncbi2na)
72  CODING_CASE(Iupacna)
73  CODING_CASE(Iupacaa)
74  CODING_CASE(Ncbi4na)
75  CODING_CASE(Ncbi8na)
76  CODING_CASE(Ncbi8aa)
77  CODING_CASE(Ncbieaa)
78  CODING_CASE(Ncbistdaa)
79 #undef CODING_CASE
80  default:
81 // ERR_POST_X(1, Warning << "PackWithGaps: unsupported encoding "
82 // << CSeq_data::SelectionName(data.Which()));
83  return false;
84  }
85  return true;
86 }
87 
88 
90 {
91  string decoded;
92  if (!Make_Iupacna(data, decoded, len))
93  return CRef<CDelta_seq>();
94 
95  if (gap_start>0)
96  {
97  ext.AddAndSplit(CTempString(decoded, 0, gap_start), CSeq_data::e_Iupacna, gap_start, true, true);
98  }
99 
100  // here we need to check if segment is gap in fact
101  CDelta_seq& gap = ext.AddLiteral(gap_length);
102 
103  if (gap_start+gap_length < decoded.length())
104  {
105  ext.AddAndSplit(
106  CTempString(decoded, gap_start+gap_length, TSeqPos(decoded.length())-gap_start-gap_length),
107  CSeq_data::e_Iupacna, TSeqPos(decoded.length())-gap_start-gap_length, true, true);
108  }
109 
110  return CRef<CDelta_seq>(&gap);
111 }
112 
113 CDelta_seq& CloneLiteral(const CDelta_seq& delta, CDelta_ext& ext, TSeqPos len)
114 {
115  CDelta_seq& new_delta = ext.AddLiteral(len);
116  if (delta.GetLiteral().IsSetSeq_data())
117  new_delta.SetLiteral().SetSeq_data().Assign(delta.GetLiteral().GetSeq_data());
118 
119  return new_delta;
120 }
121 
123 {
124  if (delta.IsLiteral() && delta.GetLiteral().IsSetSeq_data())
125  return MakeGap(delta.GetLiteral().GetSeq_data(), len, ext, gap_start, gap_length);
126  else
127  {
128  if (gap_start>0)
129  {
130  CloneLiteral(delta, ext, gap_start);
131  }
132  CDelta_seq& new_gap = ext.AddLiteral(gap_length);
133  if (len>gap_start+gap_length)
134  CloneLiteral(delta, ext, len-gap_start-gap_length);
135  return CRef<CDelta_seq>(&new_gap);
136  }
137 }
138 
139 
140 CRef<CDelta_seq> MakeGap(CBioseq::TInst& inst, TSeqPos gap_start, TSeqPos gap_length)
141 {
142  if (inst.IsAa())
143  return CRef<CDelta_seq>();
144 
145  if (inst.IsSetExt()) {
146  CDelta_ext& ext = inst.SetExt().SetDelta();
147 
148  TSeqPos current = 0;
149  // let's find addressed literal
151  {
152  CRef<CDelta_seq> delta = *it;
153  if (delta->IsLiteral())
154  {
155  // literal without length specified break the locup
156  if (!delta->GetLiteral().IsSetLength())
157  return CRef<CDelta_seq>();
158 
159  TSeqPos lit_len = delta->GetLiteral().GetLength();
160 
161  if (gap_start < lit_len)
162  {
163  if (gap_start == 0 && lit_len == gap_length) // && !delta->GetLiteral().IsSetSeq_data())
164  return delta; // simply return the delta, don't create new
165  else
166  {
167  CDelta_ext new_ext; // temporal container
168  CRef<CDelta_seq> gap_seq = MakeGap(*delta, lit_len, new_ext, gap_start, gap_length);
169  if (gap_seq)
170  {
171  // replace the current delta with created
172  it = ext.Set().erase(it);
173  ext.Set().insert(it, new_ext.Set().begin(), new_ext.Set().end());
174  }
175  return gap_seq;
176  }
177  }
178  current += lit_len;
179  if (lit_len > gap_start)
180  return CRef<CDelta_seq>();
181  gap_start -= lit_len;
182  }
183  else
184  {
185  return CRef<CDelta_seq>();
186  }
187  }
188  }
189  else
190  if (inst.IsSetSeq_data())
191  {
192  // convert simple sequence into delta seq
193  const CSeq_data& data = inst.GetSeq_data();
194 
195  CDelta_ext& ext = inst.SetExt().SetDelta();
196 
197  CRef<CDelta_seq> delta = MakeGap(data, inst.GetLength(), ext, gap_start, gap_length);
198 
199  // finalize, if delta seq was not successfull
200  // revert to the original
201  if (ext.Get().size() > 1) {
203  inst.ResetSeq_data();
204  } else { // roll back
205  inst.ResetExt();
206  }
207  return delta;
208  }
209 
210  return CRef<CDelta_seq>();
211 
212 }
213 
214 }
215 
217  TSeqPos gapNmin, TSeqPos gap_Unknown_length) :
218  m_gap_type(gap_type),
219  m_DefaultEvidence(evidences),
220  m_gapNmin(gapNmin),
221  m_gap_Unknown_length(gap_Unknown_length)
222 {
223 }
224 
225 
227  const TEvidenceSet& defaultEvidence,
228  const TCountToEvidenceMap& countToEvidenceMap,
229  TSeqPos gapNmin,
230  TSeqPos gap_Unknown_length)
231  :
232  m_gap_type(gap_type),
233  m_DefaultEvidence(defaultEvidence),
234  m_GapsizeToEvidenceMap(countToEvidenceMap),
235  m_gapNmin(gapNmin),
236  m_gap_Unknown_length(gap_Unknown_length)
237 {
238 }
239 
240 
243 {
244  return CreateGap(bioseq, gap_start, gap_length, gap_length==m_gap_Unknown_length);
245 }
246 
247 
249 CGapsEditor::CreateGap(CBioseq& bioseq, TSeqPos gap_start, TSeqPos nominal_length, bool length_unknown)
250 {
251  if (!bioseq.IsSetInst())
252  return CRef<CDelta_seq>();
253 
254  CRef<CDelta_seq> seq = MakeGap(bioseq.SetInst(), gap_start, nominal_length);
255  if (seq.NotEmpty())
256  {
257  seq->SetLiteral();
258  x_SetGapParameters(*seq, length_unknown);
259  }
260  return seq;
261 }
262 
263 
265 {
266  string decoded;
267  if (!Make_Iupacna(data, decoded, len))
268  return;
269 
270  {
271  size_t index = 0;
272  CTempString current(decoded);
273  size_t start;
274  while (index + m_gapNmin <= current.length() && ((start = current.find_first_of("Nn", index)) != CTempString::npos))
275  {
276  size_t end = current.find_first_not_of("Nn", start);
277  if (end == CTempString::npos)
278  end = current.length();
279  if (end - start >= m_gapNmin)
280  {
281  if (start > 0)
282  ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(start), false, true);
283 
284  CDelta_seq& gap = ext.AddLiteral(TSeqPos(end-start));
285  x_SetGapParameters(gap);
286  current.assign(current.data(), end, current.length() - end);
287  end = 0;
288  }
289  index = end;
290  }
291  if (current.length() > 0)
292  ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(current.length()), false, true);
293  }
294 }
295 
297 {
298  if (inst.IsAa() || !inst.IsSetSeq_data() || inst.IsSetExt()) {
299  return;
300  }
301  const CSeq_data& data = inst.GetSeq_data();
302 
303  CDelta_ext& ext = inst.SetExt().SetDelta();
304 
305  ConvertNs2Gaps(data, inst.GetLength(), ext);
306 
307  if (ext.Get().size() > 1) { // finalize
309  inst.ResetSeq_data();
310  } else { // roll back
311  inst.ResetExt();
312  }
313 }
314 
316 {
317  if (bioseq.IsSetInst() && bioseq.GetInst().IsSetSeq_data() && !bioseq.GetInst().GetSeq_data().IsGap())
318  {
319  ConvertNs2Gaps(bioseq.SetInst());
320  }
321 
322 
323  if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta())
324  {
325  return;
326  }
327 
328  CSeq_inst& inst = bioseq.SetInst();
329 
330  // since delta functions allows adding new literal to an end we create a copy of literal array
331  // to iterate over
332 
333  CDelta_ext::Tdata src_data = inst.GetExt().GetDelta().Get();
334  CDelta_ext& dst_data = inst.SetExt().SetDelta();
335  dst_data.Set().clear();
336 
337  NON_CONST_ITERATE(CDelta_ext::Tdata, it, src_data)
338  {
339  if (!(**it).IsLiteral())
340  {
341  dst_data.Set().push_back(*it);
342  continue;
343  }
344 
345  CDelta_seq::TLiteral& lit = (**it).SetLiteral();
346 
347  // split a literal to elements if needed
348  if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap())
349  {
350  ConvertNs2Gaps(lit.GetSeq_data(), lit.GetLength(), dst_data);
351  }
352  else
353  {
354  // otherwise add it as is and possible update some parameters
355  dst_data.Set().push_back(*it);
356  x_SetGapParameters(**it);
357  }
358  }
359 }
360 
362 {
363  bool length_unknown = (lit.GetLiteral().IsSetLength() &&
365 
366  x_SetGapParameters(lit, length_unknown);
367 }
368 
369 void CGapsEditor::x_SetGapParameters(CDelta_seq& lit, bool length_unknown)
370 {
371  CDelta_seq::TLiteral& gap = lit.SetLiteral();
372  if (length_unknown) {
373  gap.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
374  }
375 
376  if (gap.IsSetSeq_data() &&
377  gap.GetSeq_data().IsGap() &&
378  gap.GetSeq_data().GetGap().GetLinkage_evidence().size() > 0)
379  return;
380 
382  const auto len = gap.GetLength();
383  const auto it = m_GapsizeToEvidenceMap.find(len);
384  const auto& evidenceSet =
385  (it != m_GapsizeToEvidenceMap.end()) ?
386  it->second :
388 
389  if (evidenceSet.empty()) {
390  return;
391  }
392 
393  for (const auto& evidence : evidenceSet) {
394  auto pEvidence = Ref(new CLinkage_evidence());
395  pEvidence->SetType(evidence);
396  gap.SetSeq_data().SetGap().SetLinkage_evidence().emplace_back(std::move(pEvidence));
397  }
398  gap.SetSeq_data().SetGap().SetLinkage(CSeq_gap::eLinkage_linked);
399  gap.SetSeq_data().SetGap().SetType(m_gap_type);
400  }
401 }
402 
404 {
405  if (m_gapNmin == 0 && m_gap_Unknown_length > 0)
406  return;
407 
408  switch(entry.Which())
409  {
410  case CSeq_entry::e_Seq:
411  {
412  ConvertNs2Gaps(entry.SetSeq());
413  }
414  break;
415  case CSeq_entry::e_Set:
417  {
418  ConvertNs2Gaps(**it);
419  }
420  break;
421  default:
422  break;
423  }
424 }
425 
427 {
428  TSeqPos len = bioseq.GetInst().GetLength();
429  CDelta_ext& delta_ext = bioseq.SetInst().SetExt().SetDelta();
430  CRef<CDelta_seq> delta_seq(new CDelta_seq);
431  delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
432  delta_seq->SetLiteral().SetLength(len);
433  delta_ext.Set().push_back(delta_seq);
434  bioseq.SetInst().ResetSeq_data();
435  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
436 }
437 
439 {
440  CRef<CDelta_seq> delta_seq(new CDelta_seq);
441  CDelta_seq::TLiteral& lit = delta_seq->SetLiteral();
442  lit.SetLength(0);
443  x_SetGapParameters(*delta_seq);
444  const TSeqPos len = 100;
445  lit.SetLength(len);
446  bioseq.SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
447  bioseq.SetInst().SetLength() += len;
448 }
449 
451 {
452  auto& delta_set = parent.SetInst().SetExt().SetDelta().Set();
453  if (!delta_set.empty() && delta_set.back()->GetLiteral().IsSetSeq_data())
454  {
455  AppendGap(parent);
456  }
457  if (bioseq.GetInst().IsSetExt())
458  {
459  delta_set.splice(delta_set.end(), bioseq.SetInst().SetExt().SetDelta().Set());
460  }
461  else {
462  CRef<CDelta_seq> delta_seq(new CDelta_seq);
463  delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
464  delta_seq->SetLiteral().SetLength(bioseq.GetInst().GetLength());
465  delta_set.push_back(delta_seq);
466  }
467  parent.SetInst().SetLength() += bioseq.GetLength();
468 }
469 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
void AddAndSplit(const CTempString &src, CSeq_data::E_Choice format, TSeqPos length, bool gaps_ok=false, bool allow_packing=true)
add a chunk of sequence, splitting it as necessary for the sake of compactness (isolating ambiguous p...
Definition: Delta_ext.cpp:183
CDelta_seq & AddLiteral(TSeqPos len)
add a literal segment at the end this variant adds a gap literal
Definition: Delta_ext.cpp:69
CDelta_seq –.
Definition: Delta_seq.hpp:66
CRef< CDelta_seq > CreateGap(CBioseq &bioseq, TSeqPos gap_start, TSeqPos gap_length)
Definition: gaps_edit.cpp:242
void AppendGap(CBioseq &bioseq)
Definition: gaps_edit.cpp:438
void ConvertBioseqToDelta(CBioseq &bioseq)
Definition: gaps_edit.cpp:426
TSeqPos m_gap_Unknown_length
Definition: gaps_edit.hpp:89
void AddBioseqAsLiteral(CBioseq &parent, CBioseq &seqset)
Definition: gaps_edit.cpp:450
TEvidenceSet m_DefaultEvidence
Definition: gaps_edit.hpp:86
void ConvertNs2Gaps(CSeq_entry &entry)
Definition: gaps_edit.cpp:403
void x_SetGapParameters(CDelta_seq &gap)
Definition: gaps_edit.cpp:361
CGapsEditor(CSeq_gap::EType gap_type, const TEvidenceSet &evidences, TSeqPos gapNmin, TSeqPos gap_Unknown_length)
Definition: gaps_edit.cpp:216
TCountToEvidenceMap m_GapsizeToEvidenceMap
Definition: gaps_edit.hpp:87
TSeqPos m_gapNmin
Definition: gaps_edit.hpp:88
CSeq_gap::EType m_gap_type
Definition: gaps_edit.hpp:85
CRef –.
Definition: ncbiobj.hpp:618
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_not_set
Definition: sequtil.hpp:44
Definition: Seq_entry.hpp:56
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
bool empty() const
Definition: set.hpp:133
char data[12]
Definition: iconv.c:80
#define CODING_CASE(x)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
CTempString & assign(const char *src_str, size_type len)
Assign new values to the content of the a string.
Definition: tempstr.hpp:733
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
Definition: tempstr.hpp:553
size_type find_first_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character in the matching string within the current string,...
Definition: tempstr.hpp:538
static const size_type npos
Definition: tempstr.hpp:72
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_entry_.hpp:228
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
TLiteral & SetLiteral(void)
Select the variant.
Definition: Delta_seq_.cpp:130
void SetLength(TLength value)
Assign a value to Length data member.
bool IsSetSeq_data(void) const
the sequence Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
const TGap & GetGap(void) const
Get the variant data.
Definition: Seq_data_.cpp:184
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
const TLiteral & GetLiteral(void) const
Get the variant data.
Definition: Delta_seq_.cpp:124
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
bool IsSetInst(void) const
the sequence data Check if a value has been assigned to Inst data member.
Definition: Bioseq_.hpp:324
Tdata & Set(void)
Assign a value to data member.
Definition: Delta_ext_.hpp:170
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TLength GetLength(void) const
Get the Length member data.
const TLinkage_evidence & GetLinkage_evidence(void) const
Get the Linkage_evidence member data.
Definition: Seq_gap_.hpp:369
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
bool IsSetLength(void) const
must give a length in residues Check if a value has been assigned to Length data member.
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
bool IsSetSeq_data(void) const
may have the data Check if a value has been assigned to Seq_data data member.
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
void ResetSeq_data(void)
Reset Seq_data data member.
Definition: Seq_inst_.cpp:125
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
bm::gap_word_t gap_length(const bm::gap_word_t *buf) noexcept
Returs GAP block length.
Definition: bmfunc.h:1603
if(yy_accept[yy_current_state])
int len
Useful/utility classes and methods.
Int4 delta(size_t dimension_, const Int4 *score_)
Modified on Fri Sep 20 14:57:43 2024 by modify_doxy.py rev. 669887