NCBI C++ ToolKit
expand_gaps.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: expand_gaps.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Igor Filippov
27  */
28 
29 
30 #include <ncbi_pch.hpp>
33 #include <objmgr/bioseq_ci.hpp>
34 #include <objmgr/seq_vector.hpp>
35 #include <objects/seq/Seq_ext.hpp>
39 
42 
44 {
45  return seg.IsLiteral() && seg.GetLiteral().IsSetFuzz();
46 }
47 
48 void CExpandGaps::AppendLiteral(CDelta_ext &delta, const string& iupac)
49 {
50  if (delta.IsSet() && !delta.Get().empty() && delta.Get().back()->IsLiteral() && delta.Get().back()->GetLiteral().IsSetSeq_data()
51  && delta.Get().back()->GetLiteral().GetSeq_data().IsIupacna())
52  {
53  string seq = delta.Get().back()->GetLiteral().GetSeq_data().GetIupacna().Get();
54  seq += iupac;
55  delta.Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set().assign(seq.data(), seq.length());
56  delta.Set().back()->SetLiteral().SetLength((CSeq_literal::TLength)seq.size());
57  }
58  else
59  {
60  delta.AddLiteral(iupac, CSeq_inst::eMol_na, false);
61  }
62 }
63 
// Builds the composite command "Expand Known Gaps to Include Flanking Ns" for
// the Bioseqs visited by a CBioseq_CI over `tse` with the CSeq_inst::eMol_na filter.
//
// For every visited Bioseq whose instance has a delta extension containing no
// Loc segments, a copy of the CSeq_inst is made and its delta extension is
// rebuilt segment by segment:
//   * a gap segment for which IsSkipGap() is false is stretched over the 'N'
//     residues (outside any gap) that directly precede and follow it;
//   * a gap segment for which IsSkipGap() is true is copied unchanged;
//   * any other segment is re-emitted as an IUPAC literal with its trailing
//     'N' residues held back so that a following gap can claim them.
// A command is added only when some stretched gap ended up with a length
// different from its original length and no exception was raised.
//
// @param tse  Seq-entry handle to scan.
// @return     Null CRef when `tse` tests false; otherwise the composite, which
//             may contain no commands.
64 CRef<CCmdComposite> CExpandGaps::apply(objects::CSeq_entry_Handle tse)
65 {
66  if (!tse)
67  return CRef<CCmdComposite>(NULL);
68 
69  CRef<CCmdComposite> composite( new CCmdComposite("Expand Known Gaps to Include Flanking Ns") );
70  for (CBioseq_CI bi(tse, CSeq_inst::eMol_na); bi; ++bi)
71  {
72  CBioseq_Handle bsh = *bi;
73  TSeqPos len = bsh.GetBioseqLength();
 // Sequences of length 0 or 1, or without an instance, are skipped.
74  if (len <= 1)
75  continue;
76  if (!bsh.IsSetInst())
77  continue;
78 
79  const CSeq_inst &inst = bsh.GetInst();
 // Only delta sequences are processed.
80  if (!inst.IsSetExt() || !inst.GetExt().IsDelta())
81  continue;
82  bool has_loc(false);
83 
 // A single Loc segment disqualifies the whole Bioseq. The rebuild loop
 // below calls GetLiteral() on every segment, so it relies on this filter.
84  ITERATE(CDelta_ext::Tdata, iter, inst.GetExt().GetDelta().Get())
85  {
86  if ( (*iter)->IsLoc() )
87  {
88  has_loc = true;
89  break;
90  }
91  }
92  if (has_loc)
93  continue;
94 
 // Work on a copy of the instance whose extension is cleared and then
 // rebuilt from scratch by the loop below.
95  CRef<CSeq_inst> new_inst(new CSeq_inst);
96  new_inst->Assign(inst);
97  new_inst->ResetExt();
98  bool modified(false);
99  try
100  {
 // pos         - start coordinate of the current original segment.
 // prev_finish - first coordinate not yet written to new_inst.
101  TSeqPos pos = 0;
102  int prev_finish = 0;
 // NOTE(review): `sv` is used below but its declaration is not among the
 // lines visible here (listing line 103 is absent from this view);
 // presumably a CSeqVector over `bsh` in IUPAC coding -- confirm.
104  ITERATE (CDelta_ext::Tdata, delta_i, inst.GetExt().GetDelta().Get())
105  {
106  if (delta_i->Empty())
107  continue; // Ignore NULLs, reported separately above.
108  const CDelta_seq& seg = **delta_i;
109  CSeq_literal::TLength delta_len = seg.GetLiteral().GetLength();
 // Case 1: gap that IsSkipGap() does not exclude -> stretch it.
110  if (sv.IsInGap (pos) && !IsSkipGap(seg))
111  {
 // Walk forward from the first position after the gap across 'N'
 // residues that are not themselves inside a gap.
 // NOTE(review): `cur_finish` is int while `len` is TSeqPos (unsigned
 // int), so this comparison mixes signed and unsigned operands.
112  int cur_finish = pos + delta_len;
113  while (cur_finish < len && sv[cur_finish] == 'N' && !sv.IsInGap(cur_finish))
114  {
115  cur_finish++;
116  }
 // Step back to the last position that belongs to the stretched gap.
117  cur_finish--;
 // The new gap spans [prev_finish, cur_finish]; prev_finish may lie
 // before `pos` when the preceding segment ended in held-back 'N's.
118  int length = cur_finish - prev_finish + 1;
119  modified |= (length != delta_len);
120  if (length > 0)
121  {
 // Copy the original gap segment and overwrite only its length.
122  CRef<CDelta_seq> new_seg(new CDelta_seq());
123  new_seg->Assign(seg);
124  new_seg->SetLiteral().SetLength(length);
125  new_inst->SetExt().SetDelta().Set().push_back(new_seg);
126  }
127  prev_finish = cur_finish + 1;
128  }
 // Case 2: gap that IsSkipGap() excludes -> keep it exactly as it is.
129  else if (sv.IsInGap (pos) && IsSkipGap(seg))
130  {
 // Residues in [prev_finish, pos) that were held back by the previous
 // segment are written out first; AppendLiteral may merge them into
 // the preceding IUPAC literal.
131  if (pos > prev_finish)
132  {
133  string iupac;
134  sv.GetSeqData(prev_finish, pos, iupac);
135  AppendLiteral(new_inst->SetExt().SetDelta(), iupac);
136  }
137  CRef<CDelta_seq> new_seg(new CDelta_seq());
138  new_seg->Assign(seg);
139  new_inst->SetExt().SetDelta().Set().push_back(new_seg);
140  prev_finish = pos + delta_len;
141  }
 // Case 3: segment that does not start inside a gap.
142  else
143  {
 // Walk backward from the segment's last position over trailing 'N'
 // residues, never going below prev_finish.
144  int cur_finish = pos + delta_len - 1;
145  while (cur_finish >= prev_finish && sv[cur_finish] == 'N' && !sv.IsInGap(cur_finish))
146  {
147  cur_finish--;
148  }
 // cur_finish becomes one past the last residue that is kept.
149  cur_finish++;
150  if (cur_finish > prev_finish)
151  {
 // Emit [prev_finish, cur_finish) as a new literal.
152  string iupac;
153  sv.GetSeqData(prev_finish, cur_finish, iupac);
154  new_inst->SetExt().SetDelta().AddLiteral(iupac, CSeq_inst::eMol_na, false);
155  }
 // The trailing 'N's stay pending for the next segment to claim.
156  prev_finish = cur_finish;
157  }
158  pos += delta_len;
159  }
 // NOTE(review): nothing after the loop writes out [prev_finish, len); if
 // the final segment ends in 'N' residues they do not appear to be copied
 // into new_inst -- confirm this is intended.
160 
 // Any exception discards the rebuilt instance by clearing `modified`.
 // NOTE(review): both handlers catch by value rather than by const reference.
161  } catch (CException )
162  {
163  modified = false;
164  }
165  catch (std::exception )
166  {
167  modified = false;
168  }
169 
170  if (modified)
171  {
 // NOTE(review): `cmd` is not declared in the lines visible here (listing
 // line 172 is absent from this view); presumably the command that
 // installs new_inst on `bsh` -- confirm.
173  composite->AddCommand(*cmd);
174  }
175  }
176 
177  return composite;
178 }
179 
180 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
void AddCommand(IEditCommand &command)
CDelta_seq –.
Definition: Delta_seq.hpp:66
static CRef< CCmdComposite > apply(objects::CSeq_entry_Handle tse)
Definition: expand_gaps.cpp:64
static void AppendLiteral(objects::CDelta_ext &delta, const string &iupac)
Definition: expand_gaps.cpp:48
static bool IsSkipGap(const objects::CDelta_seq &seg)
Definition: expand_gaps.cpp:43
CSeqVector –.
Definition: seq_vector.hpp:65
USING_SCOPE(ncbi::objects)
static CS_COMMAND * cmd
Definition: ct_dynamic.c:26
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
TSeqPos GetBioseqLength(void) const
bool IsSetInst(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
const TInst & GetInst(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
const TLiteral & GetLiteral(void) const
Get the variant data.
Definition: Delta_seq_.cpp:124
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
TLength GetLength(void) const
Get the Length member data.
bool IsSetFuzz(void) const
could be unsure Check if a value has been assigned to Fuzz data member.
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
bool IsLiteral(void) const
Check if variant Literal is selected.
Definition: Delta_seq_.hpp:263
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
int len
Int4 delta(size_t dimension_, const Int4 *score_)
Modified on Tue Apr 16 20:08:40 2024 by modify_doxy.py rev. 669887