NCBI C++ ToolKit
Bioseq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: Bioseq.cpp 90544 2020-06-26 12:56:10Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using specifications from the ASN data definition file
34  * 'seq.asn'.
35  *
36  */
37 
38 // standard includes
39 #include <ncbi_pch.hpp>
40 #include <corelib/ncbiutil.hpp>
41 #include <serial/enumvalues.hpp>
42 #include <serial/typeinfo.hpp>
43 
44 // generated includes
46 
47 #include <objects/seq/Bioseq.hpp>
50 #include <objects/seq/IUPACna.hpp>
51 #include <objects/seq/NCBI4na.hpp>
52 #include <objects/seq/NCBI8na.hpp>
54 #include <objects/seq/Seq_data.hpp>
56 #include <objects/seq/Seq_ext.hpp>
57 #include <objects/seq/Seq_inst.hpp>
58 #include <objects/seq/Seq_hist.hpp>
59 
66 
73 
74 
75 #define NCBI_USE_ERRCODE_X Objects_Bioseq
76 
77 // generated classes
78 
80 
81 BEGIN_objects_SCOPE // namespace ncbi::objects::
82 
83 // destructor
85 {
86 }
87 
88 void CBioseq::UserOp_Assign(const CSerialUserOp& /*source*/)
89 {
90 }
91 
92 bool CBioseq::UserOp_Equals(const CSerialUserOp& /*object*/) const
93 {
94  return true;
95 }
96 
97 
99 
101 {
102  switch ( loc.Which() ) {
104  {
105  // extract each range, create and add simple location
107  CRef<CDelta_seq> dseq(new CDelta_seq);
108  dseq->SetLoc().SetInt().Assign(**ii);
109  ext.Set().push_back(dseq);
110  }
111  break;
112  }
114  {
115  // extract each point
117  loc.GetPacked_pnt().GetPoints() ) {
118  CRef<CSeq_loc> pnt_loc(new CSeq_loc);
119  pnt_loc->SetPnt().SetId().Assign(loc.GetPacked_pnt().GetId());
120  pnt_loc->SetPnt().SetPoint(*pi);
121  if ( loc.GetPacked_pnt().IsSetStrand() ) {
122  pnt_loc->SetPnt().SetStrand(
123  loc.GetPacked_pnt().GetStrand());
124  }
125  CRef<CDelta_seq> dseq(new CDelta_seq);
126  dseq->SetLoc(*pnt_loc);
127  ext.Set().push_back(CRef<CDelta_seq>(dseq));
128  }
129  }
130  case CSeq_loc::e_Mix:
131  {
132  // extract sub-locations
133  ITERATE ( CSeq_loc_mix::Tdata, li, loc.GetMix().Get() ) {
134  x_SeqLoc_To_DeltaExt(**li, ext);
135  }
136  return;
137  }
138  default:
139  {
140  // Just add the location
141  CDelta_seq* dseq = new CDelta_seq;
142  CSeq_loc* cp_loc = new CSeq_loc;
143  SerialAssign<CSeq_loc>(*cp_loc, loc);
144  dseq->SetLoc(*cp_loc);
145  ext.Set().push_back(CRef<CDelta_seq>(dseq));
146  }
147  }
148 }
149 
150 
/// Construct a Bioseq from a location.
///
/// The new Bioseq receives a single local string id: str_id when it is
/// non-empty, otherwise "constructed" followed by the current value of
/// sm_ConstructedId (which is then incremented). The location is converted
/// into a delta extension on the instance by x_SeqLoc_To_DeltaExt().
///
/// @param loc     location whose pieces populate the delta extension
/// @param str_id  text for the local id; may be empty (see above)
151 CBioseq::CBioseq(const CSeq_loc& loc, const string& str_id)
152  : m_ParentEntry(0)
153 {
154  CBioseq::TId& id_list = SetId();
155 
156  // Id: exactly one local string id is added
157  CSeq_id* id = new CSeq_id;
158  if ( str_id.empty() ) {
// No id text supplied: generate one from the shared counter.
159  id->SetLocal().SetStr("constructed" + NStr::IntToString(sm_ConstructedId++));
160  }
161  else {
162  id->SetLocal().SetStr(str_id);
163  }
// The CRef stored in the list takes over ownership of the raw pointer.
164  id_list.push_back(CRef<CSeq_id>(id));
165 
166  // Inst
167  CSeq_inst& inst = SetInst();
// NOTE(review): listing lines 168-169 are not visible here; presumably they
// set the instance's repr and mol -- confirm against the repository.
170 
// Select the delta variant of the extension and fill it from loc.
171  CDelta_ext& ext = inst.SetExt().SetDelta();
172  x_SeqLoc_To_DeltaExt(loc, ext);
173 }
174 
175 
176 /// Determine the tax-id for this bioseq
178 {
179  /// A taxid can be found either in a source descriptor (the newer form) or in a
180  /// org descriptor. If both are there, the source descriptor should have precedence.
181  TTaxId taxid_from_source = ZERO_TAX_ID,
182  taxid_from_org = ZERO_TAX_ID;
183 
184  if (IsSetDescr()) {
185  ITERATE (TDescr::Tdata, it, GetDescr().Get()) {
186  const CSeqdesc& desc = **it;
187  if (desc.IsOrg()) {
188  taxid_from_org = desc.GetOrg().GetTaxId();
189  } else if (desc.IsSource() && desc.GetSource().IsSetOrg()) {
190  taxid_from_source = desc.GetSource().GetOrg().GetTaxId();
191  }
192  if (taxid_from_source != ZERO_TAX_ID) {
193  break;
194  }
195  }
196  }
197 
198  return taxid_from_source != ZERO_TAX_ID ? taxid_from_source : taxid_from_org;
199 }
200 
201 
202 void CBioseq::GetLabel(string* label, ELabelType type, bool worst) const
203 {
204  if (!label) {
205  return;
206  }
207 
208  if (type != eType && !GetId().empty()) {
209  const CSeq_id* id = 0;
210  CSeq_id worst_id;
211  if (!worst) {
212  id = GetId().begin()->GetPointer();
213  ITERATE (CBioseq::TId, id_itr, GetId()) {
214  const CSeq_id& sid = **id_itr;
215  switch (sid.Which()) {
216  case CSeq_id::e_Other:
217  case CSeq_id::e_Genbank:
218  case CSeq_id::e_Embl:
219  case CSeq_id::e_Ddbj:
220  {
221  const CTextseq_id& tsid = *sid.GetTextseq_Id ();
222  if (tsid.IsSetAccession()) {
223  id = &sid;
224  }
225  }
226  break;
227  default:
228  break;
229  }
230  }
231  } else {
232  const CSeq_id* wid =
233  FindBestChoice(GetId(), CSeq_id::WorstRank).GetPointer();
234  if (wid) {
235  worst_id.Assign(*wid);
236  CTextseq_id* tid =
237  const_cast<CTextseq_id*>(worst_id.GetTextseq_Id());
238  if (tid) {
239  tid->ResetName();
240  }
241  id = &worst_id;
242  }
243  }
244  CNcbiOstrstream os;
245  if (id) {
246  id->WriteAsFasta(os);
247  string s = CNcbiOstrstreamToString(os);
248  (*label) += s;
249  }
250  }
251 
252  if (type == eContent) {
253  return;
254  }
255 
256  if (!label->empty()) {
257  (*label) += ": ";
258  }
259 
260  const CEnumeratedTypeValues* tv;
261  tv = CSeq_inst::GetTypeInfo_enum_ERepr();
262  (*label) += tv->FindName(GetInst().GetRepr(), true) + ",";
263  tv = CSeq_inst::GetTypeInfo_enum_EMol();
264  (*label) += tv->FindName(GetInst().GetMol(), true);
265  if (GetInst().IsSetLength()) {
266  (*label) += string(" len=") + NStr::IntToString(GetInst().GetLength());
267  }
268 }
269 
270 
272 {
273  // If no ids for Bioseq, return 0 -- should not happen
274  if (GetId().empty()) {
275  return 0;
276  }
277 
278  return *GetId().begin();
279 }
280 
281 static int s_BestNonLocalRank(const CRef<CSeq_id>& id)
282 {
283  if (id.Empty()) {
284  return kMax_Int;
285  } else if (id->IsLocal()) {
286  return kMax_Int - 1;
287  } else {
288  return id->BestRankScore();
289  }
290 }
291 
/// Find a non-local ID if present, consulting the assembly alignments in the
/// instance's history when the selected id is local.
///
/// Returns NULL when no id is available, when the history or its assembly
/// cannot be read, or when no assembly alignment pairs the local id with a
/// non-local one.
///
/// NOTE(review): the signature line and the declaration of `id` (listing
/// lines 292 and 294) are not visible here; `id` is presumably the result of
/// FindBestChoice(GetId(), s_BestNonLocalRank) -- confirm.
293 {
295  if (id.Empty()) {
296  return NULL; // No way to verify potential IDs found elsewhere
297  } else if ( !id->IsLocal() ) {
// The selected id is already non-local: return it directly.
298  return &*id;
299  }
300 
// Only a local id is available; look for a counterpart in the assembly.
301  const CSeq_inst& inst = GetInst();
302  if ( !inst.CanGetHist() || !inst.GetHist().CanGetAssembly() ) {
303  return NULL;
304  }
305 
// NOTE(review): the loop header (listing line 306) is not visible here;
// `it` presumably iterates over inst.GetHist().GetAssembly() -- confirm.
307  try {
// Only alignments with exactly two rows are considered.
308  if ((*it)->CheckNumRows() != 2) {
309  continue;
310  }
311  } catch (CSeqalignException&) { // fails basic validation; ignore
312  continue;
313  }
314 
// Return the non-local row's id when the other row is local and matches
// our local id.
315  const CSeq_id& id1 = (*it)->GetSeq_id(0);
316  const CSeq_id& id2 = (*it)->GetSeq_id(1);
317  if (id1.IsLocal() && id1.Match(*id) && !id2.IsLocal()) {
318  return &id2;
319  } else if (id2.IsLocal() && id2.Match(*id) && !id1.IsLocal()) {
320  return &id1;
321  }
322  }
323 
324  return NULL;
325 }
326 
327 
328 static int s_BestLocalRank(const CRef<CSeq_id>& id)
329 {
330  if (id.Empty() || !id->IsLocal()) {
331  return kMax_Int;
332  }
333  return id->BestRankScore();
334 }
335 
/// Find a local ID if present.
///
/// Returns a pointer to the selected id when it is non-empty and local,
/// NULL otherwise.
///
/// NOTE(review): the signature line and the declaration of `id` (listing
/// lines 336 and 338) are not visible here; `id` is presumably the result of
/// FindBestChoice(GetId(), s_BestLocalRank) -- confirm.
337 {
339  if (id.NotEmpty() && id->IsLocal()) {
340  return &*id;
341  }
342  return NULL;
343 }
344 
345 bool CBioseq::IsNa(void) const
346 {
347  return GetInst ().IsNa ();
348 }
349 
350 bool CBioseq::IsAa(void) const
351 {
352  return GetInst ().IsAa ();
353 }
354 
355 bool CBioseq::IsSetLength(void) const
356 {
357  return GetInst ().IsSetLength ();
358 }
359 
361 {
362  return GetInst ().GetLength ();
363 }
364 
/// Try to repack the raw sequence data of this Bioseq as a delta extension.
///
/// Nothing is done when the instance is a protein, has no Seq-data, or
/// already has an extension. Otherwise the raw bytes of the current encoding
/// are handed to CDelta_ext::AddAndSplit(); the result is kept (and the
/// Seq-data dropped) only if it consists of more than one segment, else the
/// freshly created extension is removed again.
///
/// @param gaps_ok  forwarded unchanged to CDelta_ext::AddAndSplit()
365 void CBioseq::PackAsDeltaSeq(bool gaps_ok)
366 {
367  CSeq_inst& inst = SetInst();
// Bail out unless there is raw nucleotide data and no extension yet.
368  if (inst.IsAa() || !inst.IsSetSeq_data() || inst.IsSetExt()) {
369  return;
370  }
371  const CSeq_data& data = inst.GetSeq_data();
372  CTempString src;
373  switch (data.Which()) {
// NOTE(review): the case label for this return (listing line 374) is not
// visible here; presumably CSeq_data::e_Ncbi2na -- confirm.
375  return; // optimal as is
// CODING_CASE(x) points src at the raw bytes of encoding x.
// NOTE(review): indexing [0] assumes the container is non-empty -- confirm.
376 #define CODING_CASE(x) \
377  case CSeq_data::e_##x: \
378  src.assign(&data.Get##x().Get()[0], data.Get##x().Get().size()); \
379  break;
380  CODING_CASE(Iupacna)
381  CODING_CASE(Iupacaa)
382  CODING_CASE(Ncbi4na)
383  CODING_CASE(Ncbi8na)
384  CODING_CASE(Ncbi8aa)
385  CODING_CASE(Ncbieaa)
386  CODING_CASE(Ncbistdaa)
387 #undef CODING_CASE
388  default:
// Any other encoding is reported with a warning and left untouched.
389  ERR_POST_X(1, Warning << "PackAsDeltaSeq: unsupported encoding "
390  << CSeq_data::SelectionName(data.Which()));
391  return;
392  }
393 
// Build the delta extension from the selected bytes.
394  CDelta_ext& ext = inst.SetExt().SetDelta();
395  ext.AddAndSplit(src, data.Which(), inst.GetLength(), gaps_ok);
396  if (ext.Get().size() > 1) { // finalize
// NOTE(review): listing line 397 is not visible here; presumably it switches
// the representation to delta -- confirm.
398  inst.ResetSeq_data();
399  } else { // roll back
// A single segment gains nothing: drop the extension and keep the Seq-data.
400  inst.ResetExt();
401  }
402 }
403 
404 
405 END_objects_SCOPE // namespace ncbi::objects::
406 
408 
409 #undef NCBI_USE_ERRCODE_X
410 
411 /* Original file checksum: lines: 61, chars: 1871, CRC32: 1d5d7d05 */
#define CODING_CASE(x)
static int s_BestLocalRank(const CRef< CSeq_id > &id)
Definition: Bioseq.cpp:328
static int s_BestNonLocalRank(const CRef< CSeq_id > &id)
Definition: Bioseq.cpp:281
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
const CSeq_id * GetNonLocalId() const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
Definition: Bioseq.cpp:292
virtual bool UserOp_Equals(const CSerialUserOp &object) const
Will be called after comparing the datatool-generated members.
Definition: Bioseq.cpp:92
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
ELabelType
Definition: Bioseq.hpp:101
@ eContent
Definition: Bioseq.hpp:103
@ eType
Definition: Bioseq.hpp:102
static void x_SeqLoc_To_DeltaExt(const CSeq_loc &loc, CDelta_ext &ext)
Definition: Bioseq.cpp:100
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
void GetLabel(string *label, ELabelType type, bool worst=false) const
Definition: Bioseq.cpp:202
bool IsSetLength(void) const
Definition: Bioseq.cpp:355
const CSeq_id * GetLocalId() const
Find a local ID if present.
Definition: Bioseq.cpp:336
TTaxId GetTaxId() const
Determine the tax-id for this bioseq.
Definition: Bioseq.cpp:177
virtual void UserOp_Assign(const CSerialUserOp &source)
Will be called after copying the datatool-generated members.
Definition: Bioseq.cpp:88
void PackAsDeltaSeq(bool gaps_ok=false)
Convert a raw nucleotide sequence with occasional ambiguities or gaps into a tighter (but somewhat mo...
Definition: Bioseq.cpp:365
static int sm_ConstructedId
Definition: Bioseq.hpp:151
~CBioseq(void)
Definition: Bioseq.cpp:84
bool IsNa(void) const
Definition: Bioseq.cpp:345
bool IsAa(void) const
Definition: Bioseq.cpp:350
CBioseq(void)
Definition: Bioseq.hpp:162
void AddAndSplit(const CTempString &src, CSeq_data::E_Choice format, TSeqPos length, bool gaps_ok=false, bool allow_packing=true)
add a chunk of sequence, splitting it as necessary for the sake of compactness (isolating ambiguous p...
Definition: Delta_ext.cpp:183
CDelta_seq –.
Definition: Delta_seq.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
CRef –.
Definition: ncbiobj.hpp:618
static bool IsAa(EMol mol)
Definition: Seq_inst.hpp:99
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
Base class for user-defined serializable classes to allow for objects assignment and comparison.
Definition: serialbase.hpp:319
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
Definition: enumerated.cpp:146
const float pi
Definition: math.hpp:54
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
static int WorstRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:744
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
static const char label[]
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
list< CRef< CSeq_interval > > Tdata
const Tdata & Get(void) const
Get the member data.
TStrand GetStrand(void) const
Get the Strand member data.
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
list< CRef< CSeq_loc > > Tdata
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
const TId & GetId(void) const
Get the Id member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
const Tdata & Get(void) const
Get the member data.
const TPacked_pnt & GetPacked_pnt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:260
vector< TSeqPos > TPoints
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
void ResetName(void)
Reset Name data member.
Definition: Textseq_id_.cpp:50
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
const TPoints & GetPoints(void) const
Get the Points member data.
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
list< CRef< CSeq_align > > TAssembly
Definition: Seq_hist_.hpp:248
bool IsSetSeq_data(void) const
the sequence Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TLoc & SetLoc(void)
Select the variant.
Definition: Delta_seq_.cpp:108
bool IsOrg(void) const
Check if variant Org is selected.
Definition: Seqdesc_.hpp:1046
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
Tdata & Set(void)
Assign a value to data member.
Definition: Delta_ext_.hpp:170
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
const TAssembly & GetAssembly(void) const
Get the Assembly member data.
Definition: Seq_hist_.hpp:512
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
Definition: Seq_data_.cpp:156
TInst & SetInst(void)
Assign a value to Inst data member.
Definition: Bioseq_.hpp:345
bool IsSetLength(void) const
length of sequence in residues Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool CanGetHist(void) const
Check if it is safe to call GetHist method.
Definition: Seq_inst_.hpp:853
const THist & GetHist(void) const
Get the Hist member data.
Definition: Seq_inst_.hpp:859
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
void ResetExt(void)
Reset Ext data member.
Definition: Seq_inst_.cpp:142
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
void ResetSeq_data(void)
Reset Seq_data data member.
Definition: Seq_inst_.cpp:125
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
bool CanGetAssembly(void) const
Check if it is safe to call GetAssembly method.
Definition: Seq_hist_.hpp:506
@ eRepr_const
constructed sequence
Definition: Seq_inst_.hpp:96
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
Definition of all error codes used in objects libraries.
const TYPE & Get(const CNamedParameterList *param)
constexpr bool empty(list< Ts... >) noexcept
Useful/utility classes and methods.
Definition: type.c:6
Modified on Wed Nov 29 02:18:31 2023 by modify_doxy.py rev. 669887