NCBI C++ ToolKit
agp_seq_entry.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: agp_seq_entry.cpp 93579 2021-05-01 20:54:52Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio, Michael Kornbluh
27  *
28  * File Description:
29  * Convert an AGP file into a vector of Seq-entries
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
36 #include <objects/seq/Seq_inst.hpp>
37 #include <objects/seq/Bioseq.hpp>
45 #include <objects/seq/Seq_data.hpp>
46 #include <objects/seq/Seq_ext.hpp>
47 #include <objects/seq/Seq_gap.hpp>
50 
51 #include <util/static_map.hpp>
52 
54 
56 
58  TFlags fFlags,
59  EAgpVersion agp_version,
60  CAgpErr* arg)
 // NOTE(review): the line that opens this constructor (57) is missing from this listing.
 // The initializer list forwards arg and agp_version to the CAgpReader base class
 // and stores fFlags in m_fFlags; the body is intentionally empty.
61  : CAgpReader( arg, agp_version ),
62  m_fFlags(fFlags)
63 { }
64 
65 // static
 // NOTE(review): the signature line (66) is missing from this listing; presumably this is
 // CAgpToSeqEntry::s_DefaultSeqIdFromStr(const std::string& str) - confirm against the header.
 //
 // Tries to build a CSeq_id directly from str. If the CSeq_id constructor throws
 // anything at all, the result of s_LocalSeqIdFromStr(str) is returned instead.
67 {
68  CRef<CSeq_id> seq_id;
69  try {
70  seq_id.Reset( new CSeq_id( str ) );
71  } catch(...) {
72  // couldn't create real seq-id. fall back on local seq-id
73  return s_LocalSeqIdFromStr(str);
74  }
75  return seq_id;
76 }
77 
78 // static
 // NOTE(review): the signature lines (79-80) are missing from this listing; presumably this is
 // CAgpToSeqEntry::s_LocalSeqIdFromStr(const std::string& str) - confirm against the header.
 //
 // Builds a local Seq-id from str. A leading lcl| prefix (matched case-insensitively)
 // is removed first. If the remainder converts to a strictly positive int, the id is
 // stored as an integer local id; otherwise it is stored as a string local id.
81 {
82  CTempString sLocalID(str);
83 
84  // (trim off the lcl|, if any)
85  CTempString sPrefixToRemove("lcl|");
86  if( NStr::StartsWith(sLocalID, sPrefixToRemove, NStr::eNocase) ) {
87  sLocalID = sLocalID.substr(sPrefixToRemove.length());
88  }
89 
90  CRef<CSeq_id> seq_id( new CSeq_id );
91 
92  // check if it's a number or string
 // NOTE(review): the flag arguments of this call (lines 94-96) are missing from this listing;
 // presumably they make a failed conversion return a non-positive value instead of throwing - confirm.
93  const int id_as_num = NStr::StringToInt(sLocalID,
 // Only values greater than zero take the integer form; zero and negative results
 // fall through to the string form.
97  if( id_as_num > 0 ) {
98  seq_id->SetLocal().SetId( id_as_num );
99  } else {
100  seq_id->SetLocal().SetStr( sLocalID );
101  }
102  return seq_id;
103 }
104 
106 {
 // NOTE(review): the signature line (105) is missing from this listing; presumably this is
 // CAgpToSeqEntry::OnGapOrComponent() - confirm against the header.
 //
 // Appends one delta-seq element describing the current row (m_this_row) to the
 // bioseq being built (m_bioseq), creating a fresh bioseq first when required.
 //
 // NOTE(review): the second half of this condition (line 108) and the first statement of
 // the branch (line 110) are missing from this listing; presumably the branch is also taken
 // when the object id changes, and it finishes the previous bioseq first - confirm.
107  if( ! m_bioseq ||
109  {
111 
112  // initialize new bioseq
 // The new Seq-inst is a delta representation of DNA and starts with length zero;
 // the length is accumulated row by row further down.
113  CRef<CSeq_inst> seq_inst( new CSeq_inst );
114  seq_inst->SetRepr(CSeq_inst::eRepr_delta);
115  seq_inst->SetMol(CSeq_inst::eMol_dna);
116  seq_inst->SetLength(0);
117 
118  m_bioseq.Reset( new CBioseq );
119  m_bioseq->SetInst(*seq_inst);
120 
 // NOTE(review): the argument of this push_back (line 122) is missing from this listing.
121  m_bioseq->SetId().push_back(
123  }
124 
 // From here on seq_inst refers to the Seq-inst owned by m_bioseq.
125  CRef<CSeq_inst> seq_inst( & m_bioseq->SetInst() );
126 
 // The new delta element is appended first and then filled in through the CRef.
127  CRef<CDelta_seq> delta_seq( new CDelta_seq );
128  seq_inst->SetExt().SetDelta().Set().push_back(delta_seq);
129 
130  if( m_this_row->is_gap ) {
 // Gap row: emit a literal of gap_length.
131  delta_seq->SetLiteral().SetLength(m_this_row->gap_length);
 // Component type U additionally gets a fuzz limit set on the literal.
132  if( m_this_row->component_type == 'U' ) {
133  delta_seq->SetLiteral().SetFuzz().SetLim();
134  }
 // Gap details are attached only when the fSetSeqGap flag is set.
135  if( m_fFlags & fSetSeqGap ) {
136  CSeq_data::TGap & gap_info =
137  delta_seq->SetLiteral().SetSeq_data().SetGap();
138  x_SetSeqGap(gap_info);
139  }
140  seq_inst->SetLength() += m_this_row->gap_length;
141  } else {
 // Component row: emit a Seq-loc interval on the component sequence.
142  CSeq_loc& loc = delta_seq->SetLoc();
143 
 // NOTE(review): the initializer of comp_id (line 145) is missing from this listing.
144  CRef<CSeq_id> comp_id =
146  loc.SetInt().SetId(*comp_id);
147 
 // Both endpoints are shifted down by one; presumably this converts 1-based AGP
 // coordinates to 0-based Seq-loc coordinates - confirm. The length added below
 // treats the component range as inclusive on both ends.
148  loc.SetInt().SetFrom( m_this_row->component_beg - 1 );
149  loc.SetInt().SetTo( m_this_row->component_end - 1 );
150  seq_inst->SetLength() += ( m_this_row->component_end - m_this_row->component_beg + 1 );
151 
 // NOTE(review): the four case labels (lines 153, 156, 159, 162) are missing from this
 // listing; presumably they are the plus, minus, unknown and irrelevant orientations,
 // in that order - confirm. Any other value throws runtime_error.
152  switch( m_this_row->orientation ) {
154  loc.SetInt().SetStrand( eNa_strand_plus );
155  break;
157  loc.SetInt().SetStrand( eNa_strand_minus );
158  break;
160  loc.SetInt().SetStrand( eNa_strand_unknown );
161  break;
163  loc.SetInt().SetStrand( eNa_strand_other );
164  break;
165  default:
166  throw runtime_error("unknown orientation " + NStr::IntToString(m_this_row->orientation));
167  }
168  }
169 }
170 
172 {
 // NOTE(review): the signature line (171) is missing from this listing; presumably this is
 // CAgpToSeqEntry::Finalize() - confirm against the header.
 //
 // Runs the base class finalization, then this class's own finalization step, and
 // returns the base class result unchanged.
173  // First, do real finalize
174  const int return_val = CAgpReader::Finalize();
175  // Then, our own finalization
 // NOTE(review): the statement that followed (line 176) is missing from this listing;
 // presumably it is a call to x_FinishedBioseq() - confirm.
177 
178  return return_val;
179 }
180 
182 {
 // NOTE(review): the signature line (181) is missing from this listing; presumably this is
 // CAgpToSeqEntry::x_FinishedBioseq() - confirm against the header.
 //
 // If a bioseq is currently being built, wraps it in a new CSeq_entry, appends that
 // entry to m_entries, and clears m_bioseq. Does nothing when m_bioseq is empty.
183  if( m_bioseq ) {
184  CRef<CSeq_entry> entry( new CSeq_entry );
185  entry->SetSeq(*m_bioseq);
186  m_entries.push_back( entry );
187 
 // Clearing the reference marks that no bioseq is in progress.
188  m_bioseq.Reset();
189  }
190 }
191 
194 {
 // NOTE(review): the signature lines (192-193) are missing from this listing; presumably this is
 // CAgpToSeqEntry::x_GetSeqIdFromStr(const std::string& str) - confirm against the header.
 //
 // Chooses the string-to-Seq-id conversion: s_LocalSeqIdFromStr when the fForceLocalId
 // flag is set in m_fFlags, s_DefaultSeqIdFromStr otherwise.
195  if( m_fFlags & fForceLocalId ) {
196  return s_LocalSeqIdFromStr(str);
197  } else {
198  return s_DefaultSeqIdFromStr(str);
199  }
200 }
201 
// Fills out_gap_info from the current row (m_this_row) in three steps: gap type,
// linkage, and linkage evidence. Values are translated through static lookup tables;
// a gap type or linkage evidence with no table entry is reported through
// NCBI_USER_THROW_FMT.
//
// NOTE(review): this listing is missing several lines of the function (typedefs, table
// rows, loop and switch headers); the gaps are flagged individually below.
202 void CAgpToSeqEntry::x_SetSeqGap( CSeq_gap & out_gap_info )
203 {
204  // convert the CAgpRow types to NCBI Objects types
205 
206  // The parent class should have verified that the gap-type,
207  // linkage, linkage-evidence combos are all consistent
208 
209  // gap type
210  {{
211  // conversion table
 // NOTE(review): the TGapTrans typedef (line 212), the table rows (lines 214-222) and the
 // TGapMap typedef (line 224) are missing from this listing.
213  static const TGapTrans sc_GapTrans[] = {
223  };
225  DEFINE_STATIC_ARRAY_MAP(TGapMap, sc_GapMap, sc_GapTrans);
226 
 // Look up the gap type of the row; the mapped value becomes the Seq-gap type.
227  TGapMap::const_iterator find_iter =
228  sc_GapMap.find(m_this_row->gap_type);
229 
230  if( find_iter == sc_GapMap.end() ) {
231  NCBI_USER_THROW_FMT("invalid gap type: "
232  << static_cast<int>(m_this_row->gap_type) );
233  } else {
234  out_gap_info.SetType( find_iter->second );
235  }
236  }}
237 
238  // gap linkage
239  {{
 // NOTE(review): both branches of this conditional expression (lines 241-242) are missing
 // from this listing; the value passed to SetLinkage depends on m_this_row->linkage.
240  out_gap_info.SetLinkage( m_this_row->linkage ?
243  }}
244 
245  // gap linkage-evidence
246  {{
 // NOTE(review): the condition that selects between the two branches below (line 247)
 // is missing from this listing.
248  {
249  // conversion table
 // NOTE(review): the TEvidTrans typedef (line 250), the table rows (lines 252-261) and the
 // TEvidMap typedef (line 263) are missing from this listing.
251  static const TEvidTrans sc_EvidTrans[] = {
262  };
264  DEFINE_STATIC_ARRAY_MAP(TEvidMap, sc_EvidMap, sc_EvidTrans);
265 
266  CSeq_gap::TLinkage_evidence & link_evid =
267  out_gap_info.SetLinkage_evidence();
268 
 // This branch expects at least one linkage evidence on the row.
269  _ASSERT( ! m_this_row->linkage_evidences.empty() );
 // NOTE(review): the loop header (lines 270-271) is missing from this listing; presumably
 // it iterates over m_this_row->linkage_evidences with iterator evid_it - confirm.
272  {
273  const CAgpRow::ELinkageEvidence eLinkageEvidence =
274  *evid_it;
275  TEvidMap::const_iterator find_iter =
276  sc_EvidMap.find(eLinkageEvidence);
277  if( find_iter == sc_EvidMap.end() ) {
278  NCBI_USER_THROW_FMT("Unknown linkage evidence: "
279  << static_cast<int>(eLinkageEvidence) );
280  }
281 
 // NOTE(review): the declaration of pEvid (line 282) is missing from this listing.
 // Each translated evidence is appended to the Seq-gap evidence list.
283  pEvid->SetType( find_iter->second );
284  link_evid.push_back( pEvid );
285  }
286  } else {
287  // check special values
 // NOTE(review): the switch header and its first case label (lines 288-290) and the
 // declaration of pEvid (line 292) are missing from this listing. The visible code
 // appends a single evidence of type unspecified in the first case.
291  {
293  pEvid->SetType( CLinkage_evidence::eType_unspecified );
294  out_gap_info.SetLinkage_evidence().push_back( pEvid );
295  }
296  break;
 // NOTE(review): the case label for this branch (line 297) is missing from this listing.
298  // no problem, just ignore
299  break;
300  default:
 // NOTE(review): the statement around this message (lines 301 and 303) is missing from
 // this listing; presumably it reports the unexpected flags value - confirm.
302  "Unknown or unexpected linkage_evidence_flags: "
304  break;
305  }
306  }
307  }}
308 }
309 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
USING_SCOPE(objects)
EAgpVersion
Definition: agp_util.hpp:55
Detects scaffolds, object boundaries, errors that involve 2 consecutive lines, and is intended as a s...
Definition: agp_util.hpp:327
virtual int Finalize()
This is called at the end of the file, usually automatically but can be called manually if the automa...
Definition: agp_util.cpp:1160
CRef< CAgpRow > m_prev_row
Definition: agp_util.hpp:415
CRef< CAgpRow > m_this_row
Definition: agp_util.hpp:416
@ eGapRepeat
Definition: agp_util.hpp:178
@ eGapShort_arm
Definition: agp_util.hpp:184
@ eGapFragment
Definition: agp_util.hpp:177
@ eGapHeterochromatin
Definition: agp_util.hpp:185
@ eGapClone
Definition: agp_util.hpp:176
@ eGapTelomere
Definition: agp_util.hpp:186
@ eGapCentromere
Definition: agp_util.hpp:183
@ eGapScaffold
Definition: agp_util.hpp:179
@ eGapContig
Definition: agp_util.hpp:182
ELinkageEvidence
Definition: agp_util.hpp:195
@ fLinkageEvidence_na
Definition: agp_util.hpp:211
@ fLinkageEvidence_strobe
Definition: agp_util.hpp:203
@ fLinkageEvidence_align_genus
Definition: agp_util.hpp:197
@ fLinkageEvidence_paired_ends
Definition: agp_util.hpp:196
@ fLinkageEvidence_align_xgenus
Definition: agp_util.hpp:198
@ fLinkageEvidence_align_trnscpt
Definition: agp_util.hpp:199
@ fLinkageEvidence_map
Definition: agp_util.hpp:202
@ fLinkageEvidence_pcr
Definition: agp_util.hpp:204
@ fLinkageEvidence_within_clone
Definition: agp_util.hpp:200
@ fLinkageEvidence_clone_contig
Definition: agp_util.hpp:201
@ fLinkageEvidence_proximity_ligation
Definition: agp_util.hpp:205
@ fLinkageEvidence_unspecified
Definition: agp_util.hpp:209
EGap gap_type
Definition: agp_util.hpp:192
string & GetComponentId()
Definition: agp_util.hpp:126
@ eOrientationUnknown
Definition: agp_util.hpp:168
@ eOrientationPlus
Definition: agp_util.hpp:166
@ eOrientationIrrelevant
Definition: agp_util.hpp:169
@ eOrientationMinus
Definition: agp_util.hpp:167
string & GetObject()
Definition: agp_util.hpp:120
int linkage_evidence_flags
a bit map which holds summary of info in linkage_evidences.
Definition: agp_util.hpp:220
char component_type
Definition: agp_util.hpp:154
TAgpPos component_beg
Definition: agp_util.hpp:158
EOrientation orientation
Definition: agp_util.hpp:171
TLinkageEvidenceVec linkage_evidences
Definition: agp_util.hpp:217
TAgpPos component_end
Definition: agp_util.hpp:158
TAgpLen gap_length
Definition: agp_util.hpp:173
bool linkage
Definition: agp_util.hpp:193
bool is_gap
Definition: agp_util.hpp:156
vector< ELinkageEvidence > TLinkageEvidenceVec
Might have duplicates, and is empty on error or if there are no actual linkage evidences (e....
Definition: agp_util.hpp:216
virtual void OnGapOrComponent(void)
Builds new part of delta-seq in current bioseq, or adds bioseq and starts building a new one.
void x_FinishedBioseq(void)
Our own finalization after parent's finalization.
const TFlags m_fFlags
static CRef< objects::CSeq_id > s_DefaultSeqIdFromStr(const std::string &str)
This is the default method used to turn strings into Seq-ids in AGP contexts.
vector< CRef< objects::CSeq_entry > > m_entries
Holds the results.
@ fSetSeqGap
Found gaps will be given Seq-data such as Type and Linkage.
@ fForceLocalId
All IDs will be treated as local IDs.
virtual CRef< objects::CSeq_id > x_GetSeqIdFromStr(const std::string &str)
If you must change exactly how strings are turned into Seq-ids, you can override this in a subclass.
CAgpToSeqEntry(TFlags fFlags=0, EAgpVersion agp_version=eAgpVersion_auto, CAgpErr *arg=nullptr)
After construction, you probably want to do something like call ReadStream and then GetResult.
virtual int Finalize(void)
Parent finalize plus making sure last m_bioseq is added.
static CRef< objects::CSeq_id > s_LocalSeqIdFromStr(const std::string &str)
Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)
void x_SetSeqGap(objects::CSeq_gap &out_gap_info)
Fills in out_gap_info based on current CAgpRow.
CRef< objects::CBioseq > m_bioseq
This is the bioseq currently being built.
CDelta_seq –.
Definition: Delta_seq.hpp:66
Definition: Seq_entry.hpp:56
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
@ fAllowTrailingSpaces
Ignore trailing space characters.
Definition: ncbistr.hpp:297
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fAllowLeadingSpaces
Ignore leading spaces in converted string.
Definition: ncbistr.hpp:294
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_other
Definition: Na_strand_.hpp:70
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
list< CRef< CLinkage_evidence > > TLinkage_evidence
Definition: Seq_gap_.hpp:118
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eType_clone
Deprecated. Used only for AGP 1.1.
Definition: Seq_gap_.hpp:91
@ eType_heterochromatin
Definition: Seq_gap_.hpp:93
@ eType_fragment
Deprecated. Used only for AGP 1.1.
Definition: Seq_gap_.hpp:90
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
static const char * str(char *buf, int n)
Definition: stats.c:84
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason for introducing this...
Definition: static_set.hpp:60
#define _ASSERT
Modified on Mon Mar 04 05:13:44 2024 by modify_doxy.py rev. 669887