NCBI C++ ToolKit
gff3_import_data.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gff3_import_data.cpp 86362 2019-05-02 15:04:11Z ludwigf $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description: Import data holder for GFF3 feature records; builds a CSeq_feat from a record's columns and attributes
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
36 #include <objects/seq/so_map.hpp>
42 
43 #include "feat_util.hpp"
44 #include "gff_util.hpp"
46 #include "gff3_import_data.hpp"
47 
48 #include <assert.h>
49 
52 
53 // ============================================================================
// Constructor.
// Hands the id resolver and the message handler straight to the
// CFeatImportData base class; the body itself does nothing else.
//   idResolver     forwarded to CFeatImportData
//   errorReporter  forwarded to CFeatImportData
55  const CIdResolver& idResolver,
56  CImportMessageHandler& errorReporter):
57 // ============================================================================
58  CFeatImportData(idResolver, errorReporter)
59 {
60 }
61 
62 // ============================================================================
// Copy-style constructor.
// Passes rhs to the CFeatImportData base constructor; no member of this
// class is set explicitly in the body.
64  const CGff3ImportData& rhs):
65 // ============================================================================
66  CFeatImportData(rhs)
67 {
68 }
69 
70 // ============================================================================
// Populates this object from the fields of one feature record.
//   seqId         sequence id text; converted to a CSeq_id through mIdResolver
//   source        copied into mSource
//   featureType   type name handed to CSoMap::SoTypeToFeature to set up the
//                 feature data
//   seqStart,
//   seqStop,
//   seqStrand     used to build the interval that becomes the feature location
//   scoreIsValid  when true, score is stored in mpScore
//   score         only read when scoreIsValid is true
//   phase         "." means no phase; any other value is converted with
//                 GffUtil::PhaseToFrame
//   attributes    key/value pairs; not referenced by any line visible below
//                 NOTE(review): presumably passed to xInitializeAttributes on a
//                 line missing from this listing (numbering skips 104) - confirm
71 void
73  const std::string& seqId,
74  const std::string& source,
75  const std::string& featureType,
76  TSeqPos seqStart,
77  TSeqPos seqStop,
78  bool scoreIsValid, double score,
79  ENa_strand seqStrand,
80  const string& phase,
81  const vector<pair<string, string>>& attributes)
82 // ============================================================================
83 {
84 
 // Start from a fresh feature object; any previously held feature is dropped.
85  mpFeat.Reset(new CSeq_feat);
86  CSoMap::SoTypeToFeature(featureType, *mpFeat, true);
87 
 // NOTE(review): pId is dereferenced without a null check - confirm that
 // mIdResolver never hands back an empty CRef.
88  CRef<CSeq_id> pId = mIdResolver(seqId);
89  mpFeat->SetLocation().SetInt().Assign(
90  CSeq_interval(*pId, seqStart, seqStop, seqStrand));
91 
92  mSource = source;
 // mpScore is only assigned when a score is present; it is left untouched
 // otherwise.
93  if (scoreIsValid) {
94  mpScore.reset(new double(score));
95  }
96 
 // NOTE(review): the condition that opens the outer block closed on line 102
 // is not visible in this listing (numbering skips 97).
98  if (phase != ".") {
99  mpFeat->SetData().SetCdregion().SetFrame(
100  GffUtil::PhaseToFrame(phase));
101  }
102  }
103 
105 }
106 
107 
108 // ============================================================================
// Walks the attribute key/value pairs and offers each one to the specialised
// xInitialize* handlers in turn. A handler that returns true has consumed the
// attribute and the loop moves on to the next pair.
//   attributes  key/value pairs to process, in order
109 void
111  const vector<pair<string, string>>& attributes)
112 // ============================================================================
113 {
 // Keys in this list are skipped before any handler sees them.
114  vector<string> alwaysIgnored = {
115  "gbkey"
116  };
117 
118  // typically attributes that have meaning for some feature types only and
119  // should not be present for others:
 // Keys in this list are skipped only after every handler has declined them.
120  vector<string> sometimesIgnored = {
121  "locus_tag"
122  };
123 
 // NOTE(review): the loop variable and key/value are taken by value, so each
 // pair and both strings are copied per iteration.
124  for (auto keyValuePair: attributes) {
125  auto key = keyValuePair.first;
126  auto value = keyValuePair.second;
127 
128  auto itAI = find(alwaysIgnored.begin(), alwaysIgnored.end(), key);
129  if (itAI != alwaysIgnored.end()) {
130  continue;
131  }
132 
 // ID and Parent are recorded in mId / mParent. The "continue" statements are
 // commented out, so both keys still fall through to the handlers below.
133  if (key == "ID") {
134  mId = value;
135  //continue;
136  }
137  if (key == "Parent") {
138  mParent = value;
139  //continue;
140  }
 // Handler chain: Dbxref, comment, gene data, RNA data, CDS data.
141  if (xInitializeDbxref(key, value)) {
142  continue;
143  }
144  if (xInitializeComment(key, value)) {
145  continue;
146  }
147  if (xInitializeDataGene(key, value)) {
148  continue;
149  }
150  if (xInitializeDataRna(key, value)) {
151  continue;
152  }
153  if (xInitializeDataCds(key, value)) {
154  continue;
155  }
 // NOTE(review): the condition guarding the next "continue" is not visible in
 // this listing (numbering skips 156); presumably xInitializeMultiValue -
 // confirm against the real file.
157  continue;
158  }
159 
160  auto itCI = find(sometimesIgnored.begin(), sometimesIgnored.end(), key);
161  if (itCI != sometimesIgnored.end()) {
162  continue;
163  }
 // NOTE(review): whatever happens to an attribute nobody recognised is on a
 // line missing from this listing (numbering skips 164).
165  }
166 }
167 
168 
169 // ============================================================================
// Handles the "Dbxref" attribute.
// The value is split at commas; every piece is split once more at ":" and the
// two halves are written into the db and tag string of a new CDbtag, which is
// appended to the feature's dbxref list.
//   key    attribute name
//   value  attribute value
// Returns true if key is "Dbxref" (attribute consumed), false otherwise.
170 bool
172  const std::string& key,
173  const std::string& value)
174 // ============================================================================
175 {
176  if (key != "Dbxref") {
177  return false;
178  }
179  vector<string> dbxRefs;
180  NStr::Split(value, ",", dbxRefs);
181  CRef<CDbtag> pDbtag;
182  for (auto dbxRef: dbxRefs) {
183  pDbtag.Reset(new CDbtag);
 // NOTE(review): db and tag are declared but never used.
184  string db, tag;
 // The result of SplitInTwo is ignored, so the CDbtag is appended whether or
 // not the split succeeded.
185  NStr::SplitInTwo(dbxRef, ":", pDbtag->SetDb(), pDbtag->SetTag().SetStr());
186  mpFeat->SetDbxref().push_back(pDbtag);
187  }
188  return true;
189 }
190 
191 
192 // ============================================================================
// Handles the "Note" attribute.
// The value is URL-decoded and stored as the feature's comment, replacing
// whatever comment was there before.
//   key    attribute name
//   value  attribute value (URL-encoded text)
// Returns true if key is "Note" (attribute consumed), false otherwise.
193 bool
195  const std::string& key,
196  const std::string& value)
197 // ============================================================================
198 {
199  if (key != "Note") {
200  return false;
201  }
202  auto normalizedValue = NStr::URLDecode(value);
203  mpFeat->SetComment() = normalizedValue;
204  return true;
205 }
206 
207 
208 // ============================================================================
// Handles attributes that apply to gene features only.
// Does nothing and returns false unless the feature data is a gene.
//   "gene"          -> locus
//   "locus_tag"     -> locus tag
//   "gene_synonym"  -> comma separated list, each entry appended to the synonyms
//   key    attribute name
//   value  attribute value
// Returns true if the attribute was one of the keys above, false otherwise.
209 bool
211  const std::string& key,
212  const std::string& value)
213 // ============================================================================
214 {
215  auto& data = mpFeat->SetData();
216  if (!data.IsGene()) {
217  return false;
218  }
219 
220  auto& geneRef = data.SetGene();
221  if (key == "gene") {
222  geneRef.SetLocus(value);
223  return true;
224  }
225  if (key == "locus_tag") {
226  geneRef.SetLocus_tag(value);
227  return true;
228  }
229  if (key == "gene_synonym") {
230  vector<string> synonyms;
231  NStr::Split(value, ",", synonyms);
 // Synonyms are appended; existing entries are kept.
232  for (auto synonym: synonyms) {
233  geneRef.SetSyn().push_back(synonym);
234  }
235  return true;
236  }
237  return false;
238 }
239 
240 // ============================================================================
// Handles attributes that apply to coding region features only.
// Does nothing and returns false unless the feature data is a Cdregion.
//   "transl_except"  -> comma separated list; each URL-decoded entry is turned
//                       into a code break and appended to the Cdregion
//   "transl_table"   -> value parsed as an integer and added as a genetic code
//                       id
//   key    attribute name
//   value  attribute value
// Returns true if the attribute was one of the keys above, false otherwise.
241 bool
243  const std::string& key,
244  const std::string& value)
245 // ============================================================================
246 {
247  auto& data = mpFeat->SetData();
248  if (!data.IsCdregion()) {
249  return false;
250  }
251  auto& cdsRef = data.SetCdregion();
252 
253  if (key == "transl_except") {
254  vector<string> codeBreaks;
255  NStr::Split(value, ",", codeBreaks);
256  for (auto codeBreakStr: codeBreaks) {
 // NOTE(review): the start of this statement (declaration of pCodeBreak and
 // the callee) is not visible in this listing (numbering skips 257);
 // presumably FeatUtil::MakeCodeBreak - confirm.
 // NOTE(review): GetId() is dereferenced without a null check - confirm the
 // location always carries exactly one id at this point.
258  *mpFeat->GetLocation().GetId(),
259  NStr::URLDecode(codeBreakStr));
 // Entries that did not yield a code break are silently skipped.
260  if (pCodeBreak) {
261  cdsRef.SetCode_break().push_back(pCodeBreak);
262  }
263  }
264  return true;
265  }
266 
267  if (key == "transl_table") {
 // NOTE(review): the declaration of pCe is not visible in this listing
 // (numbering skips 268).
269  pCe->SetId(NStr::StringToInt(value));
270  cdsRef.SetCode().Set().push_back(pCe);
271  return true;
272  }
273  return false;
274 }
275 
276 // ============================================================================
// Handles attributes that apply to RNA features only.
// Does nothing and returns false unless the feature data is an RNA.
//   "ncrna_class"  -> stored as the class of the RNA's "gen" extension and also
//                     recorded as the "ncRNA_class" qualifier on the feature
//   key    attribute name
//   value  attribute value
// Returns true if key is "ncrna_class" on an RNA feature, false otherwise.
277 bool
279  const std::string& key,
280  const std::string& value)
281 // ============================================================================
282 {
283  auto& data = mpFeat->SetData();
284  if (!data.IsRna()) {
285  return false;
286  }
287  auto& rnaRef = data.SetRna();
288 
289  if (key == "ncrna_class") {
290  rnaRef.SetExt().SetGen().SetClass(value);
 // Replaces an existing "ncRNA_class" qualifier instead of adding a second one.
291  mpFeat->AddOrReplaceQualifier("ncRNA_class", value);
292  return true;
293  }
294  return false;
295 }
296 
297 // ============================================================================
// Handles attributes whose value is a comma separated list and where every
// list entry becomes a qualifier of its own.
// Only the keys listed in multiValueAttrs are accepted. Each entry is
// URL-decoded and added as a qualifier named after the key.
//   key    attribute name
//   value  attribute value (comma separated, entries URL-encoded)
// Returns true if key is one of the accepted keys, false otherwise.
298 bool
300  const std::string& key,
301  const std::string& value)
302 // ============================================================================
303 {
304  vector<string> multiValueAttrs = {
305  "ec_number", "function", "go_process", "inference"
306  };
307  auto attrIt = find(multiValueAttrs.begin(), multiValueAttrs.end(), key);
308  if (attrIt == multiValueAttrs.end()) {
309  return false;
310  }
311  vector<string> allValues;
312  NStr::Split(value, ",", allValues);
 // AddQualifier appends, so repeated calls accumulate qualifiers.
313  for (auto singleValue: allValues) {
314  mpFeat->AddQualifier(key, NStr::URLDecode(singleValue));
315  }
316  return true;
317 }
318 
319 // ============================================================================
// Writes a textual dump of this object to the given stream.
// At present only the heading "CGff3ImportData:" and an empty line are
// written; no member data is emitted.
//   out  stream to write to
320 void
322  CNcbiOstream& out)
323 // ============================================================================
324 {
325  out << "CGff3ImportData:\n";
326  out << "\n";
327 }
328 
329 
330 // ============================================================================
// Returns mpFeat, the feature held by this object.
// NOTE(review): the signature lines are not visible in this listing
// (numbering skips 331-332).
333 // ============================================================================
334 {
335  return mpFeat;
336 }
337 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const struct attribute attributes[]
Definition: attributes.c:165
Definition: Dbtag.hpp:53
const CIdResolver & mIdResolver
CRef< CSeq_feat > mpFeat
bool xInitializeDataGene(const std::string &, const std::string &)
CRef< CSeq_feat > GetData() const
virtual void Serialize(CNcbiOstream &) override
void xInitializeAttributes(const std::vector< std::pair< std::string, std::string >> &)
void Initialize(const std::string &, const std::string &, const std::string &, TSeqPos, TSeqPos, bool, double, ENa_strand, const std::string &, const std::vector< std::pair< std::string, std::string >> &)
bool xInitializeDataRna(const std::string &, const std::string &)
CGff3ImportData(const CIdResolver &, CImportMessageHandler &)
bool xInitializeDataCds(const std::string &, const std::string &)
unique_ptr< double > mpScore
bool xInitializeComment(const std::string &, const std::string &)
bool xInitializeMultiValue(const std::string &, const std::string &)
bool xInitializeDbxref(const std::string &, const std::string &)
ESubtype GetSubtype(void) const
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void AddOrReplaceQualifier(const string &qual_name, const string &qual_val)
Add a qualifier to this feature, or replace the value for the first one if it already exists.
Definition: Seq_feat.cpp:299
void AddQualifier(const string &qual_name, const string &qual_val)
Add a qualifier to this feature.
Definition: Seq_feat.cpp:291
static bool SoTypeToFeature(const string &, CSeq_feat &, bool=false)
Definition: so_map.cpp:411
static CRef< CCode_break > MakeCodeBreak(const CSeq_id &, const std::string &)
Definition: feat_util.cpp:75
static CCdregion::TFrame PhaseToFrame(const std::string &)
Definition: gff_util.cpp:82
char value[7]
Definition: config.c:431
std::ofstream out("events_result.xml")
main entry point for tests
USING_SCOPE(objects)
USING_NCBI_SCOPE
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
string
Definition: cgiapp.hpp:687
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static string URLDecode(const CTempString str, EUrlDecode flag=eUrlDec_All)
URL-decode string.
Definition: ncbistr.cpp:6210
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
#define const
Definition: zconf.h:232
Modified on Wed Mar 27 11:17:48 2024 by modify_doxy.py rev. 669887