NCBI C++ ToolKit
gff3_idgen.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gff3_idgen.cpp 94014 2021-06-15 13:02:48Z ludwigf $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Frank Ludwig
27  *
28  * File Description: Generate feature and source IDs for GFF3 output
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
37 #include <objects/seq/so_map.hpp>
39 
65 
66 #include <objmgr/feat_ci.hpp>
67 #include <objmgr/annot_ci.hpp>
68 #include <objmgr/align_ci.hpp>
69 #include <objmgr/seqdesc_ci.hpp>
70 #include <objmgr/mapped_feat.hpp>
71 #include <objmgr/util/feature.hpp>
72 #include <objmgr/util/sequence.hpp>
74 
78 
79 #include <array>
80 #include <sstream>
81 
84 
85 // ------------------------------------------------------------------------------
// Returns the GFF3 ID for feature `mf`.
// A non-empty "ID" qualifier on the feature is used as the starting point;
// otherwise the ID is derived by one of xGetIdForRna / xGetGenericId /
// xGetIdForGene / xGetIdForCds, selected by a switch on the feature subtype
// (the default branch additionally checks for an RNA feature type).
// Any non-empty result is passed through xDisambiguate and then recorded in
// mExistingIds before it is returned.
// NOTE(review): listing lines 86 and 88 (start of the signature and the
// parameter the body uses as `fc`) and 102 and 105 (presumably the `case`
// labels in front of the gene and CDS branches) are absent from this
// extracted listing -- confirm against the upstream file.
87  const CMappedFeat& mf,
89 // ------------------------------------------------------------------------------
90 {
91  auto id = mf.GetNamedQual("ID");
92  if (id.empty()) {
93  auto subType = mf.GetFeatSubtype();
94  switch(subType) {
95  default:
    // RNA features get the "rna-" scheme; everything else not handled
    // by a dedicated branch gets the generic scheme.
96  if (mf.GetFeatType() == CSeqFeatData::e_Rna) {
97  id = xGetIdForRna(mf, fc);
98  break;
99  }
100  id = xGetGenericId(mf, fc);
101  break;
103  id = xGetIdForGene(mf, fc);
104  break;
106  id = xGetIdForCds(mf,fc);
107  break;
108  }
109  }
110  if (!id.empty()) {
    // make the ID unique among those handed out so far, then remember it
111  id = xDisambiguate(id);
112  mExistingIds.emplace(id);
113  }
114  return id;
115 }
116 
117 // -----------------------------------------------------------------------------
// NOTE(review): listing line 118 (the signature; the index at the end of this
// page gives it as `std::string GetGffId()`) and line 121 (the only body
// statement) are absent from this extracted listing, so what this overload
// returns cannot be determined from here -- confirm against the upstream file.
119 // -----------------------------------------------------------------------------
120 {
122 }
123 
124 // -----------------------------------------------------------------------------
// Builds the ID used for the source record of bioseq `bsh`, in the form
// "<seq-id>:1..<bioseq length>".
// The <seq-id> part is the best ID reported by CGenbankIdResolve; it stays
// "unknown" whenever no ID can be resolved.
// NOTE(review): listing lines 125 (start of the signature), 132 (the
// statement defining `pId`) and 134 (the statement defining `idh`) are absent
// from this extracted listing -- confirm against the upstream file.
126  CBioseq_Handle bsh)
127 // -----------------------------------------------------------------------------
128 {
129  string locationId("unknown");
130
131  string bestId;
133  if (pId) {
135  if (CGenbankIdResolve::Get().GetBestId(idh, bsh.GetScope(), bestId)) {
136  locationId = bestId;
137  }
138  }
139  else {
140  auto ids = bsh.GetId();
141  if (!ids.empty()) {
142  auto id = ids.front();
    // Only replace the "unknown" default when resolution succeeded, exactly
    // as the branch above does; a failed lookup must not blank the ID.
143  const bool resolved = CGenbankIdResolve::Get().GetBestId(id, bsh.GetScope(), bestId);
144  if (resolved) { locationId = bestId; }
145  }
146  }
147
148  string seqStart = "1";//always for source
149  string seqStop = NStr::NumericToString(bsh.GetBioseqLength());
150
151  locationId += ":";
152  locationId += seqStart;
153  locationId += "..";
154  locationId += seqStop;
155  return locationId;
156 }
157 
158 // -----------------------------------------------------------------------------
// Clears the bookkeeping containers mExistingIds and mLastUsedExonIds.
// NOTE(review): listing line 159 (the signature) and line 164 (one further
// body statement -- presumably resetting another member) are absent from
// this extracted listing; confirm against the upstream file.
160 // -----------------------------------------------------------------------------
161 {
162  mExistingIds.clear();
163  mLastUsedExonIds.clear();
165 }
166 
167 // -----------------------------------------------------------------------------
// Returns the next exon ID for the RNA identified by `rnaId`, in the form
// "exon-<stem>-<n>". <stem> is `rnaId` with a leading "rna-" removed (or
// `rnaId` unchanged if it has no such prefix) and <n> is a per-rnaId counter
// kept in mLastUsedExonIds that starts at 1 and grows by one per call.
// NOTE(review): listing line 168 (start of the signature) is absent from this
// extracted listing -- confirm against the upstream file.
169  const string& rnaId)
170 // -----------------------------------------------------------------------------
171 {
    // map::operator[] value-initialises a missing counter to 0, so a single
    // pre-increment covers both the first and every later request.
172  const auto exonNumber = ++mLastUsedExonIds[rnaId];
179  string id("exon-");
180  auto suffix = string("-") + NStr::NumericToString(exonNumber);
181  if (NStr::StartsWith(rnaId, "rna-")) {
    // drop the 4-character "rna-" prefix
182  id += rnaId.substr(4) + suffix;
183  }
184  else {
185  id += rnaId + suffix;
186  }
187  return id;
188 }
189 
190 // -----------------------------------------------------------------------------
// Builds the ID for a gene feature: "gene-" followed by the value returned by
// xExtractGeneLocusTagOrLocus(mf) or, if that is empty, by
// xGetGenericSuffix(mf, fc).
// NOTE(review): listing lines 191 and 193 (start of the signature and the
// parameter the body uses as `fc`) are absent from this extracted listing.
192  const CMappedFeat& mf,
194 // -----------------------------------------------------------------------------
195 {
196  const string commonPrefix("gene-");
197
198  //try locus_tag or locus
199  auto stem = xExtractGeneLocusTagOrLocus(mf);
200  if (!stem.empty()) {
201  return (commonPrefix + stem);
202  }
203
204  //fall back: generic stem
205  return (commonPrefix + xGetGenericSuffix(mf, fc));
206 }
207 
208 // ----------------------------------------------------------------------------
// Builds the ID for an RNA feature: "rna-" followed by the first non-empty of
//   1. xExtractFarAccession(mf),
//   2. the feature's "orig_transcript_id" qualifier,
//   3. xExtractGeneLocusTagOrLocus() of the gene found for the feature
//      (GetBestGeneForMrna for mRNA subtypes, GetBestGeneForFeat otherwise,
//      both given fc.FeatTree()),
//   4. xGetGenericSuffix(mf, fc).
// NOTE(review): listing lines 209 and 211 (start of the signature and the
// parameter the body uses as `fc`) are absent from this extracted listing.
210  const CMappedFeat& mf,
212 // ----------------------------------------------------------------------------
213 {
214  const string commonPrefix("rna-");
215
216  //try to use far accession
217  auto farAccession = xExtractFarAccession(mf);
218  if (!farAccession.empty()) {
219  return (commonPrefix + farAccession);
220  }
221
222  //try to use orig_transcript_id
223  auto origTranscriptId = mf.GetNamedQual("orig_transcript_id");
224  if (!origTranscriptId.empty()) {
225  return (commonPrefix + origTranscriptId);
226  }
227
228  //try to inherit from gene
229  auto subtype = mf.GetFeatSubtype();
230  CMappedFeat gene;
231  if (subtype == CSeqFeatData::eSubtype_mRNA) {
232  gene = feature::GetBestGeneForMrna(mf, &fc.FeatTree());
233  }
234  else {
235  gene = feature::GetBestGeneForFeat(mf, &fc.FeatTree());
236  }
    // xExtractGeneLocusTagOrLocus returns "" for an unset feature, so a
    // failed gene lookup simply falls through to the generic suffix.
237  auto stem = xExtractGeneLocusTagOrLocus(gene);
238  if (!stem.empty()) {
239  return (commonPrefix + stem);
240  }
241
242  //fall back: generic suffix
243  return (commonPrefix + xGetGenericSuffix(mf, fc));
244 }
245 
246 // -----------------------------------------------------------------------------
// Builds the ID for a CDS feature: "cds-" followed by the first non-empty of
//   1. xExtractFarAccession(mf),
//   2. the feature's "orig_protein_id" qualifier,
//   3. xExtractGeneLocusTagOrLocus() of the gene returned by
//      feature::GetBestGeneForCds(mf, &fc.FeatTree()),
//   4. xGetGenericSuffix(mf, fc).
// NOTE(review): listing lines 247 and 249 (start of the signature and the
// parameter the body uses as `fc`) are absent from this extracted listing.
248  const CMappedFeat& mf,
250 // -----------------------------------------------------------------------------
251 {
252  const string commonPrefix("cds-");
253
254  //try far accession
255  auto farAccession = xExtractFarAccession(mf);
256  if (!farAccession.empty()) {
257  return (commonPrefix + farAccession);
258  }
259
260  //try orig_protein_id
    // GetNamedQual returns a const reference; bind to it instead of copying
261  const auto& origProteinId = mf.GetNamedQual("orig_protein_id");
262  if (!origProteinId.empty()) {
263  return (commonPrefix + origProteinId);
264  }
265
266  //try to inherit from gene
267  auto gene = feature::GetBestGeneForCds(mf, &fc.FeatTree());
268  auto stem = xExtractGeneLocusTagOrLocus(gene);
269  if (!stem.empty()) {
270  return (commonPrefix + stem);
271  }
272
273  //last resort: generic suffix
274  return (commonPrefix + xGetGenericSuffix(mf, fc));
275 }
276 
277 // -----------------------------------------------------------------------------
// Builds the ID for a feature without a dedicated scheme: "id-" followed by
// the value returned by xExtractGeneLocusTagOrLocus(mf) or, if that is empty,
// by xGetGenericSuffix(mf, fc). Inside a conditional block whose opening line
// is not visible here, a non-empty "number" qualifier is appended as
// "-<number>".
// NOTE(review): listing lines 278 and 280 (start of the signature and the
// parameter the body uses as `fc`) and line 298 (the statement that opens the
// block closed at listing line 303 -- presumably the "native exon" test the
// preceding comment refers to) are absent from this extracted listing.
279  const CMappedFeat& mf,
281 // -----------------------------------------------------------------------------
282 {
283  const string commonPrefix("id-");
284  string rawId;
285
286  //try to inherit from gene:
287  auto stem = xExtractGeneLocusTagOrLocus(mf);
288  if (!stem.empty()) {
289  rawId = commonPrefix + stem;
290  }
291
292  //fall back: generic suffix
293  if (rawId.empty()) {
294  rawId = commonPrefix + xGetGenericSuffix(mf, fc);
295  }
296
297  //for native exons: attach exon number if available
299  auto exonNumber = mf.GetNamedQual("number");
300  if (!exonNumber.empty()) {
301  rawId += string("-") + exonNumber;
302  }
303  }
304  return rawId;
305 }
306 
307 // ----------------------------------------------------------------------------
// Returns the fallback ID stem for `mf`: "GeneID:<tag>" taken from the first
// dbxref whose database name is "GeneID", or, when the feature has no such
// dbxref, the value returned by xExtractFeatureLocation(mf, fc).
// NOTE(review): listing lines 308 and 310 (start of the signature and the
// parameter the body uses as `fc`) are absent from this extracted listing.
309  const CMappedFeat& mf,
311 // ----------------------------------------------------------------------------
312 {
    // GetDbxref() returns a const reference; bind to it rather than copying
    // the whole container just to scan it.
313  const auto& dbxrefs = mf.GetDbxref();
314  for (const auto& ref: dbxrefs) {
315  if (ref->GetDb() == "GeneID") {
316  stringstream ostr;
317  ostr << "GeneID:";
318  ref->GetTag().AsString(ostr);
319  return ostr.str();
320  }
321  }
322  return xExtractFeatureLocation(mf, fc);
323 }
324 
325 // ----------------------------------------------------------------------------
// Returns the locus_tag of the gene associated with `mf`, or its locus when
// no locus_tag is set, or "" when neither is set.
// `mf` itself is used when its subtype is gene; otherwise the gene is looked
// up with feature::GetBestGeneForFeat(mf). An unset `mf`, or a lookup that
// yields no gene, also produces "".
// NOTE(review): listing line 326 (start of the signature) is absent from this
// extracted listing.
327  const CMappedFeat& mf)
328 // -----------------------------------------------------------------------------
329 {
330  if (!mf) {
331  return "";
332  }
333  auto gene = mf;
334  if (gene.GetFeatSubtype() != CSeqFeatData::eSubtype_gene) {
335  gene = feature::GetBestGeneForFeat(mf);
336  }
337  if (!gene) {
338  return "";
339  }
340
341  const auto& geneRef = gene.GetData().GetGene();
    // locus_tag takes precedence over locus
342  if (geneRef.IsSetLocus_tag()) {
343  return geneRef.GetLocus_tag();
344  }
345  if (geneRef.IsSetLocus()) {
346  return geneRef.GetLocus();
347  }
348  return "";
349 }
350 
351 // ----------------------------------------------------------------------------
// Returns a location-based ID stem for `mf` of the form "<seq-id>:<range>".
// <seq-id> is the ID reported by CGenbankIdResolve::GetBestId for the
// feature, or "unknown" when that call reports failure.
// <range> is
//   - "1..<length>" for a whole-sequence location when fc.BioseqHandle() is
//     valid and its Inst length is available,
//   - "whole" for a whole-sequence location otherwise,
//   - "<from+1>..<to+1>" taken from GetLocationTotalRange() in all other
//     cases.
// NOTE(review): listing lines 352 and 354 (start of the signature and the
// parameter the body uses as `fc`) are absent from this extracted listing.
353  const CMappedFeat& mf,
355 // ----------------------------------------------------------------------------
356 {
357  string locationId;
358  if (!CGenbankIdResolve::Get().GetBestId(mf, locationId)) {
359  locationId = "unknown";
360  }
361  auto locationType = mf.GetLocation().Which();
362  if (locationType == CSeq_loc::e_Whole) {
363  auto bsh = fc.BioseqHandle();
364  if (bsh && bsh.CanGetInst_Length()) {
365  locationId += ":1.." + NStr::NumericToString(bsh.GetInst_Length());
366  return locationId;
367  }
368  locationId += ":whole"; //for lack of better ideas
369  return locationId;
370  }
    // both range ends are shifted by +1 before being written out
371  auto inPoint = NStr::NumericToString(mf.GetLocationTotalRange().GetFrom() + 1);
372  auto outPoint = NStr::NumericToString(mf.GetLocationTotalRange().GetTo() + 1);
373  locationId += ":";
374  locationId += inPoint;
375  locationId += "..";
376  locationId += outPoint;
377  return locationId;
378 }
379 
380 // ------------------------------------------------------------------------------
// Returns an ID string for the product of `mf`, or "" when the feature has no
// product ID or when sequence::GetId(..., sequence::eGetId_ForceAcc) yields
// no handle for it. The string is produced by GetSeqIdString(true) on the
// resulting seq-id.
// NOTE(review): listing line 381 (start of the signature) is absent from this
// extracted listing.
382  const CMappedFeat& mf)
383 // ------------------------------------------------------------------------------
384 {
385  const auto productIdHandle = mf.GetProductId();
386  if (!productIdHandle) {
387  return "";
388  }
389  auto bestIdHandle = sequence::GetId(
390  productIdHandle, mf.GetScope(), sequence::eGetId_ForceAcc);
391  if (!bestIdHandle) {
392  return "";
393  }
394  return bestIdHandle.GetSeqId()->GetSeqIdString(true);
395 }
396 
397 // ------------------------------------------------------------------------------
// Returns an ID derived from `baseId` that is not yet present in
// mExistingIds: `baseId` itself when it is unused, otherwise the first of
// "<baseId>-2", "<baseId>-3", ... that is unused.
// This function only looks IDs up; it does not add the result to
// mExistingIds.
// NOTE(review): listing line 398 (start of the signature) is absent from this
// extracted listing.
399  const string& baseId)
400 // ------------------------------------------------------------------------------
401 {
402  auto preExisting = mExistingIds.find(baseId);
403  if (preExisting == mExistingIds.end()) {
404  return baseId;
405  }
    // the loop condition is the constant `true`; the loop is left only
    // through the return inside it
406  for (int suffix = 2; true; ++suffix) {
407  auto disambiguated = baseId + "-" + NStr::NumericToString(suffix);
408  preExisting = mExistingIds.find(disambiguated);
409  if (preExisting == mExistingIds.end()) {
410  return disambiguated;
411  }
412  }
    // NOTE(review): unreachable, because the loop above never terminates
    // normally
413  return baseId;
414 }
415 
417 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
static CGenbankIdResolve & Get()
bool GetBestId(CSeq_id_Handle, CScope &, string &)
std::string xGetGenericSuffix(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:308
std::string xDisambiguate(const std::string &)
Definition: gff3_idgen.cpp:398
std::string xGetGenericId(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:278
std::string xGetIdForGene(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:191
std::string xExtractFeatureLocation(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:352
std::string GetGffSourceId(CBioseq_Handle)
Definition: gff3_idgen.cpp:125
std::map< std::string, int > mLastUsedExonIds
Definition: gff3_idgen.hpp:109
std::set< std::string > mExistingIds
Definition: gff3_idgen.hpp:108
std::string xExtractGeneLocusTagOrLocus(const CMappedFeat &)
Definition: gff3_idgen.cpp:326
std::string xGetIdForRna(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:209
std::string xExtractFarAccession(const CMappedFeat &)
Definition: gff3_idgen.cpp:381
unsigned int mLastTrulyGenericSuffix
Definition: gff3_idgen.hpp:110
std::string GetGffId()
Definition: gff3_idgen.cpp:118
std::string xGetIdForCds(const CMappedFeat &, CGffFeatureContext &)
Definition: gff3_idgen.cpp:247
std::string GetNextGffExonId(const std::string &)
Definition: gff3_idgen.cpp:168
CMappedFeat –.
Definition: mapped_feat.hpp:59
USING_SCOPE(objects)
string
Definition: cgiapp.hpp:687
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CMappedFeat GetBestGeneForMrna(const CMappedFeat &mrna_feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3301
CMappedFeat GetBestGeneForCds(const CMappedFeat &cds_feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3321
CMappedFeat GetBestGeneForFeat(const CMappedFeat &feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3443
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_ForceAcc
return only an accession based seq-id
Definition: sequence.hpp:100
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetNonLocalIdOrNull(void) const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
CScope & GetScope(void) const
Get scope this handle belongs to.
CScope & GetScope(void) const
Get scope this handle belongs to.
const CSeq_feat::TDbxref & GetDbxref(void) const
CSeqFeatData::ESubtype GetFeatSubtype(void) const
CSeqFeatData::E_Choice GetFeatType(void) const
const TId & GetId(void) const
TRange GetLocationTotalRange(void) const
Definition: mapped_feat.hpp:98
CSeq_id_Handle GetProductId(void) const
const CSeq_loc & GetLocation(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
@ e_Whole
whole sequence
Definition: Seq_loc_.hpp:100
constexpr bool empty(list< Ts... >) noexcept
#define fc
static const char * suffix[]
Definition: pcregrep.c:408
CConstRef< CSeq_id > GetBestId(const CBioseq &bioseq)
Modified on Sat Dec 02 09:21:30 2023 by modify_doxy.py rev. 669887