NCBI C++ ToolKit
gtf_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gtf_reader.cpp 101219 2023-11-16 17:54:55Z foleyjp $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  * GTF file reader (built on top of the GFF2 reader)
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
41 
46 
48 #include <objects/seq/Annot_id.hpp>
52 
59 
61 #include "gtf_location_merger.hpp"
63 
64 #include <algorithm>
65 
67 BEGIN_objects_SCOPE // namespace ncbi::objects::
68 
69 // ----------------------------------------------------------------------------
71  const string& strGtfType,
72  const string& strRawAttributes )
73 // ----------------------------------------------------------------------------
74 {
75  vector< string > attributes;
76  xSplitGffAttributes(strRawAttributes, attributes);
77 
78  for ( size_t u=0; u < attributes.size(); ++u ) {
79  string key, value;
80  string attribute(attributes[u]);
81  if (!NStr::SplitInTwo(attribute, "=", key, value)) {
82  if (!NStr::SplitInTwo(attribute, " ", key, value)) {
83  if (strGtfType == "gene") {
86  continue;
87  }
88  if (strGtfType == "transcript") {
89  string gid, tid;
90  if (!NStr::SplitInTwo(attribute, ".", gid, tid)) {
91  return false;
92  }
94  "gene_id", xNormalizedAttributeValue(gid));
96  "transcript_id", xNormalizedAttributeValue(attribute));
97  continue;
98  }
99  }
100  }
103  if ( key.empty() && value.empty() ) {
104  // Probably due to trailing "; ". Sequence Ontology generates such
105  // things.
106  continue;
107  }
108  if (NStr::StartsWith(value, "\"")) {
109  value = value.substr(1, string::npos);
110  }
111  if (NStr::EndsWith(value, "\"")) {
112  value = value.substr(0, value.length() - 1);
113  }
115  }
116  return true;
117 }
118 
119 // ----------------------------------------------------------------------------
121  unsigned int uFlags,
122  const string& strAnnotName,
123  const string& strAnnotTitle,
124  SeqIdResolver resolver,
125  CReaderListener* pRL):
126 // ----------------------------------------------------------------------------
127  CGff2Reader( uFlags, strAnnotName, strAnnotTitle, resolver, pRL)
128 {
129  mpLocations.reset(new CGtfLocationMerger(uFlags, resolver));
130 }
131 
132 // ----------------------------------------------------------------------------
134  unsigned int uFlags,
135  CReaderListener* pRL):
136 // ----------------------------------------------------------------------------
137  CGtfReader( uFlags, "", "", CReadUtil::AsSeqId, pRL)
138 {
139 }
140 
141 
142 // ----------------------------------------------------------------------------
144 // ----------------------------------------------------------------------------
145 {
146 }
147 
148 // ----------------------------------------------------------------------------
151  ILineReader& lineReader,
152  ILineErrorListener* pEC )
153 // ----------------------------------------------------------------------------
154 {
156  return CReaderBase::ReadSeqAnnot(lineReader, pEC);
157 }
158 
159 // ----------------------------------------------------------------------------
160 void
162  const TReaderData& readerData,
163  CSeq_annot& annot)
164 // ----------------------------------------------------------------------------
165 {
166  for (const auto& lineData: readerData) {
167  const auto& line = lineData.mData;
168  if (xIsTrackTerminator(line)) {
169  continue;
170  }
171  if (xParseStructuredComment(line)) {
172  continue;
173  }
174  if (xParseBrowserLine(line, annot)) {
175  continue;
176  }
177  if (xParseFeature(line, annot, nullptr)) {
178  continue;
179  }
180  }
181 }
182 
183 
184 static bool s_IsTranscriptType(const string& recType)
185 {
186  return (recType == "exon" || recType == "5utr" || recType == "3utr");
187 }
188 
189 static bool s_IsCDSType(const string& recType)
190 {
191  return (recType == "cds" || recType == "start_codon" || recType == "stop_codon");
192 }
193 
194 
195 // ----------------------------------------------------------------------------
197  const CGff2Record& record,
198  CSeq_annot& annot,
199  ILineErrorListener* pEC)
200 // ----------------------------------------------------------------------------
201 {
202  const CGtfReadRecord& gff = dynamic_cast<const CGtfReadRecord&>(record);
203  auto recType = gff.NormalizedType();
204 
205 
206  if (s_IsCDSType(recType)) { // Only attempt to create/update the transcript if xUpdateAnnotCds() succeeds
207  return (xUpdateAnnotCds(gff, annot) &&
208  xUpdateAnnotTranscript(gff, annot));
209  }
210 
211  if (s_IsTranscriptType(recType))
212  {
213  return xUpdateAnnotTranscript(gff, annot);
214  }
215 
216  // Every other type is not officially sanctioned GTF, and per spec we are
217  // supposed to ignore it. In the spirit of being lenient on input we may
218  // try to salvage some of it anyway.
219  //
220  if (recType == "gene") {
221  return xCreateParentGene(gff, annot);
222  }
223  if (recType == "mrna" || recType == "transcript") {
224  return xCreateParentMrna(gff, annot);
225  }
226  return true;
227 }
228 
229 
231 {
233  const auto& xAttributes = x.Get();
234  const auto& yAttributes = y.Get();
235 
236  auto xit = xAttributes.begin();
237  auto yit = yAttributes.begin();
238  while (xit != xAttributes.end() && yit != yAttributes.end()) {
239  if (xit->first < yit->first) {
240  ++xit;
241  } else if (yit->first < xit->first) {
242  ++yit;
243  }
244  else { // xit->first == yit->first
245  const set<string>& xVals = xit->second;
246  if (xVals.empty()) {
247  result.AddValue(xit->first, "");
248  }
249  else {
250  const set<string>& yVals = yit->second;
251  set<string> commonVals;
252  set_intersection(begin(xVals), end(xVals),
253  begin(yVals), end(yVals),
254  inserter(commonVals, commonVals.begin()));
255  if (!commonVals.empty()) {
256  for (const auto& val : commonVals) {
257  result.AddValue(xit->first, val);
258  }
259  }
260  }
261  ++xit;
262  ++yit;
263  }
264 
265  }
266 
267 
268  return result;
269 }
270 
271 
272 // ----------------------------------------------------------------------------
274  const CGtfReadRecord& gff,
275  CSeq_annot& annot )
276 // ----------------------------------------------------------------------------
277 {
278  auto featId = mpLocations->GetFeatureIdFor(gff, "cds");
279  mpLocations->AddRecordForId(featId, gff) ;
280  return (xFindFeatById(featId) || xCreateParentCds(gff, annot));
281 }
282 
283 
284 // ----------------------------------------------------------------------------
286  const CGtfReadRecord& record,
287  const string& qualName,
288  CSeq_feat& parent)
289 // ----------------------------------------------------------------------------
290 {
292  record.GtfAttributes().GetValues(qualName, values);
293  if (!values.empty()) {
294  xFeatureAddQualifiers(qualName, values, parent);
295  }
296 }
297 
298 
299 // ----------------------------------------------------------------------------
301  const CGtfReadRecord& record,
302  const string& parentType,
303  CSeq_annot& annot)
304 // ----------------------------------------------------------------------------
305 {
306 
307  auto recType = record.NormalizedType();
308  //
309  // If there is no gene feature to go with this CDS then make one. Otherwise,
310  // make sure the existing gene feature includes the location of the CDS.
311  //
312  auto parentFeatId = mpLocations->GetFeatureIdFor(record, parentType);
313  auto pParent = xFindFeatById(parentFeatId);
314  if (!pParent) {
315  if (parentType == "gene") {
316  if (!xCreateParentGene(record, annot)) {
317  return false;
318  }
319  }
320  else {
321  if (!xCreateParentMrna(record, annot)) {
322  return false;
323  }
324  }
325 
326  m_ParentChildQualMap[parentFeatId].emplace(recType, record.GtfAttributes());
327  mpLocations->AddRecordForId(parentFeatId, record);
328  }
329  else {
330  mpLocations->AddRecordForId(parentFeatId, record);
331 
332  if (auto parentIt = m_ParentChildQualMap.find(parentFeatId);
333  parentIt != m_ParentChildQualMap.end()) {
334  if (auto childIt = parentIt->second.find(recType);
335  childIt != parentIt->second.end()) {
336 
337  auto& childAttributes = childIt->second;
338  if (!xFeatureTrimQualifiers(childAttributes, record.GtfAttributes(), *pParent)) {
339  return false;
340  }
341  auto accumulatedAttributes = g_GetIntersection(childAttributes, record.GtfAttributes());
342  childAttributes = accumulatedAttributes;
343  } else { // First feature
344  parentIt->second.emplace(recType, record.GtfAttributes());
345 
346  if (parentType == "gene") {
347  if (s_IsCDSType(recType)) {
348  xPropagateQualToParent(record, "gene_id", *pParent);
349  } else if (!xFeatureSetQualifiersGene(record, *pParent)) {
350  return false;
351  }
352  } else {
353  if (s_IsCDSType(recType)) {
354  xPropagateQualToParent(record, "gene_id", *pParent);
355  xPropagateQualToParent(record, "transcript_id", *pParent);
356  } else if (!xFeatureSetQualifiersRna(record, *pParent)) {
357  return false;
358  }
359  }
360  }
361  }
362  }
363  return true;
364 }
365 
366 // ----------------------------------------------------------------------------
368  const CGtfReadRecord& gff,
369  CSeq_annot& annot )
370 // ----------------------------------------------------------------------------
371 {
372  if (!xUpdateAnnotParent(gff, "gene", annot)) {
373  return false;
374  }
375  return xUpdateAnnotParent(gff, "transcript", annot);
376 }
377 
378 // ----------------------------------------------------------------------------
380  const CGtfReadRecord& record,
381  const string& prefix,
382  CSeq_feat& feature )
383 // ----------------------------------------------------------------------------
384 {
385  static int seqNum(1);
386 
387  string strFeatureId = prefix;
388  if (strFeatureId.empty()) {
389  strFeatureId = "id";
390  }
391  strFeatureId += "_";
392  strFeatureId += NStr::IntToString(seqNum++);
393  feature.SetId().SetLocal().SetStr(strFeatureId);
394  return true;
395 }
396 
397 // -----------------------------------------------------------------------------
399  const CGtfReadRecord& gff,
400  CSeq_annot& annot )
401 // -----------------------------------------------------------------------------
402 {
403  auto featId = mpLocations->GetFeatureIdFor(gff, "gene");
404  if (m_MapIdToFeature.find(featId) != m_MapIdToFeature.end()) {
405  return true;
406  }
407 
408  CRef<CSeq_feat> pFeature( new CSeq_feat );
409 
410  if (!xFeatureSetDataGene(gff, *pFeature)) {
411  return false;
412  }
413  if (!xCreateFeatureId(gff, "gene", *pFeature)) {
414  return false;
415  }
416  if (gff.NormalizedType() == "cds") {
417  xPropagateQualToParent(gff, "gene_id", *pFeature);
418  } else if (!xFeatureSetQualifiersGene(gff, *pFeature)) {
419  return false;
420  }
421 
422  (gff.Type() == "gene") ?
423  mpLocations->AddRecordForId(featId, gff) :
424  mpLocations->AddStubForId(featId);
425  m_MapIdToFeature[featId] = pFeature;
426  xAddFeatureToAnnot(pFeature, annot);
427  return true;
428 }
429 
430 
431 // ----------------------------------------------------------------------------
433  const CGtfReadRecord& record,
434  CSeq_feat& feature )
435 // ----------------------------------------------------------------------------
436 {
437  set<string> ignoredAttrs = {
438  "locus_tag", "transcript_id", "gene"
439  };
440  return xFeatureSetQualifiers(record, ignoredAttrs, feature);
441 }
442 
443 
444 // ----------------------------------------------------------------------------
446  const CGtfReadRecord& record,
447  CSeq_feat& feature )
448 // ----------------------------------------------------------------------------
449 {
450  set<string> ignoredAttrs = {
451  "locus_tag"
452  };
453 
454  return xFeatureSetQualifiers(record, ignoredAttrs, feature);
455 }
456 
457 
458 // ----------------------------------------------------------------------------
460  const CGtfReadRecord& record,
461  CSeq_feat& feature )
462 // ----------------------------------------------------------------------------
463 {
464  set<string> ignoredAttrs = {
465  "locus_tag"
466  };
467  return xFeatureSetQualifiers(record, ignoredAttrs, feature);
468 }
469 
470 
471 // ----------------------------------------------------------------------------
473  const CGtfReadRecord& record,
474  const set<string>& ignoredAttrs,
475  CSeq_feat& feature )
476 // ----------------------------------------------------------------------------
477 {
478  //
479  // Create GB qualifiers for the record attributes:
480  //
481  for (const auto& attribute : record.GtfAttributes().Get()) {
482  const auto& name = attribute.first;
483  if (ignoredAttrs.find(name) != ignoredAttrs.end()) {
484  continue;
485  }
486  const auto& vals = attribute.second;
487  // special case some well-known attributes
488  if (xProcessQualifierSpecialCase(name, vals, feature)) {
489  continue;
490  }
491 
492  // turn everything else into a qualifier
493  xFeatureAddQualifiers(name, vals, feature);
494  }
495  return true;
496 }
497 
498 
499 // -----------------------------------------------------------------------------
501  const CGtfReadRecord& gff,
502  CSeq_annot& annot )
503 // -----------------------------------------------------------------------------
504 {
505  CRef<CSeq_feat> pFeature(new CSeq_feat);
506 
507  if (!xFeatureSetDataCds(gff, *pFeature)) {
508  return false;
509  }
510  if (!xCreateFeatureId(gff, "cds", *pFeature)) {
511  return false;
512  }
513  if (!xFeatureSetQualifiersCds(gff, *pFeature)) {
514  return false;
515  }
516 
517  auto featId = mpLocations->GetFeatureIdFor(gff, "cds");
518 
520 
521  m_MapIdToFeature[featId] = pFeature;
522  return xAddFeatureToAnnot(pFeature, annot);
523 }
524 
525 
527  const CGtfReadRecord& gff)
528 {
529  auto transcriptId = gff.TranscriptId();
530  if (!transcriptId.empty()) {
531  if (auto geneId = gff.GeneKey(); !geneId.empty()) {
532  if (auto it = m_TranscriptToGeneMap.find(transcriptId); it != m_TranscriptToGeneMap.end()) {
533  if (it->second != geneId) {
534  string msg = "Gene id '" + geneId + "' for transcript '" + transcriptId +
535  "' conflicts with previously-assigned '" + it->second + "'";
537  eDiag_Error,
539  msg);
540  m_pMessageHandler->Report(error);
541  }
542  }
543  else {
544  m_TranscriptToGeneMap.emplace(transcriptId, geneId);
545  }
546  }
547  }
548 }
549 
550 
551 // -----------------------------------------------------------------------------
553  const CGtfReadRecord& gff,
554  CSeq_annot& annot )
555 // -----------------------------------------------------------------------------
556 {
557  auto featId = mpLocations->GetFeatureIdFor(gff, "transcript");
558  if (m_MapIdToFeature.find(featId) != m_MapIdToFeature.end()) {
559  return true;
560  }
561 
562  CRef< CSeq_feat > pFeature( new CSeq_feat );
563 
564  if (!xFeatureSetDataMrna(gff, *pFeature)) {
565  return false;
566  }
567  if (!xCreateFeatureId(gff, "mrna", *pFeature)) {
568  return false;
569  }
570 
571  if (gff.NormalizedType() == "cds") {
572  xPropagateQualToParent(gff, "gene_id", *pFeature);
573  xPropagateQualToParent(gff, "transcript_id", *pFeature);
574  } else if (!xFeatureSetQualifiersRna( gff, *pFeature ) ) {
575  return false;
576  }
577 
578  mpLocations->AddStubForId(featId);
579  m_MapIdToFeature[featId] = pFeature;
580 
581  return xAddFeatureToAnnot( pFeature, annot );
582 }
583 
584 // ----------------------------------------------------------------------------
586  const string& featId)
587 // ----------------------------------------------------------------------------
588 {
589  auto featIt = m_MapIdToFeature.find(featId);
590  if (featIt == m_MapIdToFeature.end()) {
591  return CRef<CSeq_feat>();
592  }
593  return featIt->second;
594 }
595 
596 // ----------------------------------------------------------------------------
598  const CGtfReadRecord& record,
599  CSeq_feat& feature )
600 // ----------------------------------------------------------------------------
601 {
602  CGene_ref& gene = feature.SetData().SetGene();
603 
604  const auto& attributes = record.GtfAttributes();
605  string geneSynonym = attributes.ValueOf("gene_synonym");
606  if (!geneSynonym.empty()) {
607  gene.SetSyn().push_back(geneSynonym);
608  }
609  string locusTag = attributes.ValueOf("locus_tag");
610  if (!locusTag.empty()) {
611  gene.SetLocus_tag(locusTag);
612  }
613  string locus = attributes.ValueOf("gene");
614 
615  if (!locus.empty()) {
616  gene.SetLocus(locus);
617  }
618  return true;
619 }
620 
621 // ----------------------------------------------------------------------------
623  const CGtfReadRecord& record,
624  CSeq_feat& feature)
625 // ----------------------------------------------------------------------------
626 {
627  if (!xFeatureSetDataRna(record, feature, CSeqFeatData::eSubtype_mRNA)) {
628  return false;
629  }
630  CRNA_ref& rna = feature.SetData().SetRna();
631 
632  string product = record.GtfAttributes().ValueOf("product");
633  if (!product.empty()) {
634  rna.SetExt().SetName(product);
635  }
636  return true;
637 }
638 
639 // ----------------------------------------------------------------------------
641  const CGtfReadRecord& record,
642  CSeq_feat& feature,
643  CSeqFeatData::ESubtype subType)
644 // ----------------------------------------------------------------------------
645 {
646  CRNA_ref& rnaRef = feature.SetData().SetRna();
647  switch (subType){
648  default:
650  break;
653  break;
656  break;
657  }
658  return true;
659 }
660 
661 // ----------------------------------------------------------------------------
663  const CGtfReadRecord& record,
664  CSeq_feat& feature )
665 // ----------------------------------------------------------------------------
666 {
667  CCdregion& cdr = feature.SetData().SetCdregion();
668  const auto& attributes = record.GtfAttributes();
669 
670  string proteinId = attributes.ValueOf("protein_id");
671  if (!proteinId.empty()) {
672  CRef<CSeq_id> pId = mSeqIdResolve(proteinId, m_iFlags, true);
673  if (pId->IsGenbank()) {
674  feature.SetProduct().SetWhole(*pId);
675  }
676  }
677  string ribosomalSlippage = attributes.ValueOf("ribosomal_slippage");
678  if (!ribosomalSlippage.empty()) {
679  feature.SetExcept( true );
680  feature.SetExcept_text("ribosomal slippage");
681  }
682  string transTable = attributes.ValueOf("transl_table");
683  if (!transTable.empty()) {
685  pGc->SetId(NStr::StringToUInt(transTable));
686  cdr.SetCode().Set().push_back(pGc);
687  }
688  return true;
689 }
690 
691 // ----------------------------------------------------------------------------
693  const CGtfReadRecord& record,
694  CSeq_feat& feature )
695  // ----------------------------------------------------------------------------
696 {
697  return xFeatureTrimQualifiers(record.GtfAttributes(), feature);
698 }
699 
700 
702  const CGtfAttributes& attributes,
703  CSeq_feat& feature )
704  // ----------------------------------------------------------------------------
705 {
706  //task:
707  // for each attribute of the new piece check if we already got a feature
708  // qualifier
709  // if so, and with the same value, then the qualifier is allowed to live
710  // otherwise it is subfeature specific and hence removed from the feature
711  auto& quals = feature.SetQual();
712  for (auto it = quals.begin(); it != quals.end(); /**/) {
713  const string& qualKey = (*it)->GetQual();
714 
715  if (NStr::StartsWith(qualKey, "gff_") ||
716  qualKey == "locus_tag" ||
717  qualKey == "old_locus_tag" ||
718  qualKey == "product" ||
719  qualKey == "protein_id") {
720  ++it;
721  continue;
722  }
723 
724  const string& qualVal = (*it)->GetVal();
725  if (!attributes.HasValue(qualKey, qualVal)) {
726  //superfluous qualifier- squish
727  it = quals.erase(it);
728  continue;
729  }
730  it++;
731  }
732  return true;
733 }
734 
735 
737  const CGtfAttributes& prevAttributes,
738  const CGtfAttributes& attributes,
739  CSeq_feat& feature )
740  // ----------------------------------------------------------------------------
741 {
742  //task:
743  // for each attribute of the new piece check if we already got a feature
744  // qualifier
745  // if so, and with the same value, then the qualifier is allowed to live
746  // otherwise it is subfeature specific and hence removed from the feature
747  auto& quals = feature.SetQual();
748  for (auto it = quals.begin(); it != quals.end(); /**/) {
749  const string& qualKey = (*it)->GetQual();
750 
751  if (NStr::StartsWith(qualKey, "gff_") ||
752  qualKey == "locus_tag" ||
753  qualKey == "old_locus_tag" ||
754  qualKey == "product" ||
755  qualKey == "protein_id") {
756  ++it;
757  continue;
758  }
759 
760  const string& qualVal = (*it)->GetVal();
761  if (!prevAttributes.HasValue(qualKey, qualVal)) {
762  ++it;
763  continue;
764  }
765 
766  if (!attributes.HasValue(qualKey, qualVal)) {
767  //superfluous qualifier- squish
768  it = quals.erase(it);
769  continue;
770  }
771  it++;
772  }
773  return true;
774 }
775 
776 
777 
778 // ----------------------------------------------------------------------------
780  const string& key,
781  const CGtfAttributes::MultiValue& values,
782  CSeq_feat& feature )
783 // ----------------------------------------------------------------------------
784 {
785  CRef<CGb_qual> pQual(0);
786 
787  if (0 == NStr::CompareNocase(key, "exon_id")) {
788  return true;
789  }
790  if (0 == NStr::CompareNocase(key, "exon_number")) {
791  return true;
792  }
793  if ( 0 == NStr::CompareNocase(key, "note") ) {
794  feature.SetComment(NStr::Join(values, ";"));
795  return true;
796  }
797  if ( 0 == NStr::CompareNocase(key, "dbxref") ||
798  0 == NStr::CompareNocase(key, "db_xref"))
799  {
800  for (auto value: values) {
801  vector< string > tags;
802  NStr::Split(value, ";", tags );
803  for (auto it = tags.begin(); it != tags.end(); ++it ) {
804  feature.SetDbxref().push_back(x_ParseDbtag(*it));
805  }
806  }
807  return true;
808  }
809 
810  if ( 0 == NStr::CompareNocase(key, "pseudo")) {
811  feature.SetPseudo( true );
812  return true;
813  }
814  if ( 0 == NStr::CompareNocase(key, "partial")) {
815  // RW-1108 - ignore partial attribute in Genbank mode
817  return true;
818  }
819  }
820  return false;
821 }
822 
823 // ----------------------------------------------------------------------------
825  const string& key,
826  const CGtfAttributes::MultiValue& values,
827  CSeq_feat& feature)
828  // ----------------------------------------------------------------------------
829 {
830  set<string> existingVals;
831  for (const auto& pQual : feature.GetQual()) {
832  if (pQual->GetQual() == key) {
833  existingVals.insert(pQual->GetVal());
834  }
835  }
836 
837  for (auto value: values) {
838  if (existingVals.find(value) == existingVals.end()) {
839  feature.AddQualifier(key, value);
840  }
841  }
842 };
843 
844 // ============================================================================
846  CSeq_feat& descendent,
847  CSeq_feat& ancestor)
848 // ============================================================================
849 {
850  xSetXrefFromTo(descendent, ancestor);
852  xSetXrefFromTo(ancestor, descendent);
853  }
854 }
855 
856 // ----------------------------------------------------------------------------
858  CSeq_annot& annot)
859 // ----------------------------------------------------------------------------
860 {
861  //location fixup:
862  for (auto itLocation: mpLocations->LocationMap()) {
863  auto id = itLocation.first;
864  auto itFeature = m_MapIdToFeature.find(id);
865  if (itFeature == m_MapIdToFeature.end()) {
866  continue;
867  }
868  CRef<CSeq_feat> pFeature = itFeature->second;
869  auto featSubType = pFeature->GetData().GetSubtype();
870  CRef<CSeq_loc> pNewLoc = mpLocations->MergeLocation(
871  featSubType, itLocation.second);
872  pFeature->SetLocation(*pNewLoc);
873  }
874 
875  //generate xrefs:
876  for (auto itLocation: mpLocations->LocationMap()) {
877  auto id = itLocation.first;
878  auto itFeature = m_MapIdToFeature.find(id);
879  if (itFeature == m_MapIdToFeature.end()) {
880  continue;
881  }
882  CRef<CSeq_feat> pFeature = itFeature->second;
883  auto featSubType = pFeature->GetData().GetSubtype();
884  switch(featSubType) {
885  default: {
886  break;
887  }
889  auto parentGeneFeatId = string("gene:") + pFeature->GetNamedQual("gene_id");
890  CRef<CSeq_feat> pParentGene;
891  if (x_GetFeatureById(parentGeneFeatId, pParentGene)) {
892  xSetAncestorXrefs(*pFeature, *pParentGene);
893  }
894  break;
895  }
897  auto parentRnaFeatId = string("transcript:") + pFeature->GetNamedQual("gene_id") +
898  "_" + pFeature->GetNamedQual("transcript_id");
899  CRef<CSeq_feat> pParentRna;
900  if (x_GetFeatureById(parentRnaFeatId, pParentRna)) {
901  xSetAncestorXrefs(*pFeature, *pParentRna);
902  }
903  auto parentGeneFeatId = string("gene:") + pFeature->GetNamedQual("gene_id");
904  CRef<CSeq_feat> pParentGene;
905  if (x_GetFeatureById(parentGeneFeatId, pParentGene)) {
906  xSetAncestorXrefs(*pFeature, *pParentGene);
907 
908  }
909  break;
910  }
911  }
912  }
913  return CGff2Reader::xPostProcessAnnot(annot);
914 }
915 
916 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CCdregion –.
Definition: Cdregion.hpp:66
void xSetXrefFromTo(CSeq_feat &, CSeq_feat &)
virtual bool xAddFeatureToAnnot(CRef< CSeq_feat >, CSeq_annot &)
virtual bool xParseStructuredComment(const string &)
virtual bool xParseFeature(const string &, CSeq_annot &, ILineErrorListener *)
void xPostProcessAnnot(CSeq_annot &) override
bool x_GetFeatureById(const string &, CRef< CSeq_feat > &)
IdToFeatureMap m_MapIdToFeature
static CRef< CDbtag > x_ParseDbtag(const string &)
unsigned int mCurrentFeatureCount
static string xNormalizedAttributeValue(const CTempString &)
Definition: gff2_data.cpp:343
bool xSplitGffAttributes(const string &, vector< string > &) const
Definition: gff2_data.cpp:471
static string xNormalizedAttributeKey(const CTempString &)
Definition: gff2_data.cpp:335
const string & Type() const
const string & NormalizedType() const
void AddValue(const string &key, const string &value)
Definition: gtf_reader.hpp:113
string ValueOf(const string &key) const
Definition: gtf_reader.hpp:67
void GetValues(const string &key, MultiValue &values) const
Definition: gtf_reader.hpp:99
const MultiAttributes & Get() const
Definition: gtf_reader.hpp:61
bool HasValue(const string &key, const string &value="") const
Definition: gtf_reader.hpp:79
string TranscriptId() const
Definition: gtf_reader.hpp:198
CGtfAttributes mAttributes
Definition: gtf_reader.hpp:208
bool xAssignAttributesFromGff(const string &, const string &)
Definition: gtf_reader.cpp:70
const CGtfAttributes & GtfAttributes() const
Definition: gtf_reader.hpp:168
string GeneKey() const
Definition: gtf_reader.hpp:174
bool xFeatureSetDataGene(const CGtfReadRecord &, CSeq_feat &)
Definition: gtf_reader.cpp:597
void xFeatureAddQualifiers(const string &key, const CGtfAttributes::MultiValue &, CSeq_feat &)
Definition: gtf_reader.cpp:824
bool xFeatureSetQualifiers(const CGtfReadRecord &record, const set< string > &ignoredAttrs, CSeq_feat &)
Definition: gtf_reader.cpp:472
@ fGenerateChildXrefs
Definition: gtf_reader.hpp:218
bool xFeatureSetQualifiersCds(const CGtfReadRecord &record, CSeq_feat &)
Definition: gtf_reader.cpp:459
bool xCreateParentCds(const CGtfReadRecord &, CSeq_annot &)
Definition: gtf_reader.cpp:500
void xCheckForGeneIdConflict(const CGtfReadRecord &record)
Definition: gtf_reader.cpp:526
bool xUpdateAnnotFeature(const CGff2Record &, CSeq_annot &, ILineErrorListener *=nullptr) override
Definition: gtf_reader.cpp:196
bool xCreateParentGene(const CGtfReadRecord &, CSeq_annot &)
Definition: gtf_reader.cpp:398
unique_ptr< CGtfLocationMerger > mpLocations
Definition: gtf_reader.hpp:365
bool xFeatureSetQualifiersGene(const CGtfReadRecord &record, CSeq_feat &)
Definition: gtf_reader.cpp:432
bool xCreateParentMrna(const CGtfReadRecord &, CSeq_annot &)
Definition: gtf_reader.cpp:552
virtual bool xUpdateAnnotCds(const CGtfReadRecord &, CSeq_annot &)
Definition: gtf_reader.cpp:273
void xPostProcessAnnot(CSeq_annot &) override
Definition: gtf_reader.cpp:857
bool xFeatureSetDataMrna(const CGtfReadRecord &, CSeq_feat &)
Definition: gtf_reader.cpp:622
CRef< CSeq_feat > xFindFeatById(const string &)
Definition: gtf_reader.cpp:585
bool xFeatureSetQualifiersRna(const CGtfReadRecord &record, CSeq_feat &)
Definition: gtf_reader.cpp:445
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors=nullptr) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
Definition: gtf_reader.cpp:150
virtual bool xUpdateAnnotTranscript(const CGtfReadRecord &, CSeq_annot &)
Definition: gtf_reader.cpp:367
CGtfReader(unsigned int=0, const string &="", const string &="", SeqIdResolver=CReadUtil::AsSeqId, CReaderListener *=nullptr)
Definition: gtf_reader.cpp:120
bool xFeatureTrimQualifiers(const CGtfReadRecord &, CSeq_feat &)
Definition: gtf_reader.cpp:692
map< string, string > m_TranscriptToGeneMap
Definition: gtf_reader.hpp:371
bool xUpdateAnnotParent(const CGtfReadRecord &record, const string &parentType, CSeq_annot &annot)
Definition: gtf_reader.cpp:300
TParentChildQualMap m_ParentChildQualMap
Definition: gtf_reader.hpp:372
virtual ~CGtfReader()
Definition: gtf_reader.cpp:143
void xPropagateQualToParent(const CGtfReadRecord &record, const string &qualName, CSeq_feat &parent)
Definition: gtf_reader.cpp:285
virtual bool xFeatureSetDataRna(const CGtfReadRecord &, CSeq_feat &, CSeqFeatData::ESubtype)
Definition: gtf_reader.cpp:640
bool xProcessQualifierSpecialCase(const string &, const CGtfAttributes::MultiValue &, CSeq_feat &)
Definition: gtf_reader.cpp:779
bool xCreateFeatureId(const CGtfReadRecord &, const string &, CSeq_feat &)
Definition: gtf_reader.cpp:379
void xSetAncestorXrefs(CSeq_feat &, CSeq_feat &) override
Definition: gtf_reader.cpp:845
void xProcessData(const TReaderData &, CSeq_annot &) override
Definition: gtf_reader.cpp:161
bool xFeatureSetDataCds(const CGtfReadRecord &, CSeq_feat &)
Definition: gtf_reader.cpp:662
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
Common file reader utility functions.
Definition: read_util.hpp:47
unique_ptr< CReaderMessageHandler > m_pMessageHandler
SeqIdResolver mSeqIdResolve
unsigned int m_uLineNumber
virtual bool xParseBrowserLine(const string &, CSeq_annot &)
vector< TReaderLine > TReaderData
Definition: reader_base.hpp:70
TReaderFlags m_iFlags
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
virtual bool xIsTrackTerminator(const CTempString &)
CRef – template class that stores a pointer to an object and defines methods for referencing that object.
Definition: ncbiobj.hpp:618
ESubtype GetSubtype(void) const
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
Definition: Seq_feat.cpp:429
void AddQualifier(const string &qual_name, const string &qual_val)
Add a qualifier to this feature.
Definition: Seq_feat.cpp:291
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const struct attribute attributes[]
Definition: attributes.c:165
string
Definition: cgiapp.hpp:687
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
TSyn & SetSyn(void)
Assign a value to Syn data member.
Definition: Gene_ref_.hpp:774
void SetLocus(const TLocus &value)
Assign a value to Locus data member.
Definition: Gene_ref_.hpp:514
void SetLocus_tag(const TLocus_tag &value)
Assign a value to Locus_tag data member.
Definition: Gene_ref_.hpp:802
void SetType(TType value)
Assign a value to Type data member.
Definition: RNA_ref_.hpp:538
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetComment(const TComment &value)
Assign a value to Comment data member.
Definition: Seq_feat_.hpp:1058
void SetProduct(TProduct &value)
Assign a value to Product data member.
Definition: Seq_feat_.cpp:110
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
void SetExcept(TExcept value)
Assign a value to Except data member.
Definition: Seq_feat_.hpp:1018
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_feat_.cpp:73
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetPseudo(TPseudo value)
Assign a value to Pseudo data member.
Definition: Seq_feat_.hpp:1374
void SetExcept_text(const TExcept_text &value)
Assign a value to Except_text data member.
Definition: Seq_feat_.hpp:1414
TQual & SetQual(void)
Assign a value to Qual data member.
Definition: Seq_feat_.hpp:1153
bool IsGenbank(void) const
Check if variant Genbank is selected.
Definition: Seq_id_.hpp:841
static bool s_IsCDSType(const string &recType)
Definition: gtf_reader.cpp:189
static bool s_IsTranscriptType(const string &recType)
Definition: gtf_reader.cpp:184
CGtfAttributes g_GetIntersection(const CGtfAttributes &x, const CGtfAttributes &y)
Definition: gtf_reader.cpp:230
Lightweight interface for getting lines of data with minimal memory copying.
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
static const char * prefix[]
Definition: pcregrep.c:405
else result
Definition: token2.c:20
Modified on Thu Apr 25 08:20:59 2024 by modify_doxy.py rev. 669887