NCBI C++ ToolKit
gff3_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gff3_reader.cpp 102105 2024-04-01 15:32:13Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  * GFF3 file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
38 
41 
46 
48 #include <objects/seq/Annot_id.hpp>
50 #include <objects/seq/so_map.hpp>
52 
58 
61 
63 
64 #include <algorithm>
65 
67 BEGIN_objects_SCOPE
68 
// Static counter consumed by xNextGenericId(); it is post-incremented each
// time a generic id is generated.
69 unsigned int CGff3Reader::msGenericIdCounter = 0;
70 
71 // ----------------------------------------------------------------------------
// Picks the identifier under which a record is tracked.
//  - "cds" records: the Parent attribute with ":cds" appended; if there is no
//    Parent, the ID attribute, or a freshly generated generic id if ID is
//    empty as well.
//  - all other records: the ID attribute, or a freshly generated generic id.
// NOTE(review): the line naming this function (source line 72) is missing
// from this listing; the call in the CDS handler suggests xMakeRecordId --
// confirm against the real file.
73  const CGff2Record& record)
74 // ----------------------------------------------------------------------------
75 {
76  string id, parentId;
77  record.GetAttribute("ID", id);
78  record.GetAttribute("Parent", parentId);
79
80  auto recordType = record.NormalizedType();
81  if (recordType == "cds") {
82  string cdsId = parentId;
83  if (cdsId.empty()) {
84  cdsId = (id.empty() ? xNextGenericId() : id);
85  }
86  else {
87  cdsId += ":cds";
88  }
89  return cdsId;
90  }
91  if (id.empty()) {
92  return xNextGenericId();
93  }
94  return id;
95 }
96 
97 // ----------------------------------------------------------------------------
// xNextGenericId: returns "generic" followed by the current value of the
// static counter msGenericIdCounter, then post-increments that counter.
// NOTE(review): the declarator (source line 98) is missing from this listing.
99 // ----------------------------------------------------------------------------
100 {
101  return string("generic") + NStr::IntToString(msGenericIdCounter++);
102 }
103 
104 // ----------------------------------------------------------------------------
// Parses one raw GFF line through CGff2Record::AssignFromGff and then
// rewrites selected feature types:
//   pseudogene / pseudogenic_transcript / pseudogenic_tRNA /
//   pseudogenic_rRNA / pseudogenic_CDS  -> regular type plus pseudo=true
//   pseudogenic_exon                    -> "exon"
//   transcript                          -> "misc_RNA"
// Returns false only when the base-class parse fails.
// NOTE(review): the declarator (source line 105) is missing from this listing.
106  const string& strRawInput )
107 // ----------------------------------------------------------------------------
108 {
109  if (!CGff2Record::AssignFromGff(strRawInput)) {
110  return false;
111  }
// NOTE(review): id and parent are fetched but never used in the visible
// code -- confirm whether they can be dropped.
112  string id, parent;
113  GetAttribute("ID", id);
114  GetAttribute("Parent", parent);
115  if (m_strType == "pseudogene") {
116  SetType("gene");
117  m_Attributes["pseudo"] = "true";
118  return true;
119  }
120  if (m_strType == "pseudogenic_transcript") {
121  SetType("transcript");
122  m_Attributes["pseudo"] = "true";
123  return true;
124  }
125  if (m_strType == "pseudogenic_tRNA") {
126  SetType("tRNA");
127  m_Attributes["pseudo"] = "true";
128  return true;
129  }
130  if (m_strType == "pseudogenic_rRNA") {
131  SetType("rRNA");
132  m_Attributes["pseudo"] = "true";
133  return true;
134  }
// NOTE(review): unlike the other pseudogenic_* branches this one does not
// set the "pseudo" attribute -- confirm that this is intentional.
135  if (m_strType == "pseudogenic_exon") {
136  SetType("exon");
137  return true;
138  }
139  if (m_strType == "pseudogenic_CDS") {
140  SetType("CDS");
141  m_Attributes["pseudo"] = "true";
142  return true;
143  }
144  if (m_strType == "transcript") {
145  SetType("misc_RNA");
146  return true;
147  }
148  return true;
149 }
150 
151 // ----------------------------------------------------------------------------
// Maps an attribute key onto its canonical spelling. The key is first passed
// through CGff2Record::xNormalizedAttributeKey; a case-insensitive match
// against one of the well-known keys below then yields that key's fixed
// spelling ("Db_xref" is folded into "Dbxref"). Any other key is returned as
// produced by the base-class normalization.
// NOTE(review): the declarator (source line 152) is missing from this listing.
153  const string& strRawKey )
154 // ---------------------------------------------------------------------------
155 {
156  string strKey = CGff2Record::xNormalizedAttributeKey( strRawKey );
// NOTE(review): this is the only check that compares strRawKey instead of
// the normalized strKey -- looks like an inconsistency; confirm.
157  if ( 0 == NStr::CompareNocase( strRawKey, "ID" ) ) {
158  return "ID";
159  }
160  if ( 0 == NStr::CompareNocase( strKey, "Name" ) ) {
161  return "Name";
162  }
163  if ( 0 == NStr::CompareNocase( strKey, "Alias" ) ) {
164  return "Alias";
165  }
166  if ( 0 == NStr::CompareNocase( strKey, "Parent" ) ) {
167  return "Parent";
168  }
169  if ( 0 == NStr::CompareNocase( strKey, "Target" ) ) {
170  return "Target";
171  }
172  if ( 0 == NStr::CompareNocase( strKey, "Gap" ) ) {
173  return "Gap";
174  }
175  if ( 0 == NStr::CompareNocase( strKey, "Derives_from" ) ) {
176  return "Derives_from";
177  }
178  if ( 0 == NStr::CompareNocase( strKey, "Note" ) ) {
179  return "Note";
180  }
181  if ( 0 == NStr::CompareNocase( strKey, "Dbxref" ) ||
182  0 == NStr::CompareNocase( strKey, "Db_xref" ) ) {
183  return "Dbxref";
184  }
185  if ( 0 == NStr::CompareNocase( strKey, "Ontology_term" ) ) {
186  return "Ontology_term";
187  }
188  return strKey;
189 }
190 
191 // ----------------------------------------------------------------------------
// Constructor: forwards all five arguments to the CGff2Reader base class and
// installs a new CGff3LocationMerger in mpLocations.
// NOTE(review): source lines 192 (declarator) and 202 (a body statement) are
// missing from this listing.
193  TReaderFlags uFlags,
194  const string& name,
195  const string& title,
196  SeqIdResolver resolver,
197  CReaderListener* pRL):
198 // ----------------------------------------------------------------------------
199  CGff2Reader( uFlags, name, title, resolver, pRL )
200 {
201  mpLocations.reset(new CGff3LocationMerger(uFlags, resolver, 0, pRL));
203 }
204 
205 // ----------------------------------------------------------------------------
// Convenience constructor: delegates to the five-argument constructor with an
// empty name, an empty title and CReadUtil::AsSeqId as the id resolver.
// NOTE(review): the declarator (source line 206) is missing from this listing.
207  unsigned int uFlags,
208  CReaderListener* pRL):
209 // ----------------------------------------------------------------------------
210  CGff3Reader(uFlags, "", "", CReadUtil::AsSeqId, pRL)
211 {
212 }
213 
214 // ----------------------------------------------------------------------------
// Empty body. NOTE(review): the declarator (source line 215) is missing from
// this listing; its position right after the constructors suggests this is
// the destructor -- confirm.
216 // ----------------------------------------------------------------------------
217 {
218 }
219 
220 // ----------------------------------------------------------------------------
// Reads one Seq-annot from the line reader. Clears the alignment-parsing flag
// and resets the location merger, delegates to CReaderBase::ReadSeqAnnot, and
// returns a null CRef instead of an annot whose data choice is still
// e_not_set.
// NOTE(review): source lines 221-222 (declarator), 227 and 229 are missing
// from this listing, so further state resets may happen here.
223  ILineReader& lr,
224  ILineErrorListener* pEC )
225 // ----------------------------------------------------------------------------
226 {
228  mParsingAlignment = false;
230  mpLocations->Reset();
231  auto pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
232  if (pAnnot && pAnnot->GetData().Which() == CSeq_annot::TData::e_not_set) {
233  return CRef<CSeq_annot>();
234  }
235  return pAnnot;
236 }
237 
238 // ----------------------------------------------------------------------------
// Feeds every line of readerData through the parsers in order: structured
// comment, browser line, feature. The first parser that accepts a line ends
// processing of that line.
// NOTE(review): the function name (source line 240) is missing from this
// listing.
239 void
241  const TReaderData& readerData,
242  CSeq_annot& annot)
243 // ----------------------------------------------------------------------------
244 {
245  for (const auto& lineData: readerData) {
246  const auto& line = lineData.mData;
// A line accepted as a structured comment is skipped, except for
// "##sequence-region" lines, which are still offered to the parsers below.
247  if (xParseStructuredComment(line) &&
248  !NStr::StartsWith(line, "##sequence-region") ) {
249  continue;
250  }
251  if (xParseBrowserLine(line, annot)) {
252  continue;
253  }
// No error listener is passed along here (nullptr).
254  if (xParseFeature(line, annot, nullptr)) {
255  continue;
256  }
257  }
258 }
259 
260 // ----------------------------------------------------------------------------
// For every id collected in mAlignmentData.mIds, merges the alignments stored
// under that id into a single CSeq_align. On a successful merge the annot
// receives the current browser info (if any), the name "alignments", the
// annot title (if non-empty) and the merged alignment itself.
// NOTE(review): the declarator (source line 261) is missing from this listing.
262  CSeq_annot& annot)
263 // ----------------------------------------------------------------------------
264 {
265  for (const string& id : mAlignmentData.mIds) {
266  CRef<CSeq_align> pAlign = Ref(new CSeq_align());
267  if (x_MergeAlignments(mAlignmentData.mAlignments.at(id), pAlign)) {
268  // if available, add current browser information
269  if ( m_CurrentBrowserInfo ) {
270  annot.SetDesc().Set().push_back( m_CurrentBrowserInfo );
271  }
272
273  annot.SetNameDesc("alignments");
274
275  if ( !m_AnnotTitle.empty() ) {
276  annot.SetTitleDesc(m_AnnotTitle);
277  }
278  // Add alignment
279  annot.SetData().SetAlign().push_back(pAlign);
280  }
281  }
282 }
283 
284 // ----------------------------------------------------------------------------
// Parses one feature line. Alignment lines are handed to xParseAlignment.
// Otherwise the line is turned into a record; records of an ignored type or
// with an ignored id are accepted without further action (returns true);
// everything else is merged into the annot through xUpdateAnnotFeature.
// Returns false when the record cannot be parsed or the annot update fails.
// NOTE(review): source lines 286 (declarator), 318-319 and 331 are missing
// from this listing.
285 bool
287  const string& line,
288  CSeq_annot& annot,
289  ILineErrorListener* pEC)
290 // ----------------------------------------------------------------------------
291 {
292  if (CGff2Reader::IsAlignmentData(line)) {
293  return xParseAlignment(line);
294  }
295
296  //parse record:
297  shared_ptr<CGff3ReadRecord> pRecord(x_CreateRecord());
298  try {
299  if (!pRecord->AssignFromGff(line)) {
300  return false;
301  }
302  }
303  catch(CObjReaderLineException& err) {
304  ProcessError(err, pEC);
305  return false;
306  }
307
308  //make sure we are interested:
309  if (xIsIgnoredFeatureType(pRecord->Type())) {
310  return true;
311  }
312  if (xIsIgnoredFeatureId(pRecord->Id())) {
313  return true;
314  }
315
316  //no support for multiparented features in genbank mode:
// NOTE(review): the visible condition does not test for genbank mode even
// though the comment above mentions it; the statement that constructs pErr
// (source lines 318-319) is missing from this listing -- confirm.
317  if (pRecord->IsMultiParent()) {
320  eDiag_Fatal,
321  0,
322  "This GFF3 reader does not support multiparented features"));
323  ProcessError(*pErr, pEC);
324  }
325
326  //append feature to annot:
327  if (!xUpdateAnnotFeature(*pRecord, annot, pEC)) {
328  return false;
329  }
330
332  mParsingAlignment = false;
333  return true;
334 }
335 
336 
337 // ----------------------------------------------------------------------------
// Parses one alignment line and files the resulting CSeq_align under its id
// in mAlignmentData. The id is the "ID" attribute when present, otherwise the
// record's Id(). Ids are appended to mAlignmentData.mIds the first time they
// are seen. In genbank mode the line is accepted without doing anything.
// NOTE(review): source lines 338 (declarator) and 370 are missing from this
// listing.
339  const string& strLine)
340 // ----------------------------------------------------------------------------
341 {
342  if (IsInGenbankMode()) {
343  return true;
344  }
345  auto& ids = mAlignmentData.mIds;
346  auto& alignments = mAlignmentData.mAlignments;
347
348  unique_ptr<CGff2Record> pRecord(x_CreateRecord());
349
350  if ( !pRecord->AssignFromGff(strLine) ) {
351  return false;
352  }
353
354  string id;
355  if ( !pRecord->GetAttribute("ID", id) ) {
356  id = pRecord->Id();
357  }
358
// The id is registered before the alignment is built, so a failure in
// x_CreateAlignment below leaves an id in `ids` without an entry in
// `alignments` for it (unless one already existed).
359  if (alignments.find(id) == alignments.end()) {
360  ids.push_back(id);
361  }
362
363  CRef<CSeq_align> alignment;
364  if (!x_CreateAlignment(*pRecord, alignment)) {
365  return false;
366  }
367
368  alignments[id].push_back(alignment);
369
371  mParsingAlignment = true;
372  return true;
373 }
374 
375 // ----------------------------------------------------------------------------
// Registers the record with the location merger and dispatches on the
// record's normalized type:
//   exon / five_prime_utr / three_prime_utr -> xUpdateAnnotExon
//   cds                                     -> xUpdateAnnotCds
//   gene                                    -> xUpdateAnnotGene
//   *rna and *_gene_segment                 -> xUpdateAnnotRna
//   region                                  -> xUpdateAnnotRegion
//   anything else                           -> xUpdateAnnotGeneric
// Returns the result of the selected handler.
// NOTE(review): the declarator (source line 376) is missing from this listing.
377  const CGff2Record& gffRecord,
378  CSeq_annot& annot,
379  ILineErrorListener* pEC)
380 // ----------------------------------------------------------------------------
381 {
382  //if (gffRecord.Type() == "CDS") {
383  // if (gffRecord.SeqStart() == 114392786) {
384  // cerr << "";
385  // }
386  //}
387
388  mpLocations->AddRecord(gffRecord);
389
390  CRef< CSeq_feat > pFeature(new CSeq_feat);
391
392  auto recType = gffRecord.NormalizedType();
393  if (recType == "exon" || recType == "five_prime_utr" || recType == "three_prime_utr") {
394  return xUpdateAnnotExon(gffRecord, pFeature, annot, pEC);
395  }
396  if (recType == "cds") {
397  return xUpdateAnnotCds(gffRecord, pFeature, annot, pEC);
398  }
399  if (recType == "gene") {
400  return xUpdateAnnotGene(gffRecord, pFeature, annot, pEC);
401  }
402  if (NStr::EndsWith(recType, "rna")) {
403  return xUpdateAnnotRna(gffRecord, pFeature, annot, pEC);
404  }
405  // allow exon before VDJC gene segment to not crash as data error
406  if (NStr::EndsWith(recType, "_gene_segment")) {
407  return xUpdateAnnotRna(gffRecord, pFeature, annot, pEC);
408  }
409  if (recType == "region") {
410  return xUpdateAnnotRegion(gffRecord, pFeature, annot, pEC);
411  }
412  if (!xUpdateAnnotGeneric(gffRecord, pFeature, annot, pEC)) {
413  return false;
414  }
415  return true;
416 }
417 
418 
419 // ----------------------------------------------------------------------------
// Checks that the record's interval lies within the interval stored for
// mrnaId in mMrnaLocs. Throws `error` when no interval is stored for mrnaId
// or when the record starts before / ends after the stored interval.
// NOTE(review): source lines 420 (declarator) and 430, 432, 444, 446 (parts
// of the statements constructing `error`) are missing from this listing.
421  const string& mrnaId,
422  const CGff2Record& exon)
423 // ----------------------------------------------------------------------------
424 {
425  map<string,CRef<CSeq_interval> >::const_iterator cit = mMrnaLocs.find(mrnaId);
426  if (cit == mMrnaLocs.end()) {
427  string message = "Bad data line: ";
428  message += exon.Type();
429  message += " referring to non-existent parent feature.";
431  eDiag_Error,
433  message);
434  throw error;
435  }
436  const CSeq_interval& containingInt = cit->second.GetObject();
437  const CRef<CSeq_loc> pContainedLoc = exon.GetSeqLoc(m_iFlags, mSeqIdResolve);
// The record's location is read as a single interval via GetInt().
438  const CSeq_interval& containedInt = pContainedLoc->GetInt();
439  if (containedInt.GetFrom() < containingInt.GetFrom() ||
440  containedInt.GetTo() > containingInt.GetTo()) {
441  string message = "Bad data line: ";
442  message += exon.Type();
443  message += " extends beyond parent feature.";
445  eDiag_Error,
447  message);
448  throw error;
449  }
450 }
451 
452 // ----------------------------------------------------------------------------
// Handles an exon-like record (exon, five_prime_utr, three_prime_utr).
//  - no Parent attribute: nothing to do, returns true.
//  - parent feature not known yet: the record is queued as a pending exon.
//  - parent is an RNA: the exon location is verified against the RNA first.
//  - parent is a gene: the exon becomes a feature of its own in the annot.
//  - otherwise: the feature found through `fit` is updated with the record.
// NOTE(review): source lines 453 (declarator) and 476 (the statement that
// defines `fit`) are missing from this listing.
454  const CGff2Record& record,
455  CRef<CSeq_feat> pFeature,
456  CSeq_annot& annot,
457  ILineErrorListener* pEC)
458 // ----------------------------------------------------------------------------
459 {
460  string parentId;
461  if (record.GetAttribute("Parent", parentId)) {
462  CRef<CSeq_feat> pParent;
463  if (!x_GetFeatureById(parentId, pParent)) {
464  xAddPendingExon(parentId, record);
465  return true;
466  }
467  if (pParent->GetData().IsRna()) {
468  xVerifyExonLocation(parentId, record);
469  }
470  if (pParent->GetData().IsGene()) {
471  if (!xInitializeFeature(record, pFeature)) {
472  return false;
473  }
474  return xAddFeatureToAnnot(pFeature, annot);
475  }
477  if (fit != m_MapIdToFeature.end()) {
// This local pParent shadows the one declared at the top of the enclosing
// block.
478  CRef<CSeq_feat> pParent = fit->second;
479  if (!record.UpdateFeature(m_iFlags, pParent)) {
480  return false;
481  }
482  }
483  }
484  return true;
485 }
486 
487 
488 // ----------------------------------------------------------------------------
// Merges the record's location into its parent feature when that parent is
// an RNA. Records without a Parent attribute, and records whose parent is not
// an RNA, are accepted unchanged. If the parent is not known yet the record
// is queued as a pending exon. Returns false only when updating the parent
// feature fails.
// NOTE(review): the declarator (source line 489) is missing from this listing.
490  const CGff2Record& record,
491  //CRef<CSeq_feat> pFeature,
492  //CSeq_annot& annot,
493  ILineErrorListener* pEC)
494 // ----------------------------------------------------------------------------
495 {
496  string parentId;
497  if (!record.GetAttribute("Parent", parentId)) {
498  return true;
499  }
// NOTE(review): `parents` is never used in the visible code -- confirm
// whether it can be dropped.
500  list<string> parents;
501  CRef<CSeq_feat> pParent;
502  if (!x_GetFeatureById(parentId, pParent)) {
503  // Danger:
504  // We don't know whether the CDS parent is indeed an RNA and it could
505  // possibly be a gene.
506  // If the parent is indeed a gene then gene construction will have to
507  // purge this pending exon (or it will cause a sanity check to fail
508  // during post processing).
509  xAddPendingExon(parentId, record);
510  return true;
511  }
512  if (!pParent->GetData().IsRna()) {
513  return true;
514  }
515  xVerifyExonLocation(parentId, record);
516  if (!record.UpdateFeature(m_iFlags, pParent)) {
517  return false;
518  }
519  return true;
520 }
521 
522 
523 // ----------------------------------------------------------------------------
// Handles a CDS record. The record's location is first joined into its parent
// RNA and its ID/Parent pairing is verified. The record is then registered
// with the location merger under its CDS id. Only the first record seen for a
// given CDS id creates a feature; later records with the same id just
// contribute their location. A new feature gets a "Parent" qualifier and
// parent xrefs (plus grandparent xrefs when fGeneXrefs is set).
// NOTE(review): the declarator (source line 524) is missing from this listing.
525  const CGff2Record& record,
526  CRef<CSeq_feat> pFeature,
527  CSeq_annot& annot,
528  ILineErrorListener* pEC)
529 // ----------------------------------------------------------------------------
530 {
531  if (!xJoinLocationIntoRna(record, pEC)) {
532  return false;
533  }
534  xVerifyCdsParents(record);
535
536  string cdsId = xMakeRecordId(record);
537  mpLocations->AddRecordForId(cdsId, record);
538
539  auto pExistingFeature = m_MapIdToFeature.find(cdsId);
540  if (pExistingFeature != m_MapIdToFeature.end()) {
541  return true;
542  }
543
544  m_MapIdToFeature[cdsId] = pFeature;
// NOTE(review): the results of xInitializeFeature and xAddFeatureToAnnot
// are ignored here, unlike in the other handlers -- confirm this is intended.
545  xInitializeFeature(record, pFeature);
546  xAddFeatureToAnnot(pFeature, annot);
547
548  string parentId;
549  record.GetAttribute("Parent", parentId);
550  if (!parentId.empty()) {
551  xFeatureSetQualifier("Parent", parentId, pFeature);
552  xFeatureSetXrefParent(parentId, pFeature);
553  if (m_iFlags & fGeneXrefs) {
554  xFeatureSetXrefGrandParent(parentId, pFeature);
555  }
556  }
557  return true;
558 }
559 
560 
561 // ----------------------------------------------------------------------------
// Cross-references pFeature with each grandparent, i.e. each id listed
// (comma separated) in the "Parent" qualifier of the parent feature. For
// every grandparent an xref is added in both directions. Returns false as
// soon as a lookup through `it` or `gpit` fails; xrefs added for earlier
// grandparents are left in place.
// NOTE(review): source lines 562 (declarator), 567 (definition of `it`) and
// 577 (definition of `gpit`) are missing from this listing.
563  const string& parent,
564  CRef<CSeq_feat> pFeature)
565 // ----------------------------------------------------------------------------
566 {
568  if (it == m_MapIdToFeature.end()) {
569  return false;
570  }
571  CRef<CSeq_feat> pParent = it->second;
572  const string &grandParentsStr = pParent->GetNamedQual("Parent");
573  list<string> grandParents;
574  NStr::Split(grandParentsStr, ",", grandParents, 0);
575  for (list<string>::const_iterator gpcit = grandParents.begin();
576  gpcit != grandParents.end(); ++gpcit) {
578  if (gpit == m_MapIdToFeature.end()) {
579  return false;
580  }
581  CRef<CSeq_feat> pGrandParent = gpit->second;
582
583  //xref grandchild->grandparent
584  CRef<CFeat_id> pGrandParentId(new CFeat_id);
585  pGrandParentId->Assign(pGrandParent->GetId());
586  CRef<CSeqFeatXref> pGrandParentXref(new CSeqFeatXref);
587  pGrandParentXref->SetId(*pGrandParentId);
588  pFeature->SetXref().push_back(pGrandParentXref);
589
590  //xref grandparent->grandchild
591  CRef<CFeat_id> pGrandChildId(new CFeat_id);
592  pGrandChildId->Assign(pFeature->GetId());
593  CRef<CSeqFeatXref> pGrandChildXref(new CSeqFeatXref);
594  pGrandChildXref->SetId(*pGrandChildId);
595  pGrandParent->SetXref().push_back(pGrandChildXref);
596  }
597  return true;
598 }
599 
600 // ----------------------------------------------------------------------------
// Cross-references pChild with its parent feature in both directions: a copy
// of the parent's feature id is added to the child's xrefs and a copy of the
// child's feature id to the parent's xrefs. Returns false when the lookup
// through `it` fails.
// NOTE(review): source lines 601 (declarator) and 606 (definition of `it`)
// are missing from this listing.
602  const string& parent,
603  CRef<CSeq_feat> pChild)
604 // ----------------------------------------------------------------------------
605 {
607  if (it == m_MapIdToFeature.end()) {
608  return false;
609  }
610  CRef<CSeq_feat> pParent = it->second;
611
612  //xref child->parent
613  CRef<CFeat_id> pParentId(new CFeat_id);
614  pParentId->Assign(pParent->GetId());
615  CRef<CSeqFeatXref> pParentXref(new CSeqFeatXref);
616  pParentXref->SetId(*pParentId);
617  pChild->SetXref().push_back(pParentXref);
618
619  //xref parent->child
620  CRef<CFeat_id> pChildId(new CFeat_id);
621  pChildId->Assign(pChild->GetId());
622  CRef<CSeqFeatXref> pChildXref(new CSeqFeatXref);
623  pChildXref->SetId(*pChildId);
624  pParent->SetXref().push_back(pChildXref);
625  return true;
626 }
627 
628 // ----------------------------------------------------------------------------
// Looks for an already registered feature that the record continues. Returns
// false when the record has no ID attribute or the lookup through `it` finds
// nothing. On a hit, underConstruction is set to the registered feature and
// true is returned. If CSoMap can map the record type to a feature and that
// feature's subtype differs from the registered one, `fatal` is thrown
// (duplicate feature ID).
// NOTE(review): source lines 629 (declarator), 638 (definition of `it`) and
// 643, 645 (parts of the statement constructing `fatal`) are missing from
// this listing.
630  const CGff2Record& record,
631  CRef<CSeq_feat>& underConstruction)
632 // ----------------------------------------------------------------------------
633 {
634  string id;
635  if (!record.GetAttribute("ID", id)) {
636  return false;
637  }
639  if (it == m_MapIdToFeature.end()) {
640  return false;
641  }
642
644  eDiag_Fatal,
646  "Bad data line: Duplicate feature ID \"" + id + "\".");
647  CSeq_feat tempFeat;
648  if (CSoMap::SoTypeToFeature(record.Type(), tempFeat)) {
649  if (it->second->GetData().GetSubtype() != tempFeat.GetData().GetSubtype()) {
650  throw fatal;
651  }
652  }
653
654  underConstruction = it->second;
655  return true;
656 }
657 
658 // ----------------------------------------------------------------------------
// Fallback handler for record types without a dedicated handler.
//  - If the record continues a feature that is already under construction,
//    that feature is updated and the result returned.
//  - "stop_codon_read_through" and "selenocysteine" records become a code
//    break ('X' resp. 'U') on the CDS found through `it`; `error` is thrown
//    when the record has no Parent or the lookup fails.
//  - Anything else becomes a new feature in the annot, registered under its
//    ID attribute when it has one.
// NOTE(review): source lines 659 (declarator), 675, 677, 681, 683, 685 and
// 718-719 are missing from this listing.
660  const CGff2Record& record,
661  CRef<CSeq_feat> pFeature,
662  CSeq_annot& annot,
663  ILineErrorListener* pEC)
664 // ----------------------------------------------------------------------------
665 {
666  CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
667  if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
668  return record.UpdateFeature(m_iFlags, pUnderConstruction);
669  }
670
671  string featType = record.Type();
672  if (featType == "stop_codon_read_through" || featType == "selenocysteine") {
673  string cdsParent;
674  if (!record.GetAttribute("Parent", cdsParent)) {
676  eDiag_Error,
678  "Bad data line: Unassigned code break.");
679  throw error;
680  }
682  if (it == m_MapIdToFeature.end()) {
684  eDiag_Error,
686  "Bad data line: Code break assigned to missing feature.");
687  throw error;
688  }
689
690  CRef<CCode_break> pCodeBreak(new CCode_break);
691  CSeq_interval& cbLoc = pCodeBreak->SetLoc().SetInt();
692  CRef< CSeq_id > pId = mSeqIdResolve(record.Id(), m_iFlags, true);
693  cbLoc.SetId(*pId);
694  cbLoc.SetFrom(static_cast<TSeqPos>(record.SeqStart()));
695  cbLoc.SetTo(static_cast<TSeqPos>(record.SeqStop()));
696  if (record.IsSetStrand()) {
697  cbLoc.SetStrand(record.Strand());
698  }
699  pCodeBreak->SetAa().SetNcbieaa(
700  (featType == "selenocysteine") ? 'U' : 'X');
701  CRef<CSeq_feat> pCds = it->second;
702  CCdregion& cdRegion = pCds->SetData().SetCdregion();
703  list< CRef< CCode_break > >& codeBreaks = cdRegion.SetCode_break();
704  codeBreaks.push_back(pCodeBreak);
705  return true;
706  }
707  if (!xInitializeFeature(record, pFeature)) {
708  return false;
709  }
710  if (! xAddFeatureToAnnot(pFeature, annot)) {
711  return false;
712  }
713  string strId;
714  if ( record.GetAttribute("ID", strId)) {
715  m_MapIdToFeature[strId] = pFeature;
716  }
// NOTE(review): the rest of this condition and the declaration of rnaLoc
// (source lines 718-719) are missing from this listing. strId is still empty
// at this point when the record carries no ID attribute.
717  if (pFeature->GetData().IsRna() ||
720  rnaLoc->Assign(pFeature->GetLocation().GetInt());
721  mMrnaLocs[strId] = rnaLoc;
722  }
723  return true;
724 }
725 
726 // ----------------------------------------------------------------------------
// Handles an RNA-like record. If the record continues a feature already under
// construction, that feature is updated. Otherwise a new feature is
// initialized, cross-referenced with each of its parents when fGeneXrefs is
// set, registered under its ID, and its interval is remembered in mMrnaLocs.
// Exons that were queued for this RNA are then processed, and the feature is
// finally added to the annot. Throws `error` on a bad parent assignment or
// when the feature location is not a single interval.
// NOTE(review): source lines 727 (declarator), 750, 752, 766 and 768 (parts
// of the statements constructing `error`) are missing from this listing.
728  const CGff2Record& record,
729  CRef<CSeq_feat> pFeature,
730  CSeq_annot& annot,
731  ILineErrorListener* pEC)
732 // ----------------------------------------------------------------------------
733 {
734  CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
735  if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
736  return record.UpdateFeature(m_iFlags, pUnderConstruction);
737  }
738
739  if (!xInitializeFeature(record, pFeature)) {
740  return false;
741  }
742  string parentsStr;
743  if ((m_iFlags & fGeneXrefs) && record.GetAttribute("Parent", parentsStr)) {
744  list<string> parents;
745  NStr::Split(parentsStr, ",", parents, 0);
746  for (list<string>::const_iterator cit = parents.begin();
747  cit != parents.end();
748  ++cit) {
749  if (!xFeatureSetXrefParent(*cit, pFeature)) {
751  eDiag_Error,
753  "Bad data line: mRNA record with bad parent assignment.");
754  throw error;
755  }
756  }
757  }
758
759  string strId;
760  if ( record.GetAttribute("ID", strId)) {
761  m_MapIdToFeature[strId] = pFeature;
762  }
763  CRef<CSeq_interval> mrnaLoc(new CSeq_interval);
764  CSeq_loc::E_Choice choice = pFeature->GetLocation().Which();
765  if (choice != CSeq_loc::e_Int) {
767  eDiag_Error,
769  "Internal error: Unexpected location type.");
770  throw error;
771  }
772  mrnaLoc->Assign(pFeature->GetLocation().GetInt());
// NOTE(review): strId is empty here when the record has no ID attribute, so
// the interval is then stored under the empty key -- confirm intended.
773  mMrnaLocs[strId] = mrnaLoc;
774
775  list<CGff2Record> pendingExons;
776  xGetPendingExons(strId, pendingExons);
// NOTE(review): exonRecord is taken by value (one record copy per
// iteration), and the pFeature declared inside the loop shadows the
// function parameter of the same name.
777  for (auto exonRecord: pendingExons) {
778  CRef< CSeq_feat > pFeature(new CSeq_feat);
779  xUpdateAnnotExon(exonRecord, pFeature, annot, pEC);
780  }
781  if (! xAddFeatureToAnnot(pFeature, annot)) {
782  return false;
783  }
784  return true;
785 }
786 
787 // ----------------------------------------------------------------------------
// Handles a gene record. If the record continues a feature already under
// construction, that feature is updated. Otherwise a new feature is
// initialized, added to the annot and registered under its ID attribute.
// Exons queued under this gene's id are removed from the pending list and
// discarded.
// NOTE(review): the declarator (source line 788) is missing from this listing.
789  const CGff2Record& record,
790  CRef<CSeq_feat> pFeature,
791  CSeq_annot& annot,
792  ILineErrorListener* pEC)
793 // ----------------------------------------------------------------------------
794 {
795  CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
796  if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
797  return record.UpdateFeature(m_iFlags, pUnderConstruction);
798  }
799
800  if (!xInitializeFeature(record, pFeature)) {
801  return false;
802  }
803  if (! xAddFeatureToAnnot(pFeature, annot)) {
804  return false;
805  }
806  string strId;
807  if ( record.GetAttribute("ID", strId)) {
808  m_MapIdToFeature[strId] = pFeature;
809  }
810  // address corner case:
811  // parent of CDS is a gene but the CDS is listed before the gene so at the
812  // time we did not know the parent would be a gene.
813  // remedy: throw out any collected cds locations that were meant for RNA
814  // construction.
// The fetched list is a local that is simply dropped when the function
// returns.
815  list<CGff2Record> pendingExons;
816  xGetPendingExons(strId, pendingExons);
817  return true;
818 }
819 
820 // ----------------------------------------------------------------------------
// Handles a region record: initializes a new feature straight from the
// record, adds it to the annot and, when the record has an ID attribute,
// records both the ID -> sequence id mapping and the ID -> feature mapping.
// NOTE(review): the declarator (source line 821) is missing from this listing.
822  const CGff2Record& record,
823  CRef<CSeq_feat> pFeature,
824  CSeq_annot& annot,
825  ILineErrorListener* pEC)
826 // ----------------------------------------------------------------------------
827 {
828  if (!record.InitializeFeature(m_iFlags, pFeature)) {
829  return false;
830  }
831
832  if (! xAddFeatureToAnnot(pFeature, annot)) {
833  return false;
834  }
835  string strId;
836  if ( record.GetAttribute("ID", strId)) {
837  mIdToSeqIdMap[strId] = record.Id();
838  m_MapIdToFeature[strId] = pFeature;
839  }
840  return true;
841 }
842 
843 // ----------------------------------------------------------------------------
// Appends the feature to the annot's feature table. Always returns true.
// NOTE(review): the declarator (source line 844) is missing from this listing.
845  CRef< CSeq_feat > pFeature,
846  CSeq_annot& annot )
847 // ----------------------------------------------------------------------------
848 {
849  annot.SetData().SetFtable().push_back( pFeature ) ;
850  return true;
851 }
852 
853 // ----------------------------------------------------------------------------
// Verifies that all records sharing one CDS ID name the same Parent. The
// first record seen for an ID stores its Parent in mCdsParentMap; a later
// record with the same ID but a different Parent causes `error` to be thrown.
// Records without an ID attribute are not checked.
// NOTE(review): source lines 854 (declarator), 864 (definition of `it`) and
// 872, 874 (parts of the statement constructing `error`) are missing from
// this listing.
855  const CGff2Record& record)
856 // ----------------------------------------------------------------------------
857 {
858  string id;
859  string parentId;
860  if (!record.GetAttribute("ID", id)) {
861  return;
862  }
863  record.GetAttribute("Parent", parentId);
865  if (it == mCdsParentMap.end()) {
866  mCdsParentMap[id] = parentId;
867  return;
868  }
869  if (it->second == parentId) {
870  return;
871  }
873  eDiag_Error,
875  "Bad data line: CDS record with bad parent assignments.");
876  throw error;
877 }
878 
879 // ----------------------------------------------------------------------------
// Runs the base-class initialization and returns false if it fails.
// NOTE(review): source lines 880 (declarator) and 886 (a body statement) are
// missing from this listing; the call to CGff2Reader::xReadInit suggests this
// is the xReadInit override -- confirm.
881 // ----------------------------------------------------------------------------
882 {
883  if (!CGff2Reader::xReadInit()) {
884  return false;
885  }
887  return true;
888 }
889 
890 // ----------------------------------------------------------------------------
// Decides whether records of the given feature type are skipped. The type is
// first resolved through CSoMap::ResolveSoAlias; all lookups below use the
// case-insensitive comparator PNocase.
//  - Types in ignoredTypesAlways are always skipped.
//  - Outside genbank mode nothing else is skipped.
//  - In genbank mode, types in specialTypesGenbank are kept, types in
//    ignoredTypesGenbank are skipped, and all remaining types are kept.
// NOTE(review): the declarator (source line 891) is missing from this listing.
892  const string& featureType)
893 // ----------------------------------------------------------------------------
894 {
895  typedef CStaticArraySet<string, PNocase> STRINGARRAY;
896
897  string ftype(CSoMap::ResolveSoAlias(featureType));
898
899  static const char* const ignoredTypesAlways_[] = {
900  "protein",
901  "start_codon", // also part of a cds feature
902  "stop_codon", // in GFF3, also part of a cds feature
903  };
904  DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesAlways, ignoredTypesAlways_);
905  STRINGARRAY::const_iterator cit = ignoredTypesAlways.find(ftype);
906  if (cit != ignoredTypesAlways.end()) {
907  return true;
908  }
909  if (!IsInGenbankMode()) {
910  return false;
911  }
912
913  /* -genbank mode:*/
914  static const char* const specialTypesGenbank_[] = {
915  "antisense_RNA",
916  "autocatalytically_spliced_intron",
917  "guide_RNA",
918  "hammerhead_ribozyme",
919  "lnc_RNA",
920  "miRNA",
921  "piRNA",
922  "rasiRNA",
923  "ribozyme",
924  "RNase_MRP_RNA",
925  "RNase_P_RNA",
926  "scRNA",
927  "selenocysteine",
928  "siRNA",
929  "snoRNA",
930  "snRNA",
931  "SRP_RNA",
932  "stop_codon_read_through",
933  "telomerase_RNA",
934  "vault_RNA",
935  "Y_RNA"
936  };
937  DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, specialTypesGenbank, specialTypesGenbank_);
938
939  static const char* const ignoredTypesGenbank_[] = {
940  "apicoplast_chromosome",
941  "assembly",
942  "cDNA_match",
943  "chloroplast_chromosome",
944  "chromoplast_chromosome",
945  "chromosome",
946  "contig",
947  "cyanelle_chromosome",
948  "dna_chromosome",
949  "EST_match",
950  "expressed_sequence_match",
951  "intron",
952  "leucoplast_chromosome",
953  "macronuclear_chromosome",
954  "match",
955  "match_part",
956  "micronuclear_chromosome",
957  "mitochondrial_chromosome",
958  "nuclear_chromosome",
959  "nucleomorphic_chromosome",
960  "nucleotide_motif",
961  "nucleotide_to_protein_match",
962  "partial_genomic_sequence_assembly",
963  "protein_match",
964  "replicon",
965  "rna_chromosome",
966  "sequence_assembly",
967  "supercontig",
968  "translated_nucleotide_match",
969  "ultracontig",
970  };
971  DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesGenbank, ignoredTypesGenbank_);
972
// The special list is consulted first, so it takes precedence over the
// ignore list.
973  cit = specialTypesGenbank.find(ftype);
974  if (cit != specialTypesGenbank.end()) {
975  return false;
976  }
977
978  cit = ignoredTypesGenbank.find(ftype);
979  if (cit != ignoredTypesGenbank.end()) {
980  return true;
981  }
982
983  return false;
984 }
985 
986 // ----------------------------------------------------------------------------
// Initializes pFeature from the record and, when the record carries an "ID"
// attribute, remembers which sequence id that ID belongs to in mIdToSeqIdMap.
// Returns false when the record fails to initialize the feature.
// NOTE(review): the function name (source line 988) is missing from this
// listing.
987 bool
989  const CGff2Record& record,
990  CRef<CSeq_feat> pFeature)
991 // ----------------------------------------------------------------------------
992 {
993  if (!record.InitializeFeature(m_iFlags, pFeature)) {
994  return false;
995  }
996  const auto& attrs = record.Attributes();
997  const auto it = attrs.find("ID");
998  if (it != attrs.end()) {
999  mIdToSeqIdMap[it->second] = record.Id();
1000  }
1001  return true;
1002 }
1003 
1004 // ----------------------------------------------------------------------------
// Queues a copy of exonRecord under rnaId in mPendingExons, creating the
// list for that id first if necessary.
// NOTE(review): source lines 1006 (function name) and 1011 (definition of
// `it`) are missing from this listing.
1005 void
1007  const string& rnaId,
1008  const CGff2Record& exonRecord)
1009 // ----------------------------------------------------------------------------
1010 {
1012  if (it == mPendingExons.end()) {
1013  mPendingExons[rnaId] = list<CGff2Record>();
1014  }
1015  mPendingExons[rnaId].push_back(exonRecord);
1016 }
1017 
1018 // ----------------------------------------------------------------------------
// Hands the records queued under rnaId over to pendingExons (by swap) and
// removes the entry from mPendingExons. pendingExons is left untouched when
// the lookup through `it` finds nothing.
// NOTE(review): source lines 1020 (function name) and 1025 (definition of
// `it`) are missing from this listing.
1019 void
1021  const string& rnaId,
1022  list<CGff2Record>& pendingExons)
1023 // ----------------------------------------------------------------------------
1024 {
1026  if (it == mPendingExons.end()) {
1027  return;
1028  }
1029  pendingExons.swap(mPendingExons[rnaId]);
1030  mPendingExons.erase(rnaId);
1031 }
1032 
1033 // ----------------------------------------------------------------------------
// Finishes an annot once all lines have been read. When alignment data was
// collected, only the alignments are emitted. When no features were counted,
// nothing is done. Otherwise every id known to the location merger that also
// has a registered feature gets its location replaced by the merged one (CDS
// features additionally get their frame set), after which base-class post
// processing runs.
// NOTE(review): source lines 1034 (declarator) and 1071 (the argument of
// SetFrame) are missing from this listing.
1035  CSeq_annot& annot)
1036 // ----------------------------------------------------------------------------
1037 {
1038  if (mAlignmentData) {
1039  xProcessAlignmentData(annot);
1040  return;
1041  }
1042  if (!mCurrentFeatureCount) {
1043  return;
1044  }
1045
1046  /*
1047  for (const auto& it: mPendingExons) {
1048  CReaderMessage warning(
1049  eDiag_Warning,
1050  m_uLineNumber,
1051  "Bad data line: Record references non-existent Parent=" + it.first);
1052  m_pMessageHandler->Report(warning);
1053  }
1054  */
1055
1056  //location fixup:
// NOTE(review): itLocation and id are taken by value, so each iteration
// copies a map entry -- a const reference may be intended; confirm.
1057  for (auto itLocation: mpLocations->LocationMap()) {
1058  auto id = itLocation.first;
1059  auto itFeature = m_MapIdToFeature.find(id);
1060  if (itFeature == m_MapIdToFeature.end()) {
1061  continue;
1062  }
1063  CRef<CSeq_loc> pNewLoc(new CSeq_loc);
1064  CCdregion::EFrame frame;
1065  mpLocations->MergeLocation(pNewLoc, frame, itLocation.second);
1066  CRef<CSeq_feat> pFeature = itFeature->second;
1067  pFeature->SetLocation(*pNewLoc);
1068  if (pFeature->GetData().IsCdregion()) {
1069  auto& cdrData = pFeature->SetData().SetCdregion();
1070  cdrData.SetFrame(
1072  }
1073  }
1074
1075  return CGff2Reader::xPostProcessAnnot(annot);
1076 }
1077 
1078 // ----------------------------------------------------------------------------
// Handles a sequence-region pragma line. The line is split on blanks and
// tabs (runs of delimiters merged). Fewer than two tokens is an error; when
// at least four tokens are present the fourth is parsed as a non-negative
// integer and used as the sequence size, otherwise the size stays 0. The size
// is registered with the location merger under both the raw id (second
// token) and the FASTA string of the resolved id. Throws `warning` on a
// malformed pragma.
// NOTE(review): source lines 1079 (declarator), 1088 and 1099 (first argument
// of each CReaderMessage) are missing from this listing.
1080  const string& pragma)
1081 // ----------------------------------------------------------------------------
1082 {
1083  TSeqPos sequenceSize(0);
1084  vector<string> tokens;
1085  NStr::Split(pragma, " \t", tokens, NStr::fSplit_MergeDelimiters);
1086  if (tokens.size() < 2) {
1087  CReaderMessage warning(
1089  m_uLineNumber,
1090  "Bad sequence-region pragma - ignored.");
1091  throw warning;
1092  }
1093  if (tokens.size() >= 4) {
1094  try {
1095  sequenceSize = NStr::StringToNonNegativeInt(tokens[3]);
1096  }
1097  catch(exception&) {
1098  CReaderMessage warning(
1100  m_uLineNumber,
1101  "Bad sequence-region pragma - ignored.");
1102  throw warning;
1103  }
1104  }
1105  mpLocations->SetSequenceSize(tokens[1], sequenceSize);
1106  auto resolvedId = mSeqIdResolve(tokens[1], m_iFlags, true)->AsFastaString();
1107  mpLocations->SetSequenceSize(resolvedId, sequenceSize);
1108
1109 }
1110 
1111 // ----------------------------------------------------------------------------
// Returns the sequence size reported by the location merger.
// NOTE(review): the declarator (source line 1112) is missing from this
// listing, so the function name and return type are not visible here.
1113 // ----------------------------------------------------------------------------
1114 {
1115  return mpLocations->SequenceSize();
1116 }
1117 
1118 // ----------------------------------------------------------------------------
// Const accessor: returns the size the location merger reports for seqId.
// NOTE(review): the declarator (source line 1119) is missing from this
// listing, so the function name and return type are not visible here.
1120  const string& seqId) const
1121 // ----------------------------------------------------------------------------
1122 {
1123  return mpLocations->GetSequenceSize(seqId);
1124 }
1125 
1126 // ----------------------------------------------------------------------------
// Asks the location merger to validate what it has collected; the annot
// argument itself is unused (unnamed parameter).
// NOTE(review): the declarator (source line 1127) is missing from this
// listing, so the function name is not visible here.
1128  const CSeq_annot& )
1129 // ----------------------------------------------------------------------------
1130 {
1131  mpLocations->Validate();
1132 }
1133 
1134 
1135 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
CCdregion –.
Definition: Cdregion.hpp:66
CCode_break –.
Definition: Code_break.hpp:66
CFeat_id –.
Definition: Feat_id.hpp:66
virtual bool x_CreateAlignment(const CGff2Record &gff, CRef< CSeq_align > &pAlign)
virtual bool xParseStructuredComment(const string &)
static bool IsAlignmentData(const string &)
CRef< CAnnotdesc > m_CurrentBrowserInfo
void xPostProcessAnnot(CSeq_annot &) override
bool mParsingAlignment
bool x_GetFeatureById(const string &, CRef< CSeq_feat > &)
IdToFeatureMap m_MapIdToFeature
bool xFeatureSetQualifier(const string &, const string &, CRef< CSeq_feat >)
bool x_MergeAlignments(const list< CRef< CSeq_align >> &alignment_list, CRef< CSeq_align > &processed)
virtual bool xIsIgnoredFeatureId(const string &)
unsigned int mCurrentFeatureCount
bool IsInGenbankMode() const
bool GetAttribute(const string &, string &) const
Definition: gff2_data.cpp:305
virtual bool UpdateFeature(TReaderFlags, CRef< CSeq_feat >, SeqIdResolver=nullptr) const
Definition: gff2_data.cpp:521
virtual bool AssignFromGff(const string &)
Definition: gff2_data.cpp:214
virtual bool InitializeFeature(TReaderFlags, CRef< CSeq_feat >, SeqIdResolver=nullptr) const
Definition: gff2_data.cpp:508
const TAttributes & Attributes() const
Definition: gff2_data.hpp:80
static string xNormalizedAttributeKey(const CTempString &)
Definition: gff2_data.cpp:335
TAttributes m_Attributes
Definition: gff2_data.hpp:171
string x_NormalizedAttributeKey(const string &)
bool AssignFromGff(const string &) override
static string xNextGenericId()
Definition: gff3_reader.cpp:98
void xPostProcessAnnot(CSeq_annot &) override
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &lr, ILineErrorListener *pErrors=nullptr) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
virtual bool xFindFeatureUnderConstruction(const CGff2Record &, CRef< CSeq_feat > &)
bool xParseFeature(const string &, CSeq_annot &, ILineErrorListener *) override
virtual bool xUpdateAnnotCds(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
bool xUpdateAnnotFeature(const CGff2Record &, CSeq_annot &, ILineErrorListener *) override
virtual bool xUpdateAnnotGeneric(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
virtual bool xUpdateAnnotRegion(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
map< string, string > mIdToSeqIdMap
CGff3ReadRecord * x_CreateRecord() override
virtual bool xUpdateAnnotExon(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
virtual ~CGff3Reader()
void xVerifyExonLocation(const string &, const CGff2Record &)
virtual void xAddPendingExon(const string &, const CGff2Record &)
virtual bool xJoinLocationIntoRna(const CGff2Record &, ILineErrorListener *)
virtual bool xFeatureSetXrefParent(const string &, CRef< CSeq_feat >)
void xProcessAlignmentData(CSeq_annot &pAnnot)
TSeqPos SequenceSize() const
virtual bool xUpdateAnnotRna(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
map< string, string > mCdsParentMap
virtual bool xInitializeFeature(const CGff2Record &, CRef< CSeq_feat >)
bool xReadInit() override
CGff3Reader(TReaderFlags uFlags, const string &name="", const string &title="", SeqIdResolver resolver=CReadUtil::AsSeqId, CReaderListener *=nullptr)
SAlignmentData mAlignmentData
shared_ptr< CGff3LocationMerger > mpLocations
PENDING_EXONS mPendingExons
TSeqPos GetSequenceSize(const string &) const
bool xIsIgnoredFeatureType(const string &) override
virtual void xValidateAnnot(const CSeq_annot &) override
string xMakeRecordId(const CGff2Record &record)
Definition: gff3_reader.cpp:72
map< string, CRef< CSeq_interval > > mMrnaLocs
void xProcessSequenceRegionPragma(const string &pragma) override
static unsigned int msGenericIdCounter
void xVerifyCdsParents(const CGff2Record &)
virtual bool xParseAlignment(const string &strLine)
virtual bool xUpdateAnnotGene(const CGff2Record &, CRef< CSeq_feat >, CSeq_annot &, ILineErrorListener *)
virtual void xGetPendingExons(const string &, list< CGff2Record > &)
virtual bool xFeatureSetXrefGrandParent(const string &, CRef< CSeq_feat >)
void xProcessData(const TReaderData &, CSeq_annot &) override
bool xAddFeatureToAnnot(CRef< CSeq_feat >, CSeq_annot &) override
static void ResetId()
TSeqPos SeqStop() const
CRef< CSeq_loc > GetSeqLoc(TReaderFlags, SeqIdResolver seqidresolve=nullptr) const
const string & Type() const
ENa_strand Strand() const
TSeqPos SeqStart() const
bool IsSetStrand() const
const string & Id() const
virtual void SetType(const string &recType)
const string & NormalizedType() const
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:194
Common file reader utility functions.
Definition: read_util.hpp:47
long TReaderFlags
Definition: reader_base.hpp:84
SeqIdResolver mSeqIdResolve
unsigned int m_uLineNumber
string m_AnnotTitle
virtual bool xParseBrowserLine(const string &, CSeq_annot &)
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
vector< TReaderLine > TReaderData
Definition: reader_base.hpp:70
TReaderFlags m_iFlags
virtual bool xReadInit()
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
CRef –.
Definition: ncbiobj.hpp:618
ESubtype GetSubtype(void) const
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetTitleDesc(const string &title)
Definition: Seq_annot.cpp:96
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
Definition: Seq_feat.cpp:429
static bool SoTypeToFeature(const string &, CSeq_feat &, bool=false)
Definition: so_map.cpp:411
static string ResolveSoAlias(const string &)
Definition: so_map.cpp:1277
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
static void fatal(const char *msg,...)
Definition: attributes.c:18
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
string
Definition: cgiapp.hpp:687
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
Definition: ncbistr.cpp:457
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
void SetAa(TAa &value)
Assign a value to Aa data member.
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1314
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_feat_.hpp:904
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
TCode_break & SetCode_break(void)
Assign a value to Code_break data member.
Definition: Cdregion_.hpp:739
void SetLoc(TLoc &value)
Assign a value to Loc data member.
bool IsRna(void) const
Check if variant Rna is selected.
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
TFrom GetFrom(void) const
Get the From member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
void SetStrand(TStrand value)
Assign a value to Strand data member.
E_Choice
Choice variants.
Definition: Seq_loc_.hpp:96
@ e_Int
from to
Definition: Seq_loc_.hpp:101
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
@ e_not_set
No variant selected.
Definition: Seq_annot_.hpp:132
Lightweight interface for getting lines of data with minimal memory copying.
constexpr bool empty(list< Ts... >) noexcept
#define DEFINE_STATIC_ARRAY_MAP(Type, Var, Array)
Definition: static_set.hpp:888
MAP_ID_TO_ALIGN mAlignments
Definition: gff3_reader.hpp:66
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:08:59 2024 by modify_doxy.py rev. 669887