NCBI C++ ToolKit
gtf_writer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gtf_writer.cpp 101758 2024-02-07 15:03:49Z foleyjp $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Frank Ludwig
27  *
28  * File Description: Write GTF file
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
38 
51 #include <objects/seq/so_map.hpp>
52 
53 #include <objmgr/annot_ci.hpp>
54 #include <objmgr/feat_ci.hpp>
55 #include <objmgr/mapped_feat.hpp>
56 #include <objmgr/util/feature.hpp>
57 #include <objmgr/util/feature.hpp>
59 
65 
67 
70 
71 // ----------------------------------------------------------------------------
73  CScope&scope,
74  CNcbiOstream& ostr,
75  unsigned int uFlags ) :
76 // ----------------------------------------------------------------------------
77  CGff2Writer( scope, ostr, uFlags )
78 {
79  mIdGenerator.Reset(); //may be flag dependent some day
80 };
81 
82 // ----------------------------------------------------------------------------
84  CNcbiOstream& ostr,
85  unsigned int uFlags ) :
86 // ----------------------------------------------------------------------------
87  CGff2Writer( ostr, uFlags )
88 {
89  mIdGenerator.Reset(); //may be flag dependent some day
90 };
91 
92 // ----------------------------------------------------------------------------
94 // ----------------------------------------------------------------------------
95 {
96 };
97 
98 // ----------------------------------------------------------------------------
100  CBioseq_Handle bsh )
101 // ----------------------------------------------------------------------------
102 {
104  const auto& display_range = GetRange();
105  CFeat_CI feat_iter(bsh, display_range, sel);
106  CGffFeatureContext fc(feat_iter, bsh);
107 
108  vector<CMappedFeat> vRoots = fc.FeatTree().GetRootFeatures();
109  std::sort(vRoots.begin(), vRoots.end(), CWriteUtil::CompareFeatures);
110  for (auto pit = vRoots.begin(); pit != vRoots.end(); ++pit) {
111  CMappedFeat mRoot = *pit;
112  fc.AssignShouldInheritPseudo(false);
113  if (!xWriteFeature(fc, mRoot)) {
114  // error!
115  continue;
116  }
117  xWriteAllChildren(fc, mRoot);
118  }
119  return true;
120 }
121 
122 // ----------------------------------------------------------------------------
124 // ----------------------------------------------------------------------------
125 {
126  if (!m_bHeaderWritten) {
127  m_Os << "#gtf-version 2.2" << '\n';
128  m_bHeaderWritten = true;
129  }
130  return true;
131 };
132 
133 // ----------------------------------------------------------------------------
135  const CGffWriteRecord* pRecord )
136 // ----------------------------------------------------------------------------
137 {
138  m_Os << pRecord->StrSeqId() << '\t';
139  m_Os << pRecord->StrMethod() << '\t';
140  m_Os << pRecord->StrType() << '\t';
141  m_Os << pRecord->StrSeqStart() << '\t';
142  m_Os << pRecord->StrSeqStop() << '\t';
143  m_Os << pRecord->StrScore() << '\t';
144  m_Os << pRecord->StrStrand() << '\t';
145  m_Os << pRecord->StrPhase() << '\t';
146 
147  if ( m_uFlags & fStructibutes ) {
148  m_Os << pRecord->StrStructibutes() << '\n';
149  }
150  else {
151  m_Os << pRecord->StrAttributes() << '\n';
152  }
153  return true;
154 }
155 
156 
157 // ----------------------------------------------------------------------------
160  const CMappedFeat& mf)
161 // ----------------------------------------------------------------------------
162 {
163  if (IsCanceled()) {
164  NCBI_THROW(
166  eInterrupted,
167  "Processing terminated by user");
168  }
169  auto subtype = mf.GetFeatSubtype();
170 
171  //const auto& feat = mf.GetMappedFeature();
172  //if (subtype == CSeqFeatData::eSubtype_tRNA) {
173  // auto from = feat.GetLocation().GetStart(ESeqLocExtremes::eExtreme_Biological);
174  // if (from == 48738) {
175  // cerr << "";
176  // }
177  //}
178 
179  switch(subtype) {
180  default:
181  if (mf.GetFeatType() == CSeqFeatData::e_Rna) {
182  return xWriteRecordsTranscript(context, mf);
183  }
184  // GTF is not interested --- ignore
185  return true;
190  return xWriteRecordsTranscript(context, mf);
192  return xWriteRecordsGene(context, mf);
194  return xWriteRecordsCds(context, mf);
195  }
196  return false;
197 }
198 
199 
200 // ----------------------------------------------------------------------------
203  const CMappedFeat& mf )
204 // ----------------------------------------------------------------------------
205 {
206  if (m_uFlags & fNoGeneFeatures) {
207  return true;
208  }
209 
210  list<CRef<CGtfRecord>> records;
211  if (!xAssignFeaturesGene(records, context, mf)) {
212  return false;
213  }
214 
215  for (const auto& record: records) {
216  if (!xWriteRecord(record)) {
217  return false;
218  }
219  }
220  return true;
221 }
222 
223 // ----------------------------------------------------------------------------
226  const CMappedFeat& mf,
227  const string& transcriptIdPreAssigned)
228 // ----------------------------------------------------------------------------
229 {
230  string transcriptId(transcriptIdPreAssigned);
231 
233  if (tf) {
234  if (!xWriteRecordsTranscript(context, tf, transcriptId)) {
235  return false;
236  }
237  }
238  if (!tf) {
239  tf = context.FeatTree().GetParent(mf);
240  }
241  if (tf) {
242  auto featIt = this->mFeatMap.find(tf);
243  if (featIt != mFeatMap.end()) {
244  transcriptId = featIt->second;
245  }
246  }
247  list<CRef<CGtfRecord>> records;
248  if (!xAssignFeaturesCds(records, context, mf, transcriptId)) {
249  return false;
250  }
251 
252  for (const auto& record: records) {
253  if (!xWriteRecord(record)) {
254  return false;
255  }
256  }
257  return true;
258 }
259 
260 // ----------------------------------------------------------------------------
263  const CMappedFeat& mf,
264  const string& transcriptIdPreAssigned )
265 // ----------------------------------------------------------------------------
266 {
267  string transcriptId(transcriptIdPreAssigned);
268 
269  list<CRef<CGtfRecord>> records;
270  if (!xAssignFeaturesTranscript(records, context, mf, transcriptId)) {
271  return false;
272  }
273  for (const auto& record: records) {
274  if (!xWriteRecord(record)) {
275  return false;
276  }
277  }
278  return xWriteFeatureExons(context, mf, transcriptId);
279 }
280 
281 // ----------------------------------------------------------------------------
283  list<CRef<CGtfRecord>>& recordList,
285  const CMappedFeat& mf)
286 // ----------------------------------------------------------------------------
287 {
288  const auto& mfLoc = mf.GetLocation();
289  auto mfStrand = (mfLoc.IsSetStrand() && mfLoc.GetStrand() == eNa_strand_minus) ?
292 
293  CSeq_loc mfLocAsPackedInt;
294  mfLocAsPackedInt.Assign(mfLoc);
295  mfLocAsPackedInt.ChangeToPackedInt();
296 
297  const auto& sublocs = mfLocAsPackedInt.GetPacked_int().Get();
298  bool needsPartNumbers = (sublocs.size() > 1);
299  unsigned int partNum = 1;
300  for ( auto it = sublocs.begin(); it != sublocs.end(); it++ ) {
301  const CSeq_interval& intv = **it;
302  CRef<CGtfRecord> pRecord(
304  if (!xAssignFeature(*pRecord, context, mf)) {
305  return false;
306  }
307  pRecord->SetEndpoints(intv.GetFrom(), intv.GetTo(), mfStrand);
308  if (needsPartNumbers) {
309  pRecord->SetPartNumber(partNum++);
310  }
311  recordList.push_back(pRecord);
312  }
313  return true;
314 }
315 
316 
317 // ----------------------------------------------------------------------------
319  list<CRef<CGtfRecord>>& recordList,
321  const CMappedFeat& mf,
322  const string& transcriptId)
323 // ----------------------------------------------------------------------------
324 {
325  // note: GTF transcript locations are the minimum covering interval of all
326  // its exons. Hence it's either a single interval, or two intervals if the
327  // covering interval wraps around the origin.
328 
329  CRef<CGtfRecord> pRecord(
331  if (!transcriptId.empty()) {
332  pRecord->SetTranscriptId(transcriptId);
333  }
334  if (!xAssignFeature(*pRecord, context, mf)) {
335  return false;
336  }
337 
338  const auto& mfLoc = mf.GetLocation();
339  auto mfStrand = mfLoc.IsSetStrand() ?
340  mfLoc.GetStrand() :
342 
343  CSeq_loc mfLocAsPackedInt;
344  mfLocAsPackedInt.Assign(mfLoc);
345  mfLocAsPackedInt.ChangeToPackedInt();
346 
347  bool isTransSpliced = mf.IsSetExcept() && mf.GetExcept() && mf.IsSetExcept_text() &&
348  (NStr::Find(mf.GetExcept_text(), "trans-splicing") != NPOS);
349  const auto& sublocs = mfLocAsPackedInt.GetPacked_int().Get();
350  if (isTransSpliced) {
351  auto rnaStart = sublocs.front()->GetFrom();
352  auto rnaStop = sublocs.front()->GetTo();
353  for (const auto& loc : sublocs) {
354  auto start = loc->GetFrom();
355  if (start < rnaStart) {
356  rnaStart = start;
357  }
358  auto stop = loc->GetTo();
359  if (stop > rnaStop) {
360  rnaStop = stop;
361  }
362  }
363  pRecord->SetEndpoints(rnaStart, rnaStop, mfStrand);
364  }
365  else {
366  TSeqPos lastFrom = sublocs.front()->GetFrom();
367  TSeqPos lastTo = sublocs.front()->GetTo();
368  auto it = sublocs.begin();
369  bool iterationDone(false);
370  for ( it++; it != sublocs.end() && !iterationDone; it++ ) {
371  const CSeq_interval& intv = **it;
372 
373  switch (mfStrand) {
374  case eNa_strand_minus: {
375  if (intv.GetTo() <= lastFrom) {
376  lastFrom = intv.GetFrom();
377  }
378  else {
379  pRecord->SetEndpoints(lastFrom, lastTo, mfLoc.GetStrand());
380  recordList.push_back(pRecord);
381  pRecord.Reset(new CGtfRecord(context, (m_uFlags & fNoExonNumbers)));
382  if (!transcriptId.empty()) {
383  pRecord->SetTranscriptId(transcriptId);
384  }
385  if (!xAssignFeature(*pRecord, context, mf)) {
386  return false;
387  }
388  lastFrom = intv.GetFrom();
389  lastTo = intv.GetTo();
390  }
391  }
392  break;
393 
394  case eNa_strand_other: {
395  //feature contains parts from both strands
396  // for now, don't even attempt to origin wrap these things (rw-1299).
397  iterationDone = true;
398  }
399  break;
400 
401  default: {
402  if (intv.GetFrom() >= lastTo) {
403  lastTo = intv.GetTo();
404  }
405  else { //wrapping back to 0
406  pRecord->SetEndpoints(lastFrom, lastTo, mfLoc.GetStrand());
407  recordList.push_back(pRecord);
408  pRecord.Reset(new CGtfRecord(context, (m_uFlags & fNoExonNumbers)));
409  if (!transcriptId.empty()) {
410  pRecord->SetTranscriptId(transcriptId);
411  }
412  if (!xAssignFeature(*pRecord, context, mf)) {
413  return false;
414  }
415  lastFrom = 0;
416  lastTo = intv.GetTo();
417  }
418  }
419  break;
420  }
421  }
422  pRecord->SetEndpoints(lastFrom, lastTo, mfLoc.GetStrand());
423  }
424  recordList.push_back(pRecord);
425 
426  bool needPartNumbers = (recordList.size() > 1);
427  unsigned int partNum = 1;
428  for (auto& record: recordList) {
429  if (needPartNumbers) {
430  record->SetPartNumber(partNum++);
431  }
432  record->SetType("transcript");
433  record->SetGbKeyFrom(mf);
434  }
435  return true;
436 }
437 
438 // ----------------------------------------------------------------------------
440  list<CRef<CGtfRecord>>& recordList,
442  const CMappedFeat& mf,
443  const string& parentTranscriptId)
444 // ----------------------------------------------------------------------------
445 {
446  string transcriptId(parentTranscriptId);
447 
448  const auto& mfLoc = mf.GetLocation();
449 
450  CSeq_loc mfLocAsPackedInt;
451  mfLocAsPackedInt.Assign(mfLoc);
452  mfLocAsPackedInt.ChangeToPackedInt();
453  const auto& sublocs = mfLocAsPackedInt.GetPacked_int().Get();
454 
455  bool needsPartNumbers = xIntervalsNeedPartNumbers(sublocs);
456 
457  int phase {0};
458  const auto& cdsFeat = mf.GetMappedFeature();
459  if (cdsFeat.GetData().GetCdregion().IsSetFrame()) {
460  phase = max(cdsFeat.GetData().GetCdregion().GetFrame()-1, 0);
461  }
462 
463  unsigned int partNum = 1;
464  for (auto pInterval : sublocs) {
465  const CSeq_interval& intv = *pInterval;
466  auto strand = intv.IsSetStrand() ? intv.GetStrand() : eNa_strand_plus;
467  CRef<CGtfRecord> pRecord(
469  if (!xAssignFeature(*pRecord, context, mf)) {
470  return false;
471  }
472  pRecord->SetEndpoints(intv.GetFrom(), intv.GetTo(), strand);
473  if (needsPartNumbers) {
474  pRecord->SetAttribute("part", NStr::NumericToString(partNum++));
475  }
476  if (!transcriptId.empty()) {
477  pRecord->SetTranscriptId(transcriptId);
478  }
479  else {
480  transcriptId = pRecord->TranscriptId();
481  }
482  pRecord->SetCdsPhase(phase);
483  recordList.push_back(pRecord);
484  phase = (3 - ((intv.GetLength()+3 - phase)%3))%3;
485  }
486 
487  // subtract stop_codon in the end:
488  unsigned int basesToLose = 3;
489  while (basesToLose > 0 && !recordList.empty()) {
490  auto pLastRecord = recordList.back();
491  auto lastSize = pLastRecord->SeqStop() - pLastRecord->SeqStart() + 1;
492  auto lastStrand = pLastRecord->SeqStrand();
493  if (lastSize > basesToLose) {
494  if (mfLoc.GetStrand() == eNa_strand_minus) {
495  pLastRecord->SetEndpoints(
496  pLastRecord->SeqStart() + basesToLose, pLastRecord->SeqStop(), lastStrand);
497  }
498  else {
499  pLastRecord->SetEndpoints(
500  pLastRecord->SeqStart(), pLastRecord->SeqStop() - basesToLose, lastStrand);
501  }
502  basesToLose = 0;
503  }
504  else {
505  recordList.erase(--recordList.end());
506  basesToLose -= lastSize;
507  }
508  }
509 
510  // generate start codon:
511  if (!mfLoc.IsPartialStart(eExtreme_Biological)) {
512  int basePairsNeeded = 3;
513  auto currentIt = sublocs.begin();
514  unsigned int partNumber = 1;
515  unsigned int baseCount = 0;
516 
517  while (basePairsNeeded > 0 && currentIt != sublocs.end()) {
518  const CSeq_interval& currentLoc = **currentIt;
519  auto currentFrom = currentLoc.GetFrom();
520  auto currentTo = currentLoc.GetTo();
521  auto currentStrand = currentLoc.IsSetStrand() ? currentLoc.GetStrand() : eNa_strand_plus;
522 
523  CRef<CGtfRecord> pRecord(
525  if (!xAssignFeature(*pRecord, context, mf)) {
526  return false;
527  }
528  pRecord->SetType("start_codon");
529 
530  if (currentTo >= currentFrom + basePairsNeeded -1) {
531  if (currentStrand == eNa_strand_minus) {
532  pRecord->SetEndpoints(currentTo - basePairsNeeded + 1, currentTo, currentStrand);
533  }
534  else {
535  pRecord->SetEndpoints(currentFrom, currentFrom + basePairsNeeded -1, currentStrand);
536  }
537  basePairsNeeded = 0;
538  }
539  else {
540  pRecord->SetEndpoints(currentFrom, currentTo, currentStrand);
541  basePairsNeeded = basePairsNeeded - (currentTo - currentFrom + 1);
542  }
543 
544  if (partNumber > 1 || basePairsNeeded > 0) {
545  pRecord->SetPartNumber(partNumber++);
546  }
547  if (!transcriptId.empty()) {
548  pRecord->SetTranscriptId(transcriptId);
549  }
550  _ASSERT(baseCount < 3);
551  pRecord->SetCdsPhase((3-baseCount)%3);
552  baseCount += pRecord->GetExtent();
553  recordList.push_back(pRecord);
554  currentIt++;
555  }
556  }
557 
558  // generate stop codon:
559  if (!mfLoc.IsPartialStop(eExtreme_Biological)) {
560  list<CRef<CGtfRecord>> stopCodonParts;
561  int basePairsNeeded = 3;
562  auto currentIt = sublocs.rbegin();
563  while (basePairsNeeded > 0 && currentIt != sublocs.rend()) {
564  const CSeq_interval& currentLoc = **currentIt;
565  auto currentFrom = currentLoc.GetFrom();
566  auto currentTo = currentLoc.GetTo();
567  auto currentStrand = currentLoc.IsSetStrand() ? currentLoc.GetStrand() : eNa_strand_plus;
568 
569  CRef<CGtfRecord> pRecord(
571  if (!xAssignFeature(*pRecord, context, mf)) {
572  return false;
573  }
574  pRecord->SetType("stop_codon");
575 
576  if (currentTo >= currentFrom + basePairsNeeded - 1) {
577  if (currentStrand == eNa_strand_minus) {
578  pRecord->SetEndpoints(currentFrom, currentFrom + basePairsNeeded - 1, currentStrand);
579  }
580  else {
581  pRecord->SetEndpoints(currentTo - basePairsNeeded + 1, currentTo, currentStrand);
582  }
583  basePairsNeeded = 0;
584  }
585  else {
586  pRecord->SetEndpoints(currentFrom, currentTo, currentStrand);
587  basePairsNeeded = basePairsNeeded - (currentTo - currentFrom + 1);
588  }
589 
590  if (!transcriptId.empty()) {
591  pRecord->SetTranscriptId(transcriptId);
592  }
593  stopCodonParts.push_front(pRecord);
594  currentIt++;
595  }
596  unsigned int partNumber = 1;
597  bool needPartNumbers = (stopCodonParts.size() > 1);
598  unsigned int baseCount = 0;
599  for (auto& pRecord: stopCodonParts) {
600  if (needPartNumbers) {
601  pRecord->SetPartNumber(partNumber++);
602  }
603  _ASSERT(baseCount < 3);
604  pRecord->SetCdsPhase((3-baseCount)%3);
605  baseCount += pRecord->GetExtent();
606  recordList.push_back(pRecord);
607  }
608  }
609 
610  // assign exon numbers:
611  CExonNumberAssigner exonNumberAssigner(mf);
612  if (exonNumberAssigner.CdsNeedsExonNumbers()) {
613  for (auto& pRecord: recordList) {
614  exonNumberAssigner.AssignExonNumberTo(*pRecord);
615  }
616  }
617  return true;
618 }
619 
620 
621 // ----------------------------------------------------------------------------
624  const CMappedFeat& mf,
625  const string& transcriptId)
626 // ----------------------------------------------------------------------------
627 {
628  CRef<CGtfRecord> pMrna( new CGtfRecord( context ) );
629  if (!transcriptId.empty()) {
630  pMrna->SetTranscriptId(transcriptId);
631  }
632  if (!xAssignFeature(*pMrna, context, mf)) {
633  return false;
634  }
635  pMrna->CorrectType("exon");
636 
637  const CSeq_loc& loc = mf.GetLocation();
638  unsigned int uExonNumber = 1;
639 
641  pLocMrna->Add( loc );
642  pLocMrna->ChangeToPackedInt();
643  if (!pLocMrna->GetPacked_int().CanGet()) {
644  return false;
645  }
646 
647  const list<CRef<CSeq_interval>>& sublocs = pLocMrna->GetPacked_int().Get();
648  for (auto it = sublocs.begin(); it != sublocs.end(); ++it) {
649  const CSeq_interval& subint = **it;
650  CRef<CGtfRecord> pExon(
652  pExon->MakeChildRecord(*pMrna, subint, uExonNumber++);
653  pExon->DropAttributes("gbkey");
654  xWriteRecord(pExon);
655  }
656  return true;
657 }
658 
659 // ----------------------------------------------------------------------------
661  CGffFeatureRecord& record,
663  const CMappedFeat& mf )
664  // ----------------------------------------------------------------------------
665 {
666  record.SetType("region");
667 
668  if (mf.IsSetQual()) {
669  const auto& quals = mf.GetQual();
670  auto it = quals.begin();
671  for( ; it != quals.end(); ++it) {
672  if (!(*it)->CanGetQual() || !(*it)->CanGetVal()) {
673  continue;
674  }
675  if ((*it)->GetQual() == "standard_name") {
676  record.SetType((*it)->GetVal());
677  return true;
678  }
679  }
680  }
681  switch ( mf.GetFeatSubtype() ) {
682  default:
683  break;
685  record.SetType("CDS");
686  break;
688  record.SetType("exon");
689  break;
691  record.SetType("transcript");
692  break;
694  record.SetType("gene");
695  break;
697  record.SetType("mRNA");
698  break;
700  record.SetType("scRNA");
701  break;
702  }
703  return true;
704 }
705 
706 
707 // ----------------------------------------------------------------------------
709  CGffFeatureRecord& record,
711  const CMappedFeat& mf )
712  // ----------------------------------------------------------------------------
713 {
714  record.SetMethod(".");
715 
716  if (mf.IsSetQual()) {
717  const auto& quals = mf.GetQual();
718  auto it = quals.begin();
719  for (; it != quals.end(); ++it) {
720  if (!(*it)->CanGetQual() || !(*it)->CanGetVal()) {
721  continue;
722  }
723  if ((*it)->GetQual() == "gff_source") {
724  record.SetMethod((*it)->GetVal());
725  return true;
726  }
727  }
728  }
729 
730  if (mf.IsSetExt()) {
732  mf.GetExt(), "ModelEvidence");
733  if (model_evidence) {
734  string strMethod;
735  if (model_evidence->HasField("Method") ) {
736  record.SetMethod(
737  model_evidence->GetField("Method").GetData().GetStr());
738  return true;
739  }
740  }
741  }
742 
743  if (mf.IsSetExts()) {
745  mf.GetExts(), "ModelEvidence");
746  if (model_evidence) {
747  string strMethod;
748  if (model_evidence->HasField("Method")) {
749  record.SetMethod(
750  model_evidence->GetField("Method").GetData().GetStr());
751  return true;
752  }
753  }
754  }
755 
756  CScope& scope = mf.GetScope();
758  string typeFromId;
759  CWriteUtil::GetIdType(scope.GetBioseqHandle(idh), typeFromId);
760  if (!typeFromId.empty()) {
761  record.SetMethod(typeFromId);
762  }
763  return true;
764 }
765 
766 // ----------------------------------------------------------------------------
768  CGffFeatureRecord& rec,
770  const CMappedFeat& mf )
771  // ----------------------------------------------------------------------------
772 {
773  CGtfRecord& record = dynamic_cast<CGtfRecord&>(rec);
774  return (
775  xAssignFeatureAttributeGeneId(record, fc, mf) &&
778 }
779 
780 // ----------------------------------------------------------------------------
782  CGffFeatureRecord& rec,
784  const CMappedFeat& mf )
785 // ----------------------------------------------------------------------------
786 {
787  const vector<string> specialCases = {
788  "ID",
789  "Parent",
790  "gff_type",
791  "transcript_id",
792  "gene_id",
793  };
794 
795  CGtfRecord& record = dynamic_cast<CGtfRecord&>(rec);
796  auto quals = mf.GetQual();
797  for (auto qual: quals) {
798  if (!qual->IsSetQual() || !qual->IsSetVal()) {
799  continue;
800  }
801  auto specialCase = std::find(
802  specialCases.begin(), specialCases.end(), qual->GetQual());
803  if (specialCase != specialCases.end()) {
804  continue;
805  }
806  record.AddAttribute(qual->GetQual(), qual->GetVal());
807  }
808  return true;
809 }
810 
811 // ----------------------------------------------------------------------------
813  CGffFeatureRecord& rec,
815  const CMappedFeat& mf )
816  // ----------------------------------------------------------------------------
817 {
818  return CGff2Writer::xAssignFeatureAttributeDbxref(rec, fc, "db_xref", mf);
819 }
820 
821 // ----------------------------------------------------------------------------
823  CGffFeatureRecord& rec,
825  const CMappedFeat& mf )
826  // ----------------------------------------------------------------------------
827 {
828  if (!mf.IsSetComment()) {
829  return true;
830  }
831  CGtfRecord& record = dynamic_cast<CGtfRecord&>(rec);
832  record.SetAttribute("note", mf.GetComment());
833  return true;
834 }
835 
836 // ----------------------------------------------------------------------------
838  const CMappedFeat& mf,
840 // ----------------------------------------------------------------------------
841 {
842  auto featIt = mMapFeatToGeneId.find(mf);
843  if (featIt != mMapFeatToGeneId.end()) {
844  return featIt->second;
845  }
846 
847  auto parent = context.FeatTree().GetParent(mf);
848  featIt = mMapFeatToGeneId.find(parent);
849  if (featIt != mMapFeatToGeneId.end()) {
850  return featIt->second;
851  }
852 
853  auto children = context.FeatTree().GetChildren(mf);
854  for (auto child : children) {
855  featIt = mMapFeatToGeneId.find(child);
856  if (featIt != mMapFeatToGeneId.end()) {
857  return featIt->second;
858  }
859  }
860 
861  string geneId = mIdGenerator.NextId("unassigned_gene");
862  mMapFeatToGeneId[mf] = geneId;
863  return geneId;
864 }
865 
866 // ----------------------------------------------------------------------------
868  const CMappedFeat& mf)
869  // ----------------------------------------------------------------------------
870 {
871  return mIdGenerator.NextId("unassigned_transcript");
872 }
873 
874 // ----------------------------------------------------------------------------
876  CGtfRecord& record,
878  const CMappedFeat& mf )
879 // ----------------------------------------------------------------------------
880 {
881  static list<CSeqFeatData::ESubtype> nonRnaTranscripts = {
888  };
889  auto featSubtype = mf.GetFeatSubtype();
890  if (!mf.GetData().IsRna()) {
891  auto it = std::find(
892  nonRnaTranscripts.begin(), nonRnaTranscripts.end(), featSubtype);
893  if (it == nonRnaTranscripts.end()) {
894  return true;
895  }
896  }
897  const auto& feature = mf.GetOriginalFeature();
898  string so_type;
899  if (!CSoMap::FeatureToSoType(feature, so_type)) {
900  return true;
901  }
902 
903  record.SetAttribute("transcript_biotype", so_type);
904  return true;
905 }
906 
907 // ----------------------------------------------------------------------------
909  CGtfRecord& record,
911  const CMappedFeat& mf )
912 // ----------------------------------------------------------------------------
913 {
914  if (!record.TranscriptId().empty()) {
915  return true; //special case hence already assigned
916  }
917 
918  const auto mfIt = mFeatMap.find(mf);
919  if (mFeatMap.end() != mfIt) {
920  record.SetTranscriptId(mfIt->second);
921  return true;
922  }
923 
924  CMappedFeat mrnaFeat;
925  auto featSubtype = mf.GetFeatSubtype();
926  switch(featSubtype) {
927  default:
928  mrnaFeat = mf;
929  break;
935  mrnaFeat = mf;
936  break;
939  mrnaFeat = feature::GetParentFeature(mf);
940  }
941  else {
942  mrnaFeat = mf;
943  //there must be one somewhere, and that's the closest we can
944  // get to it.
945  }
946  break;
948  return true;
949  }
950 
951  if (!mrnaFeat) {
952  //record.SetTranscriptId("dummy");
953  return true;
954  }
955 
956  const auto featIt = mFeatMap.find(mrnaFeat);
957  if (mFeatMap.end() != featIt) {
958  record.SetTranscriptId(featIt->second);
959  return true;
960  }
961 
962  FEAT_ID featId = mf.GetNamedQual("transcript_id");
963  if (featId.empty() && mf.GetData().IsRna() && mf.IsSetProduct()) {
965  mf.GetProductId(), mf.GetScope(), featId)) {
966  featId.clear();
967  }
968  }
969 
970  if (featId.empty()) {
971  featId = mf.GetNamedQual("orig_transcript_id");
972  }
973 
974  if (featId.empty()) {
975  featId = xGenericTranscriptId(mf);
976  // we know the ID is going to be unique if we get it this way,
977  // so there is no point in further checking
978  mUsedFeatIds.emplace(featId);
979  mFeatMap[mf] = featId;
980  record.SetTranscriptId(featId);
981  return true;
982  }
983  //uniquify the ID we came up with
984  auto cit = mUsedFeatIds.find(featId);
985  if (mUsedFeatIds.end() == cit) {
986  mUsedFeatIds.emplace(featId);
987  mFeatMap[mf] = featId;
988  record.SetTranscriptId(featId);
989  return true;
990  }
991  unsigned int suffix = 1;
992  featId += "_";
993  while (true) {
994  auto qualifiedId = featId + NStr::UIntToString(suffix);
995  cit = mUsedFeatIds.find(qualifiedId);
996  if (mUsedFeatIds.end() == cit) {
997  mUsedFeatIds.emplace(qualifiedId);
998  mFeatMap[mf] = qualifiedId;
999  record.SetTranscriptId(qualifiedId);
1000  return true;
1001  }
1002  ++suffix;
1003  }
1004  return true;
1005 }
1006 
1007 // ----------------------------------------------------------------------------
1009  CGtfRecord& record,
1011  const CMappedFeat& mf )
1012 // ----------------------------------------------------------------------------
1013 {
1014  if (!record.GeneId().empty()) {
1015  return true;
1016  }
1017  CMappedFeat geneFeat = mf;
1019  geneFeat = feature::GetBestGeneForFeat(mf, &fc.FeatTree());
1020  }
1021  if (!geneFeat) {
1022  const auto& geneIdQual = mf.GetNamedQual("gene_id");
1023  if (!geneIdQual.empty()) {
1024  record.SetGeneId(geneIdQual); // empty most times but still best effort
1025  return true;
1026  }
1027  auto geneId = xGenericGeneId(mf, fc);
1028  record.SetGeneId(geneId);
1029  return true;
1030  }
1031 
1032  auto geneIt = mGeneMap.find(geneFeat);
1033  if (mGeneMap.end() != geneIt) {
1034  record.SetGeneId(geneIt->second);
1035  return true;
1036  }
1037 
1038  GENE_ID geneId;
1039  const auto& geneRef = geneFeat.GetData().GetGene();
1040 
1041  geneId = mf.GetNamedQual("gene_id");
1042  if (geneId.empty() && geneRef.IsSetLocus_tag()) {
1043  geneId = geneRef.GetLocus_tag();
1044  }
1045  if (geneId.empty() && geneRef.IsSetLocus()) {
1046  geneId = geneRef.GetLocus();
1047  }
1048  if (geneId.empty() && geneRef.IsSetSyn() ) {
1049  geneId = geneRef.GetSyn().front();
1050  }
1051  if (geneId.empty()) {
1052  geneId = xGenericGeneId(mf, fc);
1053  // we know the ID is going to be unique if we get it this way,
1054  // so there is no point in further checking
1055  mUsedGeneIds.emplace(geneId);
1056  mGeneMap[mf] = geneId;
1057  record.SetGeneId(geneId);
1058  return true;
1059  }
1060 
1061  auto cit = mUsedGeneIds.find(geneId);
1062  if (mUsedGeneIds.end() == cit) {
1063  mUsedGeneIds.emplace(geneId);
1064  mGeneMap[mf] = geneId;
1065  record.SetGeneId(geneId);
1066  return true;
1067  }
1068 
1069  unsigned int suffix = 1;
1070  geneId += "_";
1071  while (true) {
1072  GENE_ID qualifiedGeneId = geneId + NStr::UIntToString(suffix);
1073  cit = find(mUsedGeneIds.begin(), mUsedGeneIds.end(), qualifiedGeneId);
1074  if (mUsedGeneIds.end() == cit) {
1075  mUsedGeneIds.emplace(qualifiedGeneId);
1076  mGeneMap[mf] = qualifiedGeneId;
1077  record.SetGeneId(qualifiedGeneId);
1078  return true;
1079  }
1080  ++suffix;
1081  }
1082  return true;
1083 }
1084 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
CBioseq_Handle –.
void AssignExonNumberTo(CGtfRecord &gtfRecord) const
CFeat_CI –.
Definition: feat_ci.hpp:64
static CGenbankIdResolve & Get()
CWriterBase implementation that formats Genbank objects as plain GFF files.
Definition: gff_writer.hpp:60
virtual bool xAssignFeatureAttributeDbxref(CGffFeatureRecord &, CGffFeatureContext &, const string &label, const CMappedFeat &)
Definition: gff_writer.cpp:613
virtual bool xAssignFeature(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &)
Definition: gff_writer.cpp:369
bool m_bHeaderWritten
Definition: gff_writer.hpp:406
static bool HasAccaptableTranscriptParent(CGffFeatureContext &, const CMappedFeat &)
virtual bool xWriteAllChildren(CGffFeatureContext &, const CMappedFeat &)
Definition: gff_writer.cpp:226
static bool xIntervalsNeedPartNumbers(const list< CRef< CSeq_interval >> &)
CMappedFeat xGenerateMissingTranscript(CGffFeatureContext &, const CMappedFeat &)
virtual string StrType() const
virtual string StrSeqStop() const
virtual string StrAttributes() const
virtual string StrScore() const
void SetType(const string &)
virtual string StrPhase() const
void SetMethod(const string &)
bool SetAttribute(const string &, const string &)
virtual string StrSeqStart() const
virtual string StrSeqId() const
bool AddAttribute(const string &, const string &)
virtual string StrStrand() const
virtual string StrMethod() const
virtual string StrStructibutes() const
string NextId(const string prefix)
Definition: gtf_writer.hpp:61
void SetGeneId(const std::string &geneId)
string TranscriptId() const
void SetTranscriptId(const std::string &transcriptId)
string GeneId() const
virtual bool xAssignFeatureAttributeTranscriptId(CGtfRecord &, CGffFeatureContext &, const CMappedFeat &)
Definition: gtf_writer.cpp:908
map< CMappedFeat, string > mMapFeatToGeneId
Definition: gtf_writer.hpp:203
virtual bool xAssignFeaturesGene(list< CRef< CGtfRecord >> &, CGffFeatureContext &, const CMappedFeat &)
Definition: gtf_writer.cpp:282
bool xAssignFeatureMethod(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:708
GENE_IDS mUsedGeneIds
Definition: gtf_writer.hpp:200
virtual bool xWriteRecordsGene(CGffFeatureContext &, const CMappedFeat &)
Definition: gtf_writer.cpp:201
string GENE_ID
Definition: gtf_writer.hpp:196
CGtfIdGenerator mIdGenerator
Definition: gtf_writer.hpp:204
bool xWriteRecord(const CGffWriteRecord *)
Definition: gtf_writer.cpp:134
virtual bool xAssignFeaturesCds(list< CRef< CGtfRecord >> &, CGffFeatureContext &, const CMappedFeat &, const string &="")
Definition: gtf_writer.cpp:439
FEAT_IDS mUsedFeatIds
Definition: gtf_writer.hpp:194
virtual bool xAssignFeatureAttributeTranscriptBiotype(CGtfRecord &, CGffFeatureContext &, const CMappedFeat &)
Definition: gtf_writer.cpp:875
bool xAssignFeatureAttributeNote(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:822
bool xWriteFeature(CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:158
bool x_WriteBioseqHandle(CBioseq_Handle) override
Definition: gtf_writer.cpp:99
bool xAssignFeatureAttributesFormatSpecific(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:767
virtual bool xWriteRecordsTranscript(CGffFeatureContext &, const CMappedFeat &, const string &="")
Definition: gtf_writer.cpp:261
std::string xGenericTranscriptId(const CMappedFeat &)
Definition: gtf_writer.cpp:867
bool WriteHeader() override
Write a file header identifying the file content as GFF version 2.
Definition: gtf_writer.cpp:123
CGtfWriter(CScope &, CNcbiOstream &, unsigned int=0)
Definition: gtf_writer.cpp:72
GENE_MAP mGeneMap
Definition: gtf_writer.hpp:201
bool xAssignFeatureAttributesQualifiers(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:781
bool xAssignFeatureAttributeDbxref(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:812
FEAT_MAP mFeatMap
Definition: gtf_writer.hpp:193
virtual bool xWriteRecordsCds(CGffFeatureContext &, const CMappedFeat &, const string &="")
Definition: gtf_writer.cpp:224
virtual bool xAssignFeatureAttributeGeneId(CGtfRecord &, CGffFeatureContext &, const CMappedFeat &)
string FEAT_ID
Definition: gtf_writer.hpp:189
bool xAssignFeatureType(CGffFeatureRecord &, CGffFeatureContext &, const CMappedFeat &) override
Definition: gtf_writer.cpp:660
virtual bool xWriteFeatureExons(CGffFeatureContext &, const CMappedFeat &, const string &="")
Definition: gtf_writer.cpp:622
std::string xGenericGeneId(const CMappedFeat &, CGffFeatureContext &)
Definition: gtf_writer.cpp:837
virtual bool xAssignFeaturesTranscript(list< CRef< CGtfRecord >> &, CGffFeatureContext &, const CMappedFeat &, const string &)
Definition: gtf_writer.cpp:318
bool IsCanceled() const
Definition: writer.hpp:62
CMappedFeat –.
Definition: mapped_feat.hpp:59
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
TSeqPos GetLength(void) const
static bool FeatureToSoType(const CSeq_feat &, string &)
Definition: so_map.cpp:783
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Definition: User_object.cpp:71
static bool CompareFeatures(const CMappedFeat &lhs, const CMappedFeat &rhs)
static CConstRef< CUser_object > GetUserObjectByType(const CUser_object &uo, const string &strType)
Definition: write_util.cpp:794
static bool GetIdType(CBioseq_Handle, string &)
Definition: write_util.cpp:166
unsigned int m_uFlags
Definition: writer.hpp:268
virtual const CRange< TSeqPos > & GetRange(void) const
Definition: writer.hpp:262
virtual SAnnotSelector & SetAnnotSelector(void)
Definition: writer.hpp:246
CNcbiOstream & m_Os
Definition: writer.hpp:267
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
const_iterator begin() const
Definition: set.hpp:135
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
Definition: Seq_loc.cpp:3670
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
CMappedFeat GetBestGeneForFeat(const CMappedFeat &feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0, CFeatTree::EBestGeneType lookup_type=CFeatTree::eBestGene_TreeOnly)
Definition: feature.cpp:3443
CMappedFeat GetParentFeature(const CMappedFeat &feat)
Definition: feature.cpp:1615
CSeq_id_Handle GetIdHandle(const CSeq_loc &loc, CScope *scope)
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
const string & GetNamedQual(const CTempString &qual_name) const
Return a named qualifier.
bool IsSetExcept(void) const
bool GetExcept(void) const
bool IsSetComment(void) const
const CSeq_feat::TExts & GetExts(void) const
const CSeqFeatData & GetData(void) const
bool IsSetExcept_text(void) const
bool IsSetProduct(void) const
const string & GetComment(void) const
const CUser_object & GetExt(void) const
const string & GetExcept_text(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSetExts(void) const
bool IsSetQual(void) const
CSeqFeatData::ESubtype GetFeatSubtype(void) const
CSeqFeatData::E_Choice GetFeatType(void) const
const CSeq_feat::TQual & GetQual(void) const
bool IsSetExt(void) const
CSeq_id_Handle GetProductId(void) const
const CSeq_loc & GetLocation(void) const
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
const CSeq_feat & GetMappedFeature(void) const
Feature mapped to the master sequence.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5109
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
const TStr & GetStr(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TGene & GetGene(void) const
Get the variant data.
bool IsRna(void) const
Check if variant Rna is selected.
const Tdata & Get(void) const
Get the member data.
TFrom GetFrom(void) const
Get the From member data.
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
bool CanGet(void) const
Check if it is safe to call Get method.
TStrand GetStrand(void) const
Get the Strand member data.
TTo GetTo(void) const
Get the To member data.
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_other
Definition: Na_strand_.hpp:70
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
USING_SCOPE(objects)
constexpr auto sort(_Init &&init)
T max(T x_, T y_)
#define fc
static const char * suffix[]
Definition: pcregrep.c:408
CConstRef< CSeq_id > GetBestId(const CBioseq &bioseq)
SAnnotSelector –.
#define _ASSERT
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Sat May 25 14:20:05 2024 by modify_doxy.py rev. 669887