NCBI C++ ToolKit
multireader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: multireader.cpp 100808 2023-09-14 15:36:59Z foleyjp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig, Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Reader for selected data file formats
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "fasta_ex.hpp"
37 
40 
46 
52 #include <objects/pub/Pub.hpp>
55 #include <objects/seq/Pubdesc.hpp>
57 #include <objects/seq/Bioseq.hpp>
59 #include <objects/general/Date.hpp>
61 
62 #include <corelib/ncbistre.hpp>
63 
64 #include <serial/iterator.hpp>
65 #include <serial/objistr.hpp>
66 #include <serial/objostr.hpp>
67 #include <serial/objostrasn.hpp>
68 #include <serial/serial.hpp>
69 #include <objects/seq/Annot_id.hpp>
73 
76 
77 #include "multireader.hpp"
78 #include "table2asn_context.hpp"
79 #include "descr_apply.hpp"
80 
82 
83 #include <corelib/stream_utils.hpp>
84 #include <common/ncbi_revision.h>
85 #include "utils.hpp"
86 
87 #ifndef NCBI_SC_VERSION
88 # define FLATFILE_PARSER_ENABLED
89 #elif (NCBI_SC_VERSION == 0)
90 # define FLATFILE_PARSER_ENABLED
91 #endif
92 
93 #ifdef FLATFILE_PARSER_ENABLED
95 #endif
96 
97 #include <common/test_assert.h> /* This header must go last */
98 
99 
100 
103 
104 
106 {
107  CTypeIterator<CSeq_loc> visitor(annot);
108 
109  CSeq_id& id = *new_id;
110  while (visitor)
111  {
112  CSeq_loc& loc = *visitor;
113 
114  if (loc.GetId()->Compare(match) == CSeq_id::e_YES)
115  {
116  loc.SetId(id);
117  }
118  ++visitor;
119  }
120 }
121 
122 
123 namespace
124 {
125 
126  struct SCSeqidCompare
127  {
128  inline
129  bool operator()(const CSeq_id* left, const CSeq_id* right) const
130  {
131  return *left < *right;
132  };
133  };
134 
135 }
136 
137 
139  CBioseq_set::GetTypeInfo(),
140  CBioseq::GetTypeInfo(),
141  CSeq_entry::GetTypeInfo(),
142  CSeq_submit::GetTypeInfo(),
143 };
144 
145 
146 CRef<CSerialObject> CMultiReader::xReadASN1Binary(CObjectIStream& pObjIstrm, const string& content_type) const
147 {
148  if (content_type == "Bioseq-set")
149  {
150  auto obj = Ref(new CSeq_entry);
151  auto& bioseq_set = obj->SetSet();
152  pObjIstrm.Read(ObjectInfo(bioseq_set));
153  return obj;
154  }
155 
156  if (content_type == "Seq-submit")
157  {
158  auto seqsubmit = Ref(new CSeq_submit);
159  pObjIstrm.Read(ObjectInfo(*seqsubmit));
160  return seqsubmit;
161  }
162 
163  if (content_type == "Seq-entry")
164  {
165  auto obj = Ref(new CSeq_entry);
166  pObjIstrm.Read(ObjectInfo(*obj));
167  return obj;
168  }
169 
170  if (content_type == "Bioseq")
171  {
172  auto obj = Ref(new CSeq_entry);
173  pObjIstrm.Read(ObjectInfo(obj->SetSeq()));
174  return obj;
175  };
176 
177  return {};
178 }
179 
181 {
182  CRef<CSeq_entry> entry;
183  CRef<CSeq_submit> submit;
184 
185  // guess object type
186  string sType;
187  try
188  {
189  sType = pObjIstrm.ReadFileHeader();
190  }
191  catch (const CEofException&)
192  {
193  sType.clear();
194  // ignore EOF exception
195  }
196 
197  // do the right thing depending on the input type
198  if (sType == CBioseq_set::GetTypeInfo()->GetName()) {
199  entry.Reset(new CSeq_entry);
200  pObjIstrm.Read(ObjectInfo(entry->SetSet()), CObjectIStream::eNoFileHeader);
201  }
202  else
203  if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
204  submit.Reset(new CSeq_submit);
205  pObjIstrm.Read(ObjectInfo(*submit), CObjectIStream::eNoFileHeader);
206 
207  if (submit->GetData().GetEntrys().size() > 1)
208  {
209  entry.Reset(new CSeq_entry);
210  entry->SetSet().SetSeq_set() = submit->GetData().GetEntrys();
211  }
212  else
213  entry = *submit->SetData().SetEntrys().begin();
214  }
215  else
216  if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
217  entry.Reset(new CSeq_entry);
218  pObjIstrm.Read(ObjectInfo(*entry), CObjectIStream::eNoFileHeader);
219  }
220  else
221  if (sType == CSeq_annot::GetTypeInfo()->GetName())
222  {
223  entry.Reset(new CSeq_entry);
224  do
225  {
226  CRef<CSeq_annot> annot(new CSeq_annot);
227  pObjIstrm.Read(ObjectInfo(*annot), CObjectIStream::eNoFileHeader);
228  entry->SetSeq().SetAnnot().push_back(annot);
229  try
230  {
231  sType = pObjIstrm.ReadFileHeader();
232  }
233  catch (const CEofException&)
234  {
235  sType.clear();
236  // ignore EOF exception
237  }
238  } while (sType == CSeq_annot::GetTypeInfo()->GetName());
239  }
240  else
241  {
242  return CRef<CSerialObject>();
243  }
244 
245  if (m_context.m_gapNmin > 0)
246  {
247  CGapsEditor gap_edit(
253  gap_edit.ConvertNs2Gaps(*entry);
254  }
255 
256  if (submit.Empty())
257  return entry;
258  else
259  return submit;
260 }
261 
262 // ----------------------------------------------------------------------------
265 // ----------------------------------------------------------------------------
266 {
267  CAlnReader reader(instream);
268  reader.SetAllGap(args["aln-gapchar"].AsString());
269  reader.SetMissing(args["aln-gapchar"].AsString());
270  if (args["aln-alphabet"].AsString() == "nuc") {
272  }
273  else {
275  }
276 
277  reader.Read(0, m_context.m_logger);
278  auto pSeqEntry =
279  reader.GetSeqEntry(
282 
283  if (pSeqEntry && args["a"]) {
285  s_StringToClass =
292 
293  auto it = s_StringToClass.find(args["a"].AsString());
294  if (it != s_StringToClass.end()) {
295  pSeqEntry->SetSet().SetClass(it->second);
296  }
297  }
298 
299  return pSeqEntry;
300 }
301 
302 
303 // ----------------------------------------------------------------------------
306  // ----------------------------------------------------------------------------
307 {
308  if (m_context.m_gapNmin > 0)
309  {
312  }
313  else
314  {
316 // | CFastaReader::fLeaveAsText;
317  }
318 
319  if (m_context.m_d_fasta)
320  {
322  }
323 
328 
331 
332 
335 
336  unique_ptr<CFastaReaderEx> pReader(new CFastaReaderEx(m_context, instream, m_iFlags));
337  if (!pReader) {
339  "File format not supported", 0);
340  }
341  if (m_context.m_gapNmin > 0)
342  {
343  pReader->SetMinGaps(m_context.m_gapNmin, m_context.m_gap_Unknown_length);
344  }
345 
346  //if (m_context.m_gap_evidences.size() > 0 || m_context.m_gap_type >= 0)
347  if (!m_context.m_GapsizeToEvidence.empty() ||
348  !m_context.m_DefaultEvidence.empty() ||
349  m_context.m_gap_type >= 0) {
350  pReader->SetGapLinkageEvidence(
354  }
355 
356  int max_seqs = kMax_Int;
358  if (m_context.m_di_fasta)
359  result = pReader->ReadDeltaFasta(m_context.m_logger);
360  else if (m_context.m_d_fasta)
361  result = pReader->ReadDeltaFasta(m_context.m_logger);
362  else
363  result = pReader->ReadSet(max_seqs, m_context.m_logger);
364 
365  if (result.NotEmpty())
366  {
368  }
369 
370  if (result->IsSet() && !m_context.m_HandleAsSet)
371  {
373  "File " + m_context.m_current_file + " contains multiple sequences",
374  *(m_context.m_logger));
375  }
376  if (result->IsSet())
377  {
378  result->SetSet().SetClass(m_context.m_ClassValue);
379  }
380 
381  return result;
382 
383 }
384 
386  // ----------------------------------------------------------------------------
387 {
388  CFormatGuessEx FG(istr);
390  //FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eFasta); // we wouldn't take "no" for an answer anyway
392  //FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eXml);
395 
396  if (!content_info) {
397  return FG.GuessFormat();
398  }
399 
400  FG.SetRecognizedGenbankTypes(kSupportedTypes);
401  return FG.GuessFormatAndContent(*content_info);
402 }
403 
405  // ----------------------------------------------------------------------------
406 {
407  CFormatGuess FG(istr);
412  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eGff2);
413  // RW-1591: Need at least GFF3 or GTF (plain or Augustus) to properly relate
414  // the features
417 #ifdef FLATFILE_PARSER_ENABLED
421 #endif
423 
424  return FG.GuessFormat();
425 }
426 
427 // ----------------------------------------------------------------------------
429  const CSerialObject& object,
430  ostream& ostr)
431  // ----------------------------------------------------------------------------
432 {
434  //<< MSerial_VerifyNo
435  << object;
436  ostr.flush();
437 }
438 
440  :m_context(context),
441  mAtSequenceData(false)
442 {
443 }
444 
445 /*
446 void CMultiReader::ApplyAdditionalProperties(CSeq_entry& entry)
447 {
448  switch(entry.Which())
449  {
450  case CSeq_entry::e_Seq:
451  if (!m_context.m_OrganismName.empty() || m_context.m_taxid != 0)
452  {
453  CBioSource::TOrg& org(CAutoAddDesc(entry.SetDescr(), CSeqdesc::e_Source).Set().SetSource().SetOrg());
454  // we should reset taxid in case new name is different
455  if (org.IsSetTaxname() && org.GetTaxId() >0 && org.GetTaxname() != m_context.m_OrganismName)
456  {
457  org.SetTaxId(0);
458  }
459 
460  if (!m_context.m_OrganismName.empty())
461  org.SetTaxname(m_context.m_OrganismName);
462  if (m_context.m_taxid != 0)
463  org.SetTaxId(m_context.m_taxid);
464  }
465  break;
466 
467  case CSeq_entry::e_Set:
468  {
469  if (!entry.GetSet().IsSetClass())
470  entry.SetSet().SetClass(CBioseq_set::eClass_genbank);
471 
472  NON_CONST_ITERATE(CBioseq_set_Base::TSeq_set, it, entry.SetSet().SetSeq_set())
473  {
474  ApplyAdditionalProperties(**it);
475  }
476  }
477  break;
478  default:
479  break;
480  }
481 }
482 */
483 
// Loads descriptors from the ASN.1 file `ifname` into `out_desc`.
//
// `out_desc` is first reset to a new, empty CSeq_descr, so whatever it held
// before the call is not kept. The file is then read object by object; the
// type name found in each file header selects the handling:
//   - CSeq_descr type name: the object is read and its members are appended;
//   - CSeqdesc type name:   the object is read and appended;
//   - CPubdesc type name:   the object is read into the Pub variant of a new
//                           CSeqdesc, which is appended.
// Any other type name raises runtime_error.
//
// The loop ends when a CException whose message equals "end of file"
// (case-insensitive) is caught; a CException with any other message is
// rethrown as runtime_error with that message appended.
484 void CMultiReader::LoadDescriptors(const string& ifname, CRef<CSeq_descr> & out_desc) const
485 {
486  out_desc.Reset(new CSeq_descr);
487 
488  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
489 
490  // guess object type
491  //const string sType = pObjIstrm->ReadFileHeader();
492 
493  // do the right thing depending on the input type
     // (one iteration per top-level object; left only via the catch below)
494  while (true) {
495  try {
496  const string sType = pObjIstrm->ReadFileHeader();
497  if (sType == CSeq_descr::GetTypeInfo()->GetName())
498  {
     // a whole descriptor list: splice a copy of its members onto the result
499  CRef<CSeq_descr> descr(new CSeq_descr);
500  pObjIstrm->Read(ObjectInfo(*descr),
502  out_desc->Set().insert(out_desc->Set().end(), descr->Get().begin(), descr->Get().end());
503  }
504  else if (sType == CSeqdesc::GetTypeInfo()->GetName())
505  {
     // a single descriptor
506  CRef<CSeqdesc> desc(new CSeqdesc);
507  pObjIstrm->Read(ObjectInfo(*desc),
509  out_desc->Set().push_back(desc);
510  }
511  else if (sType == CPubdesc::GetTypeInfo()->GetName())
512  {
     // a bare Pubdesc: wrap it in a Seqdesc by reading into its Pub variant
513  CRef<CSeqdesc> desc(new CSeqdesc);
514  pObjIstrm->Read(ObjectInfo(desc->SetPub()),
516  out_desc->Set().push_back(desc);
517  }
518  else
519  {
520  throw runtime_error("Descriptor file must contain "
521  "either Seq_descr or Seqdesc elements");
522  }
523  } catch (CException& ex) {
     // NOTE(review): end of input is recognised purely by message text;
     // confirm the stream always reports EOF with exactly this message.
524  if (!NStr::EqualNocase(ex.GetMsg(), "end of file")) {
525  throw runtime_error("Unable to read descriptor from file:" + ex.GetMsg());
526  }
527  break;
528  }
529  }
530 }
531 
// Loads the submission template from the ASN.1 file `ifname` into
// m_context.m_entry_template / m_context.m_submit_template.
//
// The type name in the first file header selects the handling: the code
// compares it against the CSeq_entry, CBioseq, CSeq_submit, CSubmit_block and
// CSeqdesc type names and raises an error (NCBI_USER_THROW_FMT) for anything
// else. Any Seqdesc objects that follow in the file are then appended to the
// template entry's descriptors.
//
// Throws runtime_error when a Seq-submit template does not hold exactly one
// Seq-entry, and with the message "The Seq-entry must be a Bioseq not a
// Bioseq-set." further down.
532 void CMultiReader::LoadTemplate(const string& ifname)
533 {
534  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
535 
536  // guess object type
537  string sType = pObjIstrm->ReadFileHeader();
538 
539  // do the right thing depending on the input type
540  if( sType == CSeq_entry::GetTypeInfo()->GetName() ) {
543  } else if( sType == CBioseq::GetTypeInfo()->GetName() ) {
     // a bare Bioseq: read it and install it as the Seq of the entry template
544  CRef<CBioseq> pBioseq( new CBioseq );
545  pObjIstrm->Read(ObjectInfo(*pBioseq), CObjectIStream::eNoFileHeader);
547  m_context.m_entry_template->SetSeq( *pBioseq );
548  } else if( sType == CSeq_submit::GetTypeInfo()->GetName() ) {
549 
     // a Seq-submit template is only accepted with exactly one Seq-entry
552  if (!m_context.m_submit_template->GetData().IsEntrys()
553  || m_context.m_submit_template->GetData().GetEntrys().size() != 1)
554  {
555  throw runtime_error("Seq-submit template must contain "
556  "exactly one Seq-entry");
557  }
558  } else if( sType == CSubmit_block::GetTypeInfo()->GetName() ) {
559 
560  // a Submit-block
561  CRef<CSubmit_block> submit_block(new CSubmit_block);
562  pObjIstrm->Read(ObjectInfo(*submit_block),
564 
565  // Build a Seq-submit containing this plus a bogus Seq-entry
     // (the placeholder entry is a raw DNA Bioseq with the local id "dummy_id")
567  m_context.m_submit_template->SetSub(*submit_block);
568  CRef<CSeq_entry> ent(new CSeq_entry);
569  CRef<CSeq_id> dummy_id(new CSeq_id("lcl|dummy_id"));
570  ent->SetSeq().SetId().push_back(dummy_id);
571  ent->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
572  ent->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
573  m_context.m_submit_template->SetData().SetEntrys().push_back(ent);
574  } else if ( sType == CSeqdesc::GetTypeInfo()->GetName()) {
575  // it's OK
     // nothing is read here: sType still names Seqdesc, so the
     // "Seqdesc's that follow" loop below consumes this object first
576  } else {
577  NCBI_USER_THROW_FMT("Template must be Seq-entry, Seq-submit, Bioseq or "
578  "Submit-block. Object seems to be of type: " << sType);
579  }
580 
581  // for submit types, pull out the seq-entry inside and remember it
583  m_context.m_entry_template = m_context.m_submit_template->SetData().SetEntrys().front();
584  }
585 
586  // The template may contain a set rather than a seq.
587  // That's OK if it contains only one na entry, which we'll use.
589  {
     // Only the first set member that has descriptors is considered (note the
     // break at the end of the if-body); from it, only Pub and Source
     // descriptors are copied into tmp.
591  for(auto ent_iter: m_context.m_entry_template->GetSet().GetSeq_set())
592  {
593  const CSeq_descr* descr = nullptr;
594  if (ent_iter->IsSetDescr())
595  {
596  descr = &ent_iter->GetDescr();
597  }
598  if (descr)
599  {
600  //tmp->Assign(**ent_iter);
601  tmp->SetSeq().SetInst();
602  // Copy any descriptors from the set to the sequence
603  ITERATE(CBioseq_set::TDescr::Tdata, desc_iter, descr->Get())
604  {
     // filter: fall out of the switch for Pub/Source, skip everything else
605  switch ((*desc_iter)->Which())
606  {
607  case CSeqdesc::e_Pub:
608  case CSeqdesc::e_Source:
609  break;
610  default:
611  continue;
612  }
     // deep copy (Assign), so tmp does not share the descriptor object
613  CRef<CSeqdesc> desc(new CSeqdesc);
614  desc->Assign(**desc_iter);
615  tmp->SetSeq().SetDescr().Set().push_back(desc);
616  }
617  break;
618  }
619  }
620 
621  if (tmp->IsSetDescr() && !tmp->GetDescr().Get().empty())
623 
624  }
625 
626  // incorporate any Seqdesc's that follow in the file
627  if (!pObjIstrm->EndOfData())
628  {
     // If the first object was itself a Seqdesc its header is already in
     // sType; otherwise fetch the header of the next object.
     // NOTE(review): unlike the header read at the bottom of the loop, this
     // one is not guarded against CEofException - confirm EndOfData() is
     // a sufficient check here.
629  if (sType != CSeqdesc::GetTypeInfo()->GetName())
630  sType = pObjIstrm->ReadFileHeader();
631 
632  while (sType == CSeqdesc::GetTypeInfo()->GetName()) {
633  CRef<CSeqdesc> desc(new CSeqdesc);
634  pObjIstrm->Read(ObjectInfo(*desc), CObjectIStream::eNoFileHeader);
635 
638 
639  {
     // DBLink user objects get SetData() called on them; presumably this
     // forces the data field to be marked as set - TODO confirm intent.
640  if (desc->IsUser() && desc->GetUser().IsDBLink())
641  {
642  CUser_object& user_obj = desc->SetUser();
643  if (user_obj.IsDBLink())
644  {
645  user_obj.SetData();
646  }
647  }
648  }
649 
650  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(desc);
651 
652  if (pObjIstrm->EndOfData())
653  break;
654 
     // a CEofException while looking for the next header simply ends the loop
655  try {
656  sType = pObjIstrm->ReadFileHeader();
657  }
658  catch(CEofException&) {
659  break;
660  }
661  }
662  }
663 
664 #if 0
665  if ( m_context.m_submit_template->IsEntrys() ) {
666  // Take Seq-submit.sub.cit and put it in the Bioseq
667  CRef<CPub> pub(new CPub);
668  pub->SetSub().Assign(context.m_submit_template->GetSub().GetCit());
669  CRef<CSeqdesc> pub_desc(new CSeqdesc);
670  pub_desc->SetPub().SetPub().Set().push_back(pub);
671  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(pub_desc);
672  }
673 #endif
674 
676  throw runtime_error("The Seq-entry must be a Bioseq not a Bioseq-set.");
677  }
678 
680  {
     // when the submit template carries a citation, its date is overwritten
681  if (m_context.m_submit_template->IsSetSub() &&
682  m_context.m_submit_template->GetSub().IsSetCit())
683  {
685  m_context.m_submit_template->SetSub().SetCit().SetDate(*date);
686  }
687  }
688 
689 #if 0
690  if( args["output-type"].AsString() == "Seq-entry" ) {
691  // force Seq-entry by throwing out the Seq-submit
693  }
694 #endif
695 }
696 
697 namespace
698 {
699  class AllowedDuplicates: public set<CSeqdesc_Base::E_Choice>
700  {
701  public:
702  AllowedDuplicates()
703  {
704  insert(CSeqdesc_Base::e_User);
705  }
706  };
707  AllowedDuplicates m_allowed_duplicates;
708 
709  template<typename _which>
710  struct LocateWhich
711  {
712  typename _which::E_Choice compare_to;
713  bool operator() (_which l) const
714  {
715  return l.Which() == compare_to;
716  }
717  bool operator() (const CRef<_which>& l) const
718  {
719  return l->Which() == compare_to;
720  }
721  };
722 }
723 
725 {
726  ITERATE(CSeq_descr::Tdata, it, source.Get())
727  {
728  MergeDescriptors(dest, **it);
729  }
730 }
731 
732 void CMultiReader::MergeDescriptors(CSeq_descr & dest, const CSeqdesc & source) const
733 {
734  bool duplicates = (m_allowed_duplicates.find(source.Which()) != m_allowed_duplicates.end());
735 
736  CAutoAddDesc desc(dest, source.Which());
737  desc.Set(duplicates).Assign(source);
738 }
739 
741 {
742  MergeDescriptors(entry.SetDescr(), source);
743  //g_ApplyDescriptors(source.Get(), entry);
744 }
745 
746 namespace
747 {
748  void CopyDescr(CSeq_entry& dest, const CSeq_entry& src)
749  {
750  if (src.IsSetDescr() && !src.GetDescr().Get().empty())
751  {
752  dest.SetDescr().Set().insert(dest.SetDescr().Set().end(),
753  src.GetDescr().Get().begin(),
754  src.GetDescr().Get().end());
755  }
756  }
757  void CopyAnnot(CSeq_entry& dest, const CSeq_entry& src)
758  {
759  if (src.IsSetAnnot() && !src.GetAnnot().empty())
760  {
761  dest.SetAnnot().insert(dest.SetAnnot().end(),
762  src.GetAnnot().begin(),
763  src.GetAnnot().end());
764  }
765  }
766 }
767 
// Reads the GFF3 annotations at the start of `in` into `annots`.
//
// The annotations are read with xReadGFF3() with post-processing switched
// off; AtSeqenceData() is then consulted, and "Specified GFF3 file does not
// include any sequence data" is reported when it returns false. Only after
// that check are the annotations post-processed with x_PostProcessAnnots().
768 void CMultiReader::LoadGFF3Fasta(istream& in, TAnnots& annots)
769 {
770  LOG_POST("Recognized input file as format: " << CFormatGuess::GetFormatName(CFormatGuess::eGff3));
771 
     // post-processing is deferred until the sequence-data check has passed
772  bool post_process = false;
773  annots = xReadGFF3(in, post_process); // initializes m_gff3_reader!
774  if (!AtSeqenceData()) {
776  "Specified GFF3 file does not include any sequence data", 0);
777  }
778  x_PostProcessAnnots(annots);
779 }
780 
781 
782 void CMultiReader::LoadGFF3Fasta(istream& in, TAnnotMap& annotMap)
783 {
784  TAnnots annots;
785  LoadGFF3Fasta(in, annots);
786  for (auto pAnnot : annots) {
787  AddAnnotToMap(pAnnot, annotMap);
788  }
789 }
790 
791 
793  const string& objectType,
794  unique_ptr<istream>& pIstr,
795  TAnnotMap& annotMap)
796 {
797  CRef<CSerialObject> pInputObject;
798  switch (format) {
801  pInputObject = xReadASN1Binary(*m_obj_stream, objectType);
802  break;
805  pInputObject = xReadASN1Text(*m_obj_stream);
806  break;
807  case CFormatGuess::eGff3:
808  LoadGFF3Fasta(*pIstr, annotMap);
809  case CFormatGuess::eFasta: // What about buffered input?
810  default:
812  pInputObject = xReadFasta(*pIstr);
813  }
814 
815  if (!pInputObject) {
817  "File format not recognized", 0);
818  }
819  // RW-617: apply template descriptors only if input is *not* ASN1:
820  // What about binary ASN.1?
821  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
822  return xApplyTemplate(pInputObject, merge_template_descriptors);
823 }
824 
825 
// Opens `filename`, determines its format and reads the sequence data.
//
// On return `input_sequence` holds the object that was read, after it has
// been passed through xApplyTemplate(). For GFF3 input the annotations found
// in the file are stored in `annotMap`. Returns the detected format.
// "File format not recognized" is reported when nothing could be read.
826 CFormatGuess::EFormat CMultiReader::OpenFile(const string& filename, CRef<CSerialObject>& input_sequence, TAnnotMap& annotMap)
827 {
829  CFileContentInfo content_info;
830  {
     // the format is probed on a stream of its own, closed at the end of
     // this scope; the readers below work on separate streams
831  unique_ptr<istream> istream(new CNcbiIfstream(filename));
832  format = xInputGetFormat(*istream, &content_info);
833  }
834 
835  switch (format)
836  {
839  input_sequence = xReadASN1Binary(*m_obj_stream, content_info.mInfoGenbank.mObjectType);
840  break;
843  input_sequence = xReadASN1Text(*m_obj_stream);
844  break;
845  case CFormatGuess::eGff3:
846  {
     // the same stream is handed first to the GFF3 loader and then to the
     // FASTA reader - presumably the FASTA part follows the GFF3 part in
     // the file and reading simply continues; verify.
847  unique_ptr<istream> in(new CNcbiIfstream(filename));
848  LoadGFF3Fasta(*in, annotMap);
849  m_iFlags = 0;
851  input_sequence = xReadFasta(*in);
852  }
853  break;
854  default: // RW-616 - Assume FASTA
855  {
857  m_iFlags = 0;
859 
860  CBufferedInput istream;
861  istream.get().open(filename);
862  input_sequence = xReadFasta(istream);
863  }
864  break;
865  }
866  if (input_sequence.Empty())
868  "File format not recognized", 0);
869  //rw-617: apply template descriptors only if input is *not* ASN1:
     // NOTE(review): only eTextASN switches the merge off; binary ASN.1
     // input still gets template descriptors merged - confirm intended.
870  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
871  input_sequence = xApplyTemplate(input_sequence, merge_template_descriptors);
872 
873  return format;
874 }
875 
877 {
878  if (obj->GetThisTypeInfo() == CSeq_submit::GetTypeInfo())
879  {
880  submit.Reset(static_cast<CSeq_submit*>(obj.GetPointer()));
881  entry = submit->SetData().SetEntrys().front();
882  }
883  else
884  if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
885  entry.Reset(static_cast<CSeq_entry*>(obj.GetPointer()));
886  }
887 }
888 
// Normalises a freshly read object and applies the template to it.
//
// `obj` is split into its Seq-entry and (if present) Seq-submit with
// GetSeqEntry(). When there is no Seq-submit and the entry is a set whose
// first member is a Bioseq, that member replaces the set, after the set's
// descriptors and annotations have been copied onto it. The entry is then
// re-parented. With `merge_template_descriptors` false, a warning is logged
// instead of merging, provided m_context.m_t and a logger are set.
//
// Returns the Seq-submit when there is one, otherwise the (possibly
// replaced) Seq-entry; the latter may be an empty reference.
889 CRef<CSerialObject> CMultiReader::xApplyTemplate(CRef<CSerialObject> obj, bool merge_template_descriptors) const
890 {
891  CRef<CSeq_entry> entry;
892  CRef<CSeq_submit> submit;
893 
894  GetSeqEntry(entry, submit, obj);
895 
896  if (entry.NotEmpty()) // &&
897  {
     // NOTE(review): "size() < 2" also admits an empty set, in which case
     // front() would be called on an empty container - confirm that an empty
     // Seq_set cannot reach this point, or tighten the test to "== 1".
898  if (submit.Empty())
899  if (entry->IsSet() && entry->GetSet().GetSeq_set().size() < 2 &&
900  entry->GetSet().GetSeq_set().front()->IsSeq())
901  {
     // collapse the single-member set into its Bioseq, keeping the set-level
     // descriptors and annotations
902  CRef<CSeq_entry> seq = entry->SetSet().SetSeq_set().front();
903  CopyDescr(*seq, *entry);
904  CopyAnnot(*seq, *entry);
905  entry = seq;
906  }
907  entry->ResetParentEntry();
908  entry->Parentize();
909 
910  if (merge_template_descriptors) {
912  }
913  else {
914  if (m_context.m_t && m_context.m_logger) {
915  string msg(
916  "Template file descriptors are ignored if input is ASN.1");
917  m_context.m_logger->PutError(
918  *unique_ptr<CLineError>(
920  eDiag_Warning, "", 0, "", "", "", msg)));
921  }
922  }
923  }
924 
925  if (submit.Empty())
926  return entry;
927  else
928  return submit;
929 }
930 
932 {
933  if (m_obj_stream)
934  return xReadASN1Text(*m_obj_stream);
935  else
936  return CRef<CSerialObject>();
937 }
938 
940 {
941  int flags = 0;
946 
947  CReaderListener readerListener;
948  CGff3Reader reader(flags, m_AnnotName, m_AnnotTitle, CReadUtil::AsSeqId, &readerListener);
949 
950  CStreamLineReader lr(instream);
951  TAnnots annots;
952 
953  try {
954  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
955  m_gff3_merger = reader.GetLocationMerger();
956  mAtSequenceData = reader.AtSequenceData();
957 
958  if (post_process) {
959  x_PostProcessAnnots(annots);
960  }
961 
962  for (const auto& msg : readerListener) {
963  m_context.m_logger->PutMessage(msg);
964  }
965  }
966  catch (const CReaderMessage& msg) {
967  m_context.m_logger->PutMessage(msg);
968  }
969 
970  return annots;
971 }
972 
973 
975 {
976  for (auto pFeat : ftable) {
977  if (pFeat->IsSetDbxref()) {
978  auto& dbxrefs = pFeat->SetDbxref();
979  auto it = remove_if(dbxrefs.begin(), dbxrefs.end(),
980  [](const CRef<CDbtag>& pDbtag) {
981  return(pDbtag && pDbtag->IsSetDb() &&
982  NStr::EqualNocase(pDbtag->GetDb(), "GenBank"));
983  });
984  dbxrefs.erase(it, dbxrefs.end());
985  if (dbxrefs.empty()) {
986  pFeat->ResetDbxref();
987  }
988  }
989  }
990 }
991 
992 
994 {
995  unsigned int startingLocusTagNumber = 1;
996  unsigned int startingFeatureId = 1;
997  for (auto it = annots.begin(); it != annots.end(); ++it) {
998 
999  auto& annot = **it;
1000  auto& data = annot.SetData();
1001  if (!data.IsFtable() || data.GetFtable().empty()) {
1002  continue; // all that follows applies to feature tables only
1003  }
1004 
1005  s_RemoveGenBankDbxrefs(data.SetFtable()); // RW-1861
1006 
1007  edit::CFeatTableEdit fte(
1008  annot, 0, m_context.m_locus_tag_prefix, startingLocusTagNumber, startingFeatureId, m_context.m_logger);
1009  //fte.InferPartials();
1010  fte.GenerateMissingParentFeatures(m_context.m_eukaryote, m_gff3_merger.get());
1012  if (m_context.m_locus_tag_prefix.empty() && !fte.AnnotHasAllLocusTags()) {
1013  NCBI_THROW(CArgException, eNoArg,
1014  "GFF annotation requires locus tags, which are missing from one or more genes, so the command line argument -locus-tag-prefix is needed");
1015  }
1016  fte.GenerateLocusTags();
1017  }
1018  fte.GenerateProteinAndTranscriptIds();
1019  //fte.InstantiateProducts();
1020  fte.ProcessCodonRecognized();
1021  fte.EliminateBadQualifiers();
1022  fte.SubmitFixProducts();
1023 
1024  startingLocusTagNumber = fte.PendingLocusTagNumber();
1025  startingFeatureId = fte.PendingFeatureId();
1026  }
1027 }
1028 
1029 
1030 
1031 
1032 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(const string& filename) const
1033 {
1034  unique_ptr<istream> instream(new CNcbiIfstream(filename));
1035  return xCreateASNStream(CFormatGuess::eUnknown, instream);
1036 }
1037 
// Wraps `instream` in a CObjectIStream of the matching serial format.
//
// The format is obtained from xInputGetFormat() on the stream and mapped to
// an ESerialDataFormat (binary ASN.1, text ASN.1 or XML); the default branch
// reports "Descriptor file seems to be in an unsupported format: ...".
//
// Ownership: the raw stream is released from `instream` and given to the
// object stream with eTakeOwnership, so `instream` is empty once the object
// stream has been created.
1038 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(CFormatGuess::EFormat format, unique_ptr<istream>& instream) const
1039 {
1040  // guess format
1041  ESerialDataFormat eSerialDataFormat = eSerial_None;
1042  {{
1044  format = xInputGetFormat(*instream);
1045 
1046  switch(format) {
1048  eSerialDataFormat = eSerial_AsnBinary;
1049  break;
1052  eSerialDataFormat = eSerial_AsnText;
1053  break;
1054  case CFormatGuess::eXml:
1055  eSerialDataFormat = eSerial_Xml;
1056  break;
1057  default:
1059  "Descriptor file seems to be in an unsupported format: "
1061  break;
1062  }
1063 
1064  //instream.seekg(0);
1065  }}
1066 
      // NOTE(review): release() runs before Open(); should Open() throw, the
      // raw stream would no longer be owned by anything - confirm Open()
      // cannot fail here or that it cleans up on failure.
1067  unique_ptr<CObjectIStream> pObjIstrm(
1068  CObjectIStream::Open(eSerialDataFormat, *instream.release(), eTakeOwnership));
1069 
1070  return pObjIstrm;
1071 }
1072 
1074 {
1075 }
1076 
1078 {
1079 public:
1081 
1082 
1083  bool Init(const TAnnots& annots) {
1084  if (annots.empty()) {
1085  return false;
1086  }
1087 
1088  m_Annots = annots;
1089  m_annot_iterator = m_Annots.begin();
1090  return true;
1091  }
1092 
1093  bool Init(const string& seqid_prefix, unique_ptr<istream>& instream, ILineErrorListener* logger)
1094  {
1095  m_seqid_prefix = seqid_prefix;
1097  instream.release();
1098  m_logger = logger;
1099  return true;
1100  }
1101 
1103  {
1104  if (!m_Annots.empty())
1105  {
1106  if (m_annot_iterator != m_Annots.end())
1107  {
1108  return *m_annot_iterator++;
1109  }
1110  }
1111  else
1112  if (m_line_reader.NotEmpty())
1113  {
1114  while (!m_line_reader->AtEOF()) {
1116  *m_line_reader,
1121  m_logger, nullptr/*filter*/, m_seqid_prefix);
1122 
1123  if (annot.NotEmpty() && annot->IsSetData() && annot->GetData().IsFtable() &&
1124  !annot->GetData().GetFtable().empty()) {
1125  return annot;
1126  }
1127  }
1128  }
1129  return CRef<CSeq_annot>();
1130  }
1131 
1132 private:
1134  TAnnots::iterator m_annot_iterator;
1138 };
1139 
// Prepares `loader` for delivering the annotations stored in `filename`.
//
// When the format in uFormat is eUnknown, the lower-cased file extension is
// used as a fallback (.gff/.gff3, .gtf, .tbl, .asn/.sqn/.sap). Depending on
// the format the loader is either initialised directly with the open stream
// (it then returns the result of loader.Init()), or the annotations are read
// up front into a list that is handed to the loader.
//
// Returns true when the loader was initialised with at least one annotation
// (or when the stream-based Init() returned true), false when the file
// yielded no annotations. An unrecognised format is reported with
// "Annotation file format not recognized. ...".
1140 bool CMultiReader::xGetAnnotLoader(CAnnotationLoader& loader, const string& filename)
1141 {
1142  unique_ptr<istream> in(new CNcbiIfstream(filename));
1143 
1145 
1146  if (uFormat == CFormatGuess::eUnknown)
1147  {
      // content sniffing failed: fall back on the filename suffix
1148  string ext;
1149  CDirEntry::SplitPath(filename, nullptr, nullptr, &ext);
1150  NStr::ToLower(ext);
1151  if (ext == ".gff" || ext == ".gff3")
1152  uFormat = CFormatGuess::eGff3;
1153  else
1154  if (ext == ".gtf")
1155  uFormat = CFormatGuess::eGtf;
1156  else
1157  if (ext == ".tbl")
1159  else
1160  if (ext == ".asn" || ext == ".sqn" || ext == ".sap")
1161  uFormat = CFormatGuess::eTextASN;
1162 
1163  if (uFormat != CFormatGuess::eUnknown)
1164  {
1165  LOG_POST("Presuming annotation format by filename suffix: "
1166  << CFormatGuess::GetFormatName(uFormat));
1167  }
1168  }
1169  else
1170  {
1171  LOG_POST("Recognized annotation format: " << CFormatGuess::GetFormatName(uFormat));
1172  }
1173 
1174  TAnnots annots;
1175  switch (uFormat)
1176  {
1178  {
      // streaming case: the loader is initialised with the open stream and a
      // "gnl|<genome center id>|" seq-id prefix when a genome center id is
      // configured (empty prefix otherwise)
1179  string seqid_prefix;
1180  if (!m_context.m_genome_center_id.empty())
1181  seqid_prefix = "gnl|" + m_context.m_genome_center_id + "|";
1182  return loader.Init(seqid_prefix, in, m_context.m_logger);
1183  }
1184  break;
1186  {
      // ASN.1 case: read the object and take the annotations of its Seq-entry
1187  auto obj_stream = xCreateASNStream(uFormat, in);
1188  CRef<CSerialObject> obj = xReadASN1Text(*obj_stream);
1189  CRef<CSeq_submit> unused;
1190  CRef<CSeq_entry> pEntry;
1191  GetSeqEntry(pEntry, unused, obj);
1192  if (pEntry && pEntry->IsSetAnnot()) {
1193  annots = pEntry->GetAnnot();
1194  }
1195  }
1196  break;
1197  case CFormatGuess::eGff3:
      // second argument true: annotations are post-processed by the reader
1198  annots = xReadGFF3(*in, true);
1199  break;
1200  case CFormatGuess::eGtf:
1202  annots = xReadGTF(*in);
1203  break;
1204 #ifdef FLATFILE_PARSER_ENABLED
      // the flat-file reader is given the filename, so the stream opened
      // above is closed first
1208  in.reset();
1209  {
1210  auto pEntry = xReadFlatfile(uFormat, filename);
1211  if (pEntry && pEntry->IsSetAnnot()) {
1212  annots = pEntry->GetAnnot();
1213  }
1214  }
1215  break;
1216 #endif
1217 
1218  default:
1220  "Annotation file format not recognized. Run format validator on your annotation file", 1);
1221  }
1222 
      // pre-read case: hand the collected annotations to the loader
1223  if (!annots.empty()) {
1224  loader.Init(annots);
1225  return true;
1226  }
1227  return false;
1228 }
1229 
1230 
1232 {
1233  CRef<CSeq_id> pAnnotId;
1234  if (annot.IsSetId())
1235  {
1236  pAnnotId.Reset(new CSeq_id());
1237  const CAnnot_id& firstId = *(annot.GetId().front());
1238  if (firstId.IsLocal()) {
1239  pAnnotId->SetLocal().Assign(firstId.GetLocal());
1240  }
1241  else if (firstId.IsGeneral())
1242  {
1243  pAnnotId->SetGeneral().Assign(firstId.GetGeneral());
1244  }
1245  else {
1246  return pAnnotId;
1247  }
1248  }
1249  else if (!annot.GetData().GetFtable().empty())
1250  {
1251  // get a reference to CSeq_id instance, we'd need to update it recently
1252  // 5 column feature reader has a single shared instance for all features
1253  // update one at once would change all the features
1254  pAnnotId.Reset(const_cast<CSeq_id*>(annot.GetData().GetFtable().front()->GetLocation().GetId()));
1255  }
1256 
1257  return pAnnotId;
1258 }
1259 
1260 
1261 
1263 {
1264  auto pAnnotId = s_GetAnnotId(*pAnnot);
1265  if (!pAnnotId) {
1266  return;
1267  }
1268 
1269  auto idString = pAnnotId->GetSeqIdString();
1270  NStr::ToLower(idString);
1271  auto it = annotMap.find(idString);
1272  if (it == annotMap.end()) {
1273  annotMap.emplace(idString, list<CRef<CSeq_annot>>{pAnnot});
1274  }
1275  else {
1276  it->second.push_back(pAnnot);
1277  }
1278 }
1279 
1280 
1281 void CMultiReader::LoadAnnotMap(const string& filename, TAnnotMap& annotMap)
1282 {
1283  CAnnotationLoader annot_loader;
1284  if (!xGetAnnotLoader(annot_loader, filename)) {
1285  return;
1286  }
1287 
1288  CRef<CSeq_annot> pAnnot;
1289  while ((pAnnot = annot_loader.GetNextAnnot()).NotEmpty()) {
1290  AddAnnotToMap(pAnnot, annotMap);
1291  }
1292 }
1293 
1294 
1295 
1297 {
1298 
1299  CRef<CSeq_annot> pBioseqAnnot;
1300 
1301  if (bioseq.IsSetAnnot()) {
1302  auto& bioseqAnnots = bioseq.SetAnnot();
1303  auto it = find_if(bioseqAnnots.begin(),
1304  bioseqAnnots.end(),
1305  [](CRef<CSeq_annot> pAnnot)
1306  {
1307  return (pAnnot && pAnnot->IsFtable());
1308  });
1309  if (it != bioseqAnnots.end()) {
1310  pBioseqAnnot = *it;
1311  }
1312  }
1313  return pBioseqAnnot;
1314 }
1315 
1317  list<CRef<CSeq_annot>>& annots,
1318  CBioseq& bioseq,
1319  CRef<CSeq_annot>& pBioseqAnnot)
1320 {
1321  if (pBioseqAnnot) {
1322  for (auto pAnnot : annots) {
1323  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1324  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
1325  }
1326  return;
1327  }
1328 
1329  pBioseqAnnot = s_GetBioseqAnnot(bioseq);
1330 
1331  if (!pBioseqAnnot) {
1332  pBioseqAnnot = annots.front();
1333  bioseq.SetAnnot().push_back(pBioseqAnnot);
1334  auto it = next(annots.begin());
1335  while (it != annots.end()) {
1336  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1337  featEdit.MergeFeatures((*it)->SetData().SetFtable());
1338  ++it;
1339  }
1340  }
1341  else {
1342  for (auto pAnnot : annots) {
1343  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1344  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
1345  }
1346  }
1347 }
1348 
1349 static bool s_HasPrefixMatch(
1350  const string& idString,
1351  CMultiReader::TAnnotMap& annotMap,
1353 {
1354  matchMap.clear();
1355  auto it = annotMap.lower_bound(idString);
1356  while (it != annotMap.end() && NStr::StartsWith(it->first, idString)) {
1357  matchMap.emplace(it->first, it);
1358  ++it;
1359  }
1360  return !matchMap.empty();
1361 }
1362 
1364  bool matchVersions,
1365  const string& idString,
1366  CMultiReader::TAnnotMap& annotMap,
1367  set<string>& matchedAnnots,
1368  list<CRef<CSeq_annot>>& annots) const
1369 {
1370  if (matchVersions) {
1371  return x_HasExactMatch(idString, annotMap, matchedAnnots, annots);
1372  }
1373 
1374  bool hasMatch = false;
1376  shared_lock<shared_mutex> sLock{m_Mutex};
1377  if (!s_HasPrefixMatch(idString, annotMap, matchMap)) {
1378  return false;
1379  }
1380  sLock.unlock();
1381  {
1382  unique_lock<shared_mutex> uLock{m_Mutex};
1383  for (auto match : matchMap) {
1384  const auto& annotId = match.first;
1385  auto it = match.second;
1386  if (matchedAnnots.insert(annotId).second) {
1387  hasMatch = true;
1388  annots.splice(annots.end(), it->second);
1389  annotMap.erase(it);
1390  }
1391  }
1392  }
1393 
1394  return hasMatch;
1395 }
1396 
1398  const string& idString,
1399  CMultiReader::TAnnotMap& annotMap,
1400  set<string>& matchedAnnots,
1401  list<CRef<CSeq_annot>>& annots) const
1402 {
1403  shared_lock<shared_mutex> sLock{m_Mutex};
1404  auto it = annotMap.find(idString);
1405  if (it == annotMap.end()) {
1406  return false;
1407  }
1408  string annotId = it->first;
1409  sLock.unlock();
1410 
1411  {
1412  unique_lock<shared_mutex> uLock{m_Mutex};
1413  if (matchedAnnots.insert(annotId).second) {
1414  annots = move(it->second);
1415  annotMap.erase(it);
1416  return true;
1417  }
1418  }
1419 
1420  return false;
1421 }
1422 
1423 
1425  set<string>& matchedAnnots,
1426  CBioseq& bioseq) const
1427 {
1428  CRef<CSeq_annot> pBioseqAnnot;
1429  for (auto pSeqId : bioseq.GetId()) {
1430  list<CRef<CSeq_annot>> annots;
1431  bool hasMatch = false;
1432  bool matchVersions = (pSeqId->GetTextseq_Id() == nullptr);
1433  auto idString = pSeqId->GetSeqIdString();
1434  NStr::ToLower(idString);
1435  hasMatch = x_HasMatch(matchVersions, idString, annotMap, matchedAnnots, annots);
1436 
1437  if (!hasMatch &&
1438  pSeqId->IsGeneral() &&
1439  pSeqId->GetGeneral().IsSetDb() &&
1440  (pSeqId->GetGeneral().GetDb() == m_context.m_genome_center_id) &&
1441  pSeqId->GetGeneral().IsSetTag() && pSeqId->GetGeneral().GetTag().IsStr()) {
1442  matchVersions = true;
1443  idString = pSeqId->GetGeneral().GetTag().GetStr();
1444  NStr::ToLower(idString);
1445  hasMatch = x_HasMatch(matchVersions, idString, annotMap, matchedAnnots, annots);
1446  }
1447 
1448  if (!hasMatch) {
1449  continue;
1450  }
1451 
1452 
1453  for (auto pAnnot : annots) {
1454  auto pAnnotId = s_GetAnnotId(*pAnnot);
1455  g_ModifySeqIds(*pAnnot, *pAnnotId, pSeqId);
1456  }
1457 
1458  s_AddAnnotsToBioseq(annots, bioseq, pBioseqAnnot);
1459  }
1460 }
1461 
1462 
1463 
// xReadGTF: reads Seq-annots from `instream` via `reader`, post-processes
// them with x_PostProcessAnnots() and returns them.
// NOTE(review): this listing is missing the signature line, the lines that
// adjust `flags`, and the declaration of `reader` (the embedded numbering
// skips 1464, 1467-1469 and 1471). The cross-reference index gives the
// signature as `TAnnots xReadGTF(CNcbiIstream &instream)`.
1465 {
1466  int flags = 0;
1470 
1472  // Wrap the input stream in a line reader for ReadSeqAnnots().
1472  CStreamLineReader lr(instream);
1473  TAnnots annots;
1474  try {
1475  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
1476  x_PostProcessAnnots(annots);
1477  }
1478  // A CReaderMessage is reported to the context logger and not rethrown;
1478  // whatever `annots` holds at that point is still returned below.
1478  catch (CReaderMessage& msg) {
1479  m_context.m_logger->PutMessage(msg);
1480  }
1481 
1482  return annots;
1483 }
1484 
// xReadFlatfile (compiled only when FLATFILE_PARSER_ENABLED is defined):
// configures a Parser for the requested flat-file format, parses `filename`
// through `ffparser`, and returns a new CSeq_entry holding the annotations
// gathered from the parsed Bioseq-set. Returns an empty result when the parse
// yields nothing, yields something other than a CBioseq_set, or the new entry
// reports no annotations.
// NOTE(review): this listing is missing the signature line, the three `case`
// labels, the first line of the `default:` error statement and the
// declaration of `ffparser` (the embedded numbering skips 1486, 1491, 1496,
// 1502, 1508 and 1521). The cross-reference index gives the signature as
// `CRef<objects::CSeq_entry> xReadFlatfile(CFormatGuess::EFormat format,
// const string &filename)`.
1485 #ifdef FLATFILE_PARSER_ENABLED
1487 {
1488  unique_ptr<Parser> pp(new Parser);
1489  switch (format)
1490  {
1492  // GenBank settings (case label not shown in this listing).
1492  pp->format = Parser::EFormat::GenBank;
1493  pp->source = Parser::ESource::GenBank;
1494  pp->seqtype = CSeq_id::e_Genbank;
1495  break;
1497  // EMBL settings (case label not shown in this listing).
1497  pp->format = Parser::EFormat::EMBL;
1498  pp->source = Parser::ESource::EMBL;
1499  pp->acprefix = ParFlat_EMBL_AC;
1500  pp->seqtype = CSeq_id::e_Embl;
1501  break;
1503  // Swiss-Prot settings (case label not shown in this listing).
1503  pp->format = Parser::EFormat::SPROT;
1504  pp->source = Parser::ESource::SPROT;
1505  pp->seqtype = CSeq_id::e_Swissprot;
1506  break;
1507  default:
1509  // Any other format is rejected; the message names the offending file.
1509  // NOTE(review): the statement's opening line is missing here -
1509  // presumably a throw macro; confirm against the original source.
1509  "This flat file format is not supported: " + filename, 0);
1510  break;
1511  }
1512 /*
1513 #ifdef WIN32
1514  pp->ifp = fopen(filename.c_str(), "rb");
1515 #else
1516  pp->ifp = fopen(filename.c_str(), "r");
1517 #endif
1518 */
1519  pp->output_format = Parser::EOutput::BioseqSet;
1520 
1522  auto obj = ffparser.Parse(*pp, filename);
1523  if (obj.NotEmpty())
1524  {
1525  // Only a CBioseq_set result is handled; other types fall through
1525  // to the empty return at the end.
1525  if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo())
1526  {
1527  auto bioseq_set = Ref(CTypeConverter<CBioseq_set>::SafeCast(obj.GetPointerOrNull()));
1528  auto entry = Ref(new CSeq_entry);
1529  entry->SetSeq();
1530  auto& annot = entry->SetAnnot();
1531  // Move the annotations of every member of the set into the new entry.
1531  for (auto& bioseq : bioseq_set->SetSeq_set())
1532  {
1533  if (bioseq->IsSetAnnot())
1534  annot.splice(annot.end(), bioseq->SetAnnot());
1535  }
1536  if (entry->IsSetAnnot())
1537  return entry;
1538  }
1539  }
1540  return {};
1541 }
1542 #endif
1543 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
Definition: utils.cpp:41
#define false
Definition: bool.h:36
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetAlphabet(const string &value)
Definition: aln_reader.hpp:371
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:722
void SetMissing(const string &value)
Definition: aln_reader.hpp:192
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Definition: aln_reader.hpp:433
CAnnot_id –.
Definition: Annot_id.hpp:66
TAnnots::iterator m_annot_iterator
CMultiReader::TAnnots TAnnots
ILineErrorListener * m_logger
CRef< CSeq_annot > GetNextAnnot()
CRef< ILineReader > m_line_reader
bool Init(const string &seqid_prefix, unique_ptr< istream > &instream, ILineErrorListener *logger)
bool Init(const TAnnots &annots)
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
_Stream & get()
Definition: utils.hpp:54
Definition: Date.hpp:53
@ ePrecision_day
Definition: Date.hpp:58
Modification of the CFastaReader class that allows for reading a single sequence as a degenerate mult...
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
Definition: readfeat.hpp:75
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
Definition: readfeat.hpp:74
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
Definition: readfeat.hpp:71
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
Definition: readfeat.hpp:76
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
Definition: readfeat.cpp:3714
CRef< CSerialObject > Parse(Parser &parseInfo)
Definition: ftamain.cpp:721
Wraps CFormatGuess, and if CFormatGuess's result is Unknown, it tries every file reader until one wor...
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
CFormatGuess::CFormatHints & GetFormatHints(void)
Get format hints.
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
CFormatGuess::EFormat GuessFormat()
CFormatHints & AddPreferredFormat(TFormat fmt)
Mark the format as preferred.
CFormatHints & DisableAllNonpreferred(void)
Disable all formats not marked as preferred.
Class implements different ad-hoc unreliable file format identifications.
CFormatHints & GetFormatHints(void)
Get format hints.
EFormat
The formats are checked in the same order as declared here.
@ eFiveColFeatureTable
Five-column feature table.
@ eBinaryASN
Binary ASN.1.
@ eGtf
New GTF, CGtfReader.
@ eGff3
GFF3, CGff3Reader.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eGffAugustus
GFFish output of Augustus Gene Prediction.
@ eTextASN
Text ASN.1.
EFormat GuessFormat(EMode)
static const char * GetFormatName(EFormat format)
void ConvertNs2Gaps(CSeq_entry &entry)
Definition: gaps_edit.cpp:403
bool AtSequenceData() const
void ReadSeqAnnots(TAnnotList &, CNcbiIstream &, ILineErrorListener *=nullptr) override
Read all objects from given input stream, returning them as a vector of Seq-annots.
shared_ptr< CGff3LocationMerger > GetLocationMerger()
@ fGenerateChildXrefs
Definition: gtf_reader.hpp:218
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:42
string m_AnnotName
Definition: multireader.hpp:92
bool xGetAnnotLoader(CAnnotationLoader &loader, const string &filename)
void LoadGFF3Fasta(istream &in, TAnnots &annots)
static void GetSeqEntry(CRef< objects::CSeq_entry > &entry, CRef< objects::CSeq_submit > &submit, CRef< CSerialObject > obj)
string m_AnnotTitle
Definition: multireader.hpp:93
list< CRef< CSeq_annot > > TAnnots
Definition: multireader.hpp:43
TAnnots xReadGFF3(CNcbiIstream &instream, bool post_process)
void MergeDescriptors(objects::CSeq_descr &dest, const objects::CSeq_descr &source) const
CMultiReader(CTable2AsnContext &context)
shared_mutex m_Mutex
Definition: multireader.hpp:98
unique_ptr< CObjectIStream > xCreateASNStream(const string &filename) const
void LoadAnnotMap(const string &filename, TAnnotMap &annotMap)
CTable2AsnContext & m_context
Definition: multireader.hpp:94
void AddAnnots(TAnnotMap &annotMap, set< string > &matchedAnnots, CBioseq &bioseq) const
CRef< CSerialObject > ReadNextEntry()
bool x_HasMatch(bool matchVersions, const string &idString, TAnnotMap &annotMap, set< string > &matchedAnnots, list< CRef< CSeq_annot >> &annots) const
static const set< TTypeInfo > kSupportedTypes
Definition: multireader.hpp:46
CFormatGuess::EFormat xAnnotGetFormat(CNcbiIstream &) const
shared_ptr< objects::CGff3LocationMerger > m_gff3_merger
Definition: multireader.hpp:96
CRef< CSerialObject > xReadASN1Binary(CObjectIStream &pObjIstrm, const string &content_type) const
bool AtSeqenceData() const
Definition: multireader.hpp:70
CRef< CSerialObject > FetchEntry(const CFormatGuess::EFormat &format, const string &objectType, unique_ptr< CNcbiIstream > &pIstr, TAnnotMap &annotMap)
bool mAtSequenceData
Definition: multireader.hpp:97
void WriteObject(const CSerialObject &, ostream &)
void LoadDescriptors(const string &ifname, CRef< objects::CSeq_descr > &out_desc) const
bool x_HasExactMatch(const string &idString, TAnnotMap &annotMap, set< string > &matchedAnnots, list< CRef< CSeq_annot >> &annots) const
TAnnots xReadGTF(CNcbiIstream &instream)
void ApplyDescriptors(objects::CSeq_entry &obj, const objects::CSeq_descr &source) const
CRef< CSerialObject > xApplyTemplate(CRef< CSerialObject > obj, bool merge_template_descriptors) const
CRef< objects::CSeq_entry > xReadFasta(CNcbiIstream &instream)
CRef< objects::CSeq_entry > ReadAlignment(CNcbiIstream &instream, const CArgs &args)
void AddAnnotToMap(CRef< CSeq_annot > pAnnot, TAnnotMap &annotMap)
CFormatGuess::EFormat OpenFile(const string &filename, CRef< CSerialObject > &input_sequence, TAnnotMap &annotMap)
CRef< objects::CSeq_entry > xReadFlatfile(CFormatGuess::EFormat format, const string &filename)
void LoadTemplate(const string &ifname)
void x_PostProcessAnnots(TAnnots &annots) const
CFormatGuess::EFormat xInputGetFormat(CNcbiIstream &, CFileContentInfo *=nullptr) const
unique_ptr< CObjectIStream > m_obj_stream
Definition: multireader.hpp:95
CRef< CSerialObject > xReadASN1Text(CObjectIStream &pObjIstrm) const
CObjectIStream –.
Definition: objistr.hpp:93
Definition: Pub.hpp:56
static CRef< CSeq_id > AsSeqId(const string &rawId, long flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based in given customization flags.
Definition: read_util.cpp:89
@ fAllIdsAsLocal
all identifiers are local IDs
Definition: reader_base.hpp:78
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetAnnot(void) const
Definition: Seq_entry.cpp:165
void ResetParentEntry(void)
Definition: Seq_entry.cpp:61
void SetDescr(CSeq_descr &value)
Definition: Seq_entry.cpp:134
TAnnot & SetAnnot(void)
Definition: Seq_entry.cpp:195
void Parentize(void)
Definition: Seq_entry.cpp:71
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
Base class for all serializable objects.
Definition: serialbase.hpp:150
Simple implementation of ILineReader for i(o)streams.
CSubmit_block –.
objects::ILineErrorListener * m_logger
objects::CGapsEditor::TEvidenceSet m_DefaultEvidence
CRef< objects::CSeq_entry > m_entry_template
CRef< objects::CSeq_submit > m_submit_template
void MergeWithTemplate(objects::CSeq_entry &entry) const
objects::CGapsEditor::TCountToEvidenceMap m_GapsizeToEvidence
objects::CBioseq_set::TClass m_ClassValue
void MakeGenomeCenterId(objects::CSeq_entry &entry) const
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
bool IsDBLink() const
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static uch flags
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
#define ParFlat_EMBL_AC
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
Definition: fasta.hpp:112
@ fLetterGaps
Parse runs of Ns when splitting data.
Definition: fasta.hpp:105
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
Definition: fasta.hpp:115
@ fNoUserObjs
Don't save raw deflines in User-objects.
Definition: fasta.hpp:106
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fNoSplit
Don't split out ambiguous sequence regions.
Definition: fasta.hpp:99
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fValidate
Check (alphabetic) residue validity.
Definition: fasta.hpp:100
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
Definition: fasta.hpp:114
@ e_YES
SeqIds compared, but are different.
Definition: Seq_id.hpp:551
void Read(const CObjectInfo &object)
Read object of know type.
Definition: objistr.cpp:952
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
Definition: objectinfo.hpp:762
virtual string ReadFileHeader(void)
Read file header.
Definition: objistr.cpp:1121
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
TData & SetData(void)
Assign a value to Data data member.
TSub & SetSub(void)
Select the variant.
Definition: Pub_.cpp:195
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Annot_id_.cpp:112
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Annot_id_.hpp:351
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
Definition: Seq_annot_.hpp:861
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
Definition: Seq_annot_.hpp:721
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_annot_.hpp:733
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
bool IsFtable(void) const
Check if variant Ftable is selected.
Definition: Seq_annot_.hpp:615
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Annot_id_.hpp:318
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Annot_id_.cpp:134
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
USING_SCOPE(objects)
const CharType(& source)[N]
Definition: pointer.h:1149
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
static char tmp[2048]
Definition: utf8.c:42
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
bool operator()(const CSeq_id *left, const CSeq_id *right) const
Definition: readfeat.cpp:3761
static CRef< CSeq_id > s_GetAnnotId(const CSeq_annot &annot)
static CRef< CSeq_annot > s_GetBioseqAnnot(CBioseq &bioseq)
static void s_RemoveGenBankDbxrefs(list< CRef< CSeq_feat >> &ftable)
void g_ModifySeqIds(CSeq_annot &annot, const CSeq_id &match, CRef< CSeq_id > new_id)
static bool s_HasPrefixMatch(const string &idString, CMultiReader::TAnnotMap &annotMap, map< string, CMultiReader::TAnnotMap::iterator > &matchMap)
static void s_AddAnnotsToBioseq(list< CRef< CSeq_annot >> &annots, CBioseq &bioseq, CRef< CSeq_annot > &pBioseqAnnot)
else result
Definition: token2.c:20
CFileContentInfoGenbank mInfoGenbank
#define ftable
Definition: utilfeat.h:37
Modified on Wed Nov 29 02:16:29 2023 by modify_doxy.py rev. 669887