NCBI C++ ToolKit
multireader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: multireader.cpp 101850 2024-02-22 16:13:34Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig, Sergiy Gotvyanskyy, NCBI
27  *
28  * File Description:
29  * Reader for selected data file formats
30  *
31  * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "fasta_ex.hpp"
37 
40 
46 
52 #include <objects/pub/Pub.hpp>
55 #include <objects/seq/Pubdesc.hpp>
57 #include <objects/seq/Bioseq.hpp>
59 #include <objects/general/Date.hpp>
61 
62 #include <corelib/ncbistre.hpp>
63 
64 #include <serial/iterator.hpp>
65 #include <serial/objistr.hpp>
66 #include <serial/objostr.hpp>
67 #include <serial/objostrasn.hpp>
68 #include <serial/serial.hpp>
69 #include <objects/seq/Annot_id.hpp>
73 
76 
77 #include "multireader.hpp"
78 #include "table2asn_context.hpp"
79 #include "descr_apply.hpp"
80 
82 
83 #include <corelib/stream_utils.hpp>
84 #include <common/ncbi_revision.h>
85 #include "utils.hpp"
86 
87 #ifndef NCBI_SC_VERSION
88 # define FLATFILE_PARSER_ENABLED
89 #elif (NCBI_SC_VERSION == 0)
90 # define FLATFILE_PARSER_ENABLED
91 #endif
92 
93 #ifdef FLATFILE_PARSER_ENABLED
95 #endif
96 
97 #include <common/test_assert.h> /* This header must go last */
98 
99 
102 
103 
105 {
106  CTypeIterator<CSeq_loc> visitor(annot);
107 
108  CSeq_id& id = *new_id;
109  while (visitor) {
110  CSeq_loc& loc = *visitor;
111  if (loc.GetId()->Compare(match) == CSeq_id::e_YES) {
112  loc.SetId(id);
113  }
114  ++visitor;
115  }
116 }
117 
118 
119 namespace
120 {
121 
122  struct SCSeqidCompare {
123  inline bool operator()(const CSeq_id* left, const CSeq_id* right) const
124  {
125  return *left < *right;
126  };
127  };
128 
129 }
130 
131 
133  CBioseq_set::GetTypeInfo(),
134  CBioseq::GetTypeInfo(),
135  CSeq_entry::GetTypeInfo(),
136  CSeq_submit::GetTypeInfo(),
137 };
138 
139 
140 CRef<CSerialObject> CMultiReader::xReadASN1Binary(CObjectIStream& pObjIstrm, const string& content_type) const
141 {
142  if (content_type == "Bioseq-set") {
143  auto obj = Ref(new CSeq_entry);
144  auto& bioseq_set = obj->SetSet();
145  pObjIstrm.Read(ObjectInfo(bioseq_set));
146  return obj;
147  }
148 
149  if (content_type == "Seq-submit") {
150  auto seqsubmit = Ref(new CSeq_submit);
151  pObjIstrm.Read(ObjectInfo(*seqsubmit));
152  return seqsubmit;
153  }
154 
155  if (content_type == "Seq-entry") {
156  auto obj = Ref(new CSeq_entry);
157  pObjIstrm.Read(ObjectInfo(*obj));
158  return obj;
159  }
160 
161  if (content_type == "Bioseq") {
162  auto obj = Ref(new CSeq_entry);
163  pObjIstrm.Read(ObjectInfo(obj->SetSeq()));
164  return obj;
165  };
166 
167  return {};
168 }
169 
171 {
172  CRef<CSeq_entry> entry;
173  CRef<CSeq_submit> submit;
174 
175  // guess object type
176  string sType;
177  try {
178  sType = pObjIstrm.ReadFileHeader();
179  } catch (const CEofException&) {
180  sType.clear();
181  // ignore EOF exception
182  }
183 
184  // do the right thing depending on the input type
185  if (sType == CBioseq_set::GetTypeInfo()->GetName()) {
186  entry.Reset(new CSeq_entry);
187  pObjIstrm.Read(ObjectInfo(entry->SetSet()), CObjectIStream::eNoFileHeader);
188  } else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
189  submit.Reset(new CSeq_submit);
190  pObjIstrm.Read(ObjectInfo(*submit), CObjectIStream::eNoFileHeader);
191 
192  if (submit->GetData().GetEntrys().size() > 1) {
193  entry.Reset(new CSeq_entry);
194  entry->SetSet().SetSeq_set() = submit->GetData().GetEntrys();
195  }
196  else
197  entry = *submit->SetData().SetEntrys().begin();
198  } else if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
199  entry.Reset(new CSeq_entry);
200  pObjIstrm.Read(ObjectInfo(*entry), CObjectIStream::eNoFileHeader);
201  } else if (sType == CSeq_annot::GetTypeInfo()->GetName()) {
202  entry.Reset(new CSeq_entry);
203  do {
204  CRef<CSeq_annot> annot(new CSeq_annot);
205  pObjIstrm.Read(ObjectInfo(*annot), CObjectIStream::eNoFileHeader);
206  entry->SetSeq().SetAnnot().push_back(annot);
207  try {
208  sType = pObjIstrm.ReadFileHeader();
209  } catch (const CEofException&) {
210  sType.clear();
211  // ignore EOF exception
212  }
213  } while (sType == CSeq_annot::GetTypeInfo()->GetName());
214  } else {
215  return CRef<CSerialObject>();
216  }
217 
218  if (m_context.m_gapNmin > 0) {
219  CGapsEditor gap_edit(
225  gap_edit.ConvertNs2Gaps(*entry);
226  }
227 
228  if (submit.Empty())
229  return entry;
230  else
231  return submit;
232 }
233 
234 // ----------------------------------------------------------------------------
237 // ----------------------------------------------------------------------------
238 {
239  CAlnReader reader(instream);
240  reader.SetAllGap(args["aln-gapchar"].AsString());
241  reader.SetMissing(args["aln-gapchar"].AsString());
242  if (args["aln-alphabet"].AsString() == "nuc") {
244  } else {
246  }
247 
248  reader.Read(0, m_context.m_logger);
249  auto pSeqEntry =
250  reader.GetSeqEntry(
253 
254  if (pSeqEntry && args["a"]) {
256  s_StringToClass = {
258  { "s1", CBioseq_set::eClass_pop_set },
259  { "s2", CBioseq_set::eClass_phy_set },
260  { "s3", CBioseq_set::eClass_mut_set },
261  { "s4", CBioseq_set::eClass_eco_set },
263  };
264 
265  auto it = s_StringToClass.find(args["a"].AsString());
266  if (it != s_StringToClass.end()) {
267  pSeqEntry->SetSet().SetClass(it->second);
268  }
269  }
270 
271  return pSeqEntry;
272 }
273 
274 
275 // ----------------------------------------------------------------------------
278 {
279  if (m_context.m_gapNmin > 0) {
282  } else {
284 // | CFastaReader::fLeaveAsText;
285  }
286 
287  if (m_context.m_d_fasta) {
289  }
290 
295 
298 
299 
302 
303  unique_ptr<CFastaReaderEx> pReader(new CFastaReaderEx(m_context, instream, m_iFlags));
304  if (! pReader) {
306  "File format not supported", 0);
307  }
308  if (m_context.m_gapNmin > 0) {
309  pReader->SetMinGaps(m_context.m_gapNmin, m_context.m_gap_Unknown_length);
310  }
311 
312  // if (m_context.m_gap_evidences.size() > 0 || m_context.m_gap_type >= 0)
313  if (! m_context.m_GapsizeToEvidence.empty() ||
314  ! m_context.m_DefaultEvidence.empty() ||
315  m_context.m_gap_type >= 0) {
316  pReader->SetGapLinkageEvidence(
320  }
321 
322  int max_seqs = kMax_Int;
324  if (m_context.m_di_fasta)
325  result = pReader->ReadDeltaFasta(m_context.m_logger);
326  else if (m_context.m_d_fasta)
327  result = pReader->ReadDeltaFasta(m_context.m_logger);
328  else
329  result = pReader->ReadSet(max_seqs, m_context.m_logger);
330 
331  if (result.NotEmpty()) {
333  }
334 
335  if (result->IsSet() && ! m_context.m_HandleAsSet) {
338  "File " + m_context.m_current_file + " contains multiple sequences",
339  *(m_context.m_logger));
340  }
341  if (result->IsSet()) {
342  result->SetSet().SetClass(m_context.m_ClassValue);
343  }
344 
345  return result;
346 }
347 
348 // ----------------------------------------------------------------------------
350 {
351  CFormatGuessEx FG(istr);
353  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eFasta); // we wouldn't take "no" for an answer anyway
355  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eXml);
358 
359  if (! content_info) {
360  return FG.GuessFormat();
361  }
362 
364  return FG.GuessFormatAndContent(*content_info);
365 }
366 
367 // ----------------------------------------------------------------------------
369 {
370  CFormatGuess FG(istr);
375  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eGff2);
376  // RW-1591: Need at least GFF3 or GTF (plain or Augustus) to properly relate
377  // the features
380 #ifdef FLATFILE_PARSER_ENABLED
384 #endif
386 
387  return FG.GuessFormat();
388 }
389 
390 // ----------------------------------------------------------------------------
392  const CSerialObject& object,
393  ostream& ostr)
394 {
396  //<< MSerial_VerifyNo
397  << object;
398  ostr.flush();
399 }
400 
402  m_context(context),
403  mAtSequenceData(false)
404 {
405 }
406 
407 /*
408 void CMultiReader::ApplyAdditionalProperties(CSeq_entry& entry)
409 {
410  switch(entry.Which()) {
411  case CSeq_entry::e_Seq:
412  if (!m_context.m_OrganismName.empty() || m_context.m_taxid != 0) {
413  CBioSource::TOrg& org(CAutoAddDesc(entry.SetDescr(), CSeqdesc::e_Source).Set().SetSource().SetOrg());
414  // we should reset taxid in case new name is different
415  if (org.IsSetTaxname() && org.GetTaxId() >0 && org.GetTaxname() != m_context.m_OrganismName) {
416  org.SetTaxId(0);
417  }
418 
419  if (!m_context.m_OrganismName.empty())
420  org.SetTaxname(m_context.m_OrganismName);
421  if (m_context.m_taxid != 0)
422  org.SetTaxId(m_context.m_taxid);
423  }
424  break;
425 
426  case CSeq_entry::e_Set:
427  {
428  if (!entry.GetSet().IsSetClass())
429  entry.SetSet().SetClass(CBioseq_set::eClass_genbank);
430 
431  NON_CONST_ITERATE(CBioseq_set_Base::TSeq_set, it, entry.SetSet().SetSeq_set())
432  {
433  ApplyAdditionalProperties(**it);
434  }
435  }
436  break;
437  default:
438  break;
439  }
440 }
441 */
442 
443 void CMultiReader::LoadDescriptors(const string& ifname, CRef<CSeq_descr>& out_desc) const
444 {
445  out_desc.Reset(new CSeq_descr);
446 
447  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
448 
449  // guess object type
450  //const string sType = pObjIstrm->ReadFileHeader();
451 
452  // do the right thing depending on the input type
453  while (true) {
454  try {
455  const string sType = pObjIstrm->ReadFileHeader();
456  if (sType == CSeq_descr::GetTypeInfo()->GetName()) {
457  CRef<CSeq_descr> descr(new CSeq_descr);
458  pObjIstrm->Read(ObjectInfo(*descr),
460  out_desc->Set().insert(out_desc->Set().end(), descr->Get().begin(), descr->Get().end());
461  } else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
462  CRef<CSeqdesc> desc(new CSeqdesc);
463  pObjIstrm->Read(ObjectInfo(*desc),
465  out_desc->Set().push_back(desc);
466  } else if (sType == CPubdesc::GetTypeInfo()->GetName()) {
467  CRef<CSeqdesc> desc(new CSeqdesc);
468  pObjIstrm->Read(ObjectInfo(desc->SetPub()),
470  out_desc->Set().push_back(desc);
471  } else {
472  throw runtime_error("Descriptor file must contain "
473  "either Seq_descr or Seqdesc elements");
474  }
475  } catch (CException& ex) {
476  if (! NStr::EqualNocase(ex.GetMsg(), "end of file")) {
477  throw runtime_error("Unable to read descriptor from file:" + ex.GetMsg());
478  }
479  break;
480  }
481  }
482 }
483 
484 void CMultiReader::LoadTemplate(const string& ifname)
485 {
486  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
487 
488  // guess object type
489  string sType = pObjIstrm->ReadFileHeader();
490 
491  // do the right thing depending on the input type
492  if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
495  } else if (sType == CBioseq::GetTypeInfo()->GetName()) {
496  CRef<CBioseq> pBioseq( new CBioseq );
497  pObjIstrm->Read(ObjectInfo(*pBioseq), CObjectIStream::eNoFileHeader);
499  m_context.m_entry_template->SetSeq( *pBioseq );
500  } else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
503  if (! m_context.m_submit_template->GetData().IsEntrys()
504  || m_context.m_submit_template->GetData().GetEntrys().size() != 1) {
505  throw runtime_error("Seq-submit template must contain "
506  "exactly one Seq-entry");
507  }
508  } else if (sType == CSubmit_block::GetTypeInfo()->GetName()) {
509  // a Submit-block
510  CRef<CSubmit_block> submit_block(new CSubmit_block);
511  pObjIstrm->Read(ObjectInfo(*submit_block), CObjectIStream::eNoFileHeader);
512 
513  // Build a Seq-submit containing this plus a bogus Seq-entry
515  m_context.m_submit_template->SetSub(*submit_block);
516  CRef<CSeq_entry> ent(new CSeq_entry);
517  CRef<CSeq_id> dummy_id(new CSeq_id("lcl|dummy_id"));
518  ent->SetSeq().SetId().push_back(dummy_id);
519  ent->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
520  ent->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
521  m_context.m_submit_template->SetData().SetEntrys().push_back(ent);
522  } else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
523  // it's OK
524  } else {
525  NCBI_USER_THROW_FMT("Template must be Seq-entry, Seq-submit, Bioseq or "
526  "Submit-block. Object seems to be of type: " << sType);
527  }
528 
529  // for submit types, pull out the seq-entry inside and remember it
531  m_context.m_entry_template = m_context.m_submit_template->SetData().SetEntrys().front();
532  }
533 
534  // The template may contain a set rather than a seq.
535  // That's OK if it contains only one na entry, which we'll use.
538  for (auto ent_iter: m_context.m_entry_template->GetSet().GetSeq_set()) {
539  const CSeq_descr* descr = nullptr;
540  if (ent_iter->IsSetDescr()) {
541  descr = &ent_iter->GetDescr();
542  }
543  if (descr) {
544  //tmp->Assign(**ent_iter);
545  tmp->SetSeq().SetInst();
546  // Copy any descriptors from the set to the sequence
547  ITERATE(CBioseq_set::TDescr::Tdata, desc_iter, descr->Get())
548  {
549  switch ((*desc_iter)->Which()) {
550  case CSeqdesc::e_Pub:
551  case CSeqdesc::e_Source:
552  break;
553  default:
554  continue;
555  }
556  CRef<CSeqdesc> desc(new CSeqdesc);
557  desc->Assign(**desc_iter);
558  tmp->SetSeq().SetDescr().Set().push_back(desc);
559  }
560  break;
561  }
562  }
563 
564  if (tmp->IsSetDescr() && !tmp->GetDescr().Get().empty())
566  }
567 
568  // incorporate any Seqdesc's that follow in the file
569  if (!pObjIstrm->EndOfData()) {
570  if (sType != CSeqdesc::GetTypeInfo()->GetName())
571  sType = pObjIstrm->ReadFileHeader();
572 
573  while (sType == CSeqdesc::GetTypeInfo()->GetName()) {
574  CRef<CSeqdesc> desc(new CSeqdesc);
575  pObjIstrm->Read(ObjectInfo(*desc), CObjectIStream::eNoFileHeader);
576 
579 
580  {
581  if (desc->IsUser() && desc->GetUser().IsDBLink()) {
582  CUser_object& user_obj = desc->SetUser();
583  if (user_obj.IsDBLink()) {
584  user_obj.SetData();
585  }
586  }
587  }
588 
589  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(desc);
590 
591  if (pObjIstrm->EndOfData())
592  break;
593 
594  try {
595  sType = pObjIstrm->ReadFileHeader();
596  } catch (CEofException&) {
597  break;
598  }
599  }
600  }
601 
602 #if 0
603  if (m_context.m_submit_template->IsEntrys()) {
604  // Take Seq-submit.sub.cit and put it in the Bioseq
605  CRef<CPub> pub(new CPub);
606  pub->SetSub().Assign(context.m_submit_template->GetSub().GetCit());
607  CRef<CSeqdesc> pub_desc(new CSeqdesc);
608  pub_desc->SetPub().SetPub().Set().push_back(pub);
609  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(pub_desc);
610  }
611 #endif
612 
614  throw runtime_error("The Seq-entry must be a Bioseq not a Bioseq-set.");
615  }
616 
618  if (m_context.m_submit_template->IsSetSub() &&
619  m_context.m_submit_template->GetSub().IsSetCit()) {
621  m_context.m_submit_template->SetSub().SetCit().SetDate(*date);
622  }
623  }
624 
625 #if 0
626  if (args["output-type"].AsString() == "Seq-entry") {
627  // force Seq-entry by throwing out the Seq-submit
629  }
630 #endif
631 }
632 
633 namespace
634 {
635  class AllowedDuplicates : public set<CSeqdesc_Base::E_Choice>
636  {
637  public:
638  AllowedDuplicates()
639  {
640  insert(CSeqdesc_Base::e_User);
641  }
642  };
643  AllowedDuplicates m_allowed_duplicates;
644 
645  template <typename _which>
646  struct LocateWhich {
647  typename _which::E_Choice compare_to;
648  bool operator()(_which l) const
649  {
650  return l.Which() == compare_to;
651  }
652  bool operator()(const CRef<_which>& l) const
653  {
654  return l->Which() == compare_to;
655  }
656  };
657 }
658 
660 {
661  ITERATE(CSeq_descr::Tdata, it, source.Get())
662  {
663  MergeDescriptors(dest, **it);
664  }
665 }
666 
667 void CMultiReader::MergeDescriptors(CSeq_descr & dest, const CSeqdesc & source) const
668 {
669  bool duplicates = (m_allowed_duplicates.find(source.Which()) != m_allowed_duplicates.end());
670 
671  CAutoAddDesc desc(dest, source.Which());
672  desc.Set(duplicates).Assign(source);
673 }
674 
676 {
677  MergeDescriptors(entry.SetDescr(), source);
678  //g_ApplyDescriptors(source.Get(), entry);
679 }
680 
681 namespace
682 {
683  void CopyDescr(CSeq_entry& dest, const CSeq_entry& src) {
684  if (src.IsSetDescr() && ! src.GetDescr().Get().empty()) {
685  dest.SetDescr().Set().insert(dest.SetDescr().Set().end(),
686  src.GetDescr().Get().begin(),
687  src.GetDescr().Get().end());
688  }
689  }
690  void CopyAnnot(CSeq_entry& dest, const CSeq_entry& src) {
691  if (src.IsSetAnnot() && ! src.GetAnnot().empty()) {
692  dest.SetAnnot().insert(dest.SetAnnot().end(),
693  src.GetAnnot().begin(),
694  src.GetAnnot().end());
695  }
696  }
697 }
698 
699 void CMultiReader::LoadGFF3Fasta(istream& in, TAnnots& annots)
700 {
704  string("Recognized input file as format: ") + CFormatGuess::GetFormatName(CFormatGuess::eGff3));
705 
706  bool post_process = false;
707  annots = xReadGFF3(in, post_process); // initializes m_gff3_reader!
708  if (! AtSeqenceData()) {
710  "Specified GFF3 file does not include any sequence data", 0);
711  }
712  x_PostProcessAnnots(annots);
713 }
714 
715 
716 void CMultiReader::LoadGFF3Fasta(istream& in, TAnnotMap& annotMap)
717 {
718  TAnnots annots;
719  LoadGFF3Fasta(in, annots);
720  for (auto pAnnot : annots) {
721  AddAnnotToMap(pAnnot, annotMap);
722  }
723 }
724 
725 
728  const string& objectType,
729  unique_ptr<istream>& pIstr,
730  TAnnotMap& annotMap)
731 {
732  CRef<CSerialObject> pInputObject;
733  switch (format) {
736  pInputObject = xReadASN1Binary(*m_obj_stream, objectType);
737  break;
740  pInputObject = xReadASN1Text(*m_obj_stream);
741  break;
742  case CFormatGuess::eGff3:
743  LoadGFF3Fasta(*pIstr, annotMap);
744  case CFormatGuess::eFasta: // What about buffered input?
745  default:
747  pInputObject = xReadFasta(*pIstr);
748  }
749 
750  if (! pInputObject) {
752  "File format not recognized", 0);
753  }
754  // RW-617: apply template descriptors only if input is *not* ASN1:
755  // What about binary ASN.1?
756  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
757  return xApplyTemplate(pInputObject, merge_template_descriptors);
758 }
759 
760 
761 CFormatGuess::EFormat CMultiReader::OpenFile(const string& filename, CRef<CSerialObject>& input_sequence, TAnnotMap& annotMap)
762 {
764  CFileContentInfo content_info;
765  {
766  unique_ptr<istream> istream(new CNcbiIfstream(filename));
767  format = xInputGetFormat(*istream, &content_info);
768  }
769 
770  switch (format)
771  {
774  input_sequence = xReadASN1Binary(*m_obj_stream, content_info.mInfoGenbank.mObjectType);
775  break;
778  input_sequence = xReadASN1Text(*m_obj_stream);
779  break;
780  case CFormatGuess::eGff3:
781  {
782  unique_ptr<istream> in(new CNcbiIfstream(filename));
783  LoadGFF3Fasta(*in, annotMap);
784  m_iFlags = 0;
786  input_sequence = xReadFasta(*in);
787  }
788  break;
789  default: // RW-616 - Assume FASTA
790  {
792  m_iFlags = 0;
794 
795  CBufferedInput istream;
796  istream.get().open(filename);
797  input_sequence = xReadFasta(istream);
798  }
799  break;
800  }
801  if (input_sequence.Empty())
803  "File format not recognized", 0);
804  //rw-617: apply template descriptors only if input is *not* ASN1:
805  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
806  input_sequence = xApplyTemplate(input_sequence, merge_template_descriptors);
807 
808  return format;
809 }
810 
812 {
813  if (obj->GetThisTypeInfo() == CSeq_submit::GetTypeInfo()) {
814  submit.Reset(static_cast<CSeq_submit*>(obj.GetPointer()));
815  entry = submit->SetData().SetEntrys().front();
816  } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
817  entry.Reset(static_cast<CSeq_entry*>(obj.GetPointer()));
818  }
819 }
820 
821 CRef<CSerialObject> CMultiReader::xApplyTemplate(CRef<CSerialObject> obj, bool merge_template_descriptors) const
822 {
823  CRef<CSeq_entry> entry;
824  CRef<CSeq_submit> submit;
825 
826  GetSeqEntry(entry, submit, obj);
827 
828  if (entry.NotEmpty()) // &&
829  {
830  if (submit.Empty())
831  if (entry->IsSet() && entry->GetSet().GetSeq_set().size() < 2 &&
832  entry->GetSet().GetSeq_set().front()->IsSeq())
833  {
834  CRef<CSeq_entry> seq = entry->SetSet().SetSeq_set().front();
835  CopyDescr(*seq, *entry);
836  CopyAnnot(*seq, *entry);
837  entry = seq;
838  }
839  entry->ResetParentEntry();
840  entry->Parentize();
841 
842  if (merge_template_descriptors) {
844  } else {
845  if (m_context.m_t && m_context.m_logger) {
846  string msg(
847  "Template file descriptors are ignored if input is ASN.1");
848  m_context.m_logger->PutError(
849  *unique_ptr<CLineError>(
851  eDiag_Warning, "", 0, "", "", "", msg)));
852  }
853  }
854  }
855 
856  if (submit.Empty())
857  return entry;
858  else
859  return submit;
860 }
861 
863 {
864  if (m_obj_stream)
865  return xReadASN1Text(*m_obj_stream);
866  else
867  return CRef<CSerialObject>();
868 }
869 
871 {
872  int flags = 0;
877 
878  CReaderListener readerListener;
879  CGff3Reader reader(flags, m_AnnotName, m_AnnotTitle, CReadUtil::AsSeqId, &readerListener);
880 
881  CStreamLineReader lr(instream);
882  TAnnots annots;
883 
884  try {
885  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
886  m_gff3_merger = reader.GetLocationMerger();
887  mAtSequenceData = reader.AtSequenceData();
888 
889  if (post_process) {
890  x_PostProcessAnnots(annots);
891  }
892 
893  for (const auto& msg : readerListener) {
894  m_context.m_logger->PutMessage(msg);
895  }
896  }
897  catch (const CReaderMessage& msg) {
898  m_context.m_logger->PutMessage(msg);
899  }
900 
901  return annots;
902 }
903 
904 
906 {
907  for (auto pFeat : ftable) {
908  if (pFeat->IsSetDbxref()) {
909  auto& dbxrefs = pFeat->SetDbxref();
910  auto it = remove_if(dbxrefs.begin(), dbxrefs.end(),
911  [](const CRef<CDbtag>& pDbtag) {
912  return(pDbtag && pDbtag->IsSetDb() &&
913  NStr::EqualNocase(pDbtag->GetDb(), "GenBank"));
914  });
915  dbxrefs.erase(it, dbxrefs.end());
916  if (dbxrefs.empty()) {
917  pFeat->ResetDbxref();
918  }
919  }
920  }
921 }
922 
923 
925 {
926  unsigned int startingLocusTagNumber = 1;
927  unsigned int startingFeatureId = 1;
928  for (auto it = annots.begin(); it != annots.end(); ++it) {
929 
930  auto& annot = **it;
931  auto& data = annot.SetData();
932  if (! data.IsFtable() || data.GetFtable().empty()) {
933  continue; // all that follows applies to feature tables only
934  }
935 
936  s_RemoveGenBankDbxrefs(data.SetFtable()); // RW-1861
937 
938  edit::CFeatTableEdit fte(
939  annot, 0, m_context.m_locus_tag_prefix, startingLocusTagNumber, startingFeatureId, m_context.m_logger);
940  //fte.InferPartials();
941  fte.GenerateMissingParentFeatures(m_context.m_eukaryote, m_gff3_merger.get());
943  if (m_context.m_locus_tag_prefix.empty() && !fte.AnnotHasAllLocusTags()) {
944  NCBI_THROW(CArgException, eNoArg,
945  "GFF annotation requires locus tags, which are missing from one or more genes, so the command line argument -locus-tag-prefix is needed");
946  }
947  fte.GenerateLocusTags();
948  }
949  fte.GenerateProteinAndTranscriptIds();
950  //fte.InstantiateProducts();
951  fte.ProcessCodonRecognized();
952  fte.EliminateBadQualifiers();
953  fte.SubmitFixProducts();
954 
955  startingLocusTagNumber = fte.PendingLocusTagNumber();
956  startingFeatureId = fte.PendingFeatureId();
957  }
958 }
959 
960 
961 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(const string& filename) const
962 {
963  unique_ptr<istream> instream(new CNcbiIfstream(filename));
964  return xCreateASNStream(CFormatGuess::eUnknown, instream);
965 }
966 
967 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(CFormatGuess::EFormat format, unique_ptr<istream>& instream) const
968 {
969  // guess format
970  ESerialDataFormat eSerialDataFormat = eSerial_None;
971  {
973  format = xInputGetFormat(*instream);
974 
975  switch(format) {
977  eSerialDataFormat = eSerial_AsnBinary;
978  break;
981  eSerialDataFormat = eSerial_AsnText;
982  break;
983  case CFormatGuess::eXml:
984  eSerialDataFormat = eSerial_Xml;
985  break;
986  default:
988  "Descriptor file seems to be in an unsupported format: "
990  break;
991  }
992 
993  //instream.seekg(0);
994  }
995 
996  unique_ptr<CObjectIStream> pObjIstrm(
997  CObjectIStream::Open(eSerialDataFormat, *instream.release(), eTakeOwnership));
998 
999  return pObjIstrm;
1000 }
1001 
1003 {
1004 }
1005 
1007 {
1008 public:
1010 
1011  bool Init(const TAnnots& annots)
1012  {
1013  if (annots.empty()) {
1014  return false;
1015  }
1016 
1017  m_Annots = annots;
1018  m_annot_iterator = m_Annots.begin();
1019  return true;
1020  }
1021 
1022  bool Init(const string& seqid_prefix, unique_ptr<istream>& instream, ILineErrorListener* logger)
1023  {
1024  m_seqid_prefix = seqid_prefix;
1026  instream.release();
1027  m_logger = logger;
1028  return true;
1029  }
1030 
1032  {
1033  if (!m_Annots.empty()) {
1034  if (m_annot_iterator != m_Annots.end()) {
1035  return *m_annot_iterator++;
1036  }
1037  } else if (m_line_reader.NotEmpty()) {
1038  while (!m_line_reader->AtEOF()) {
1040  *m_line_reader,
1045  m_logger, nullptr/*filter*/, m_seqid_prefix);
1046 
1047  if (annot.NotEmpty() && annot->IsSetData() && annot->GetData().IsFtable() &&
1048  ! annot->GetData().GetFtable().empty()) {
1049  return annot;
1050  }
1051  }
1052  }
1053  return CRef<CSeq_annot>();
1054  }
1055 
1056 private:
1058  TAnnots::iterator m_annot_iterator;
1062 };
1063 
1064 bool CMultiReader::xGetAnnotLoader(CAnnotationLoader& loader, const string& filename)
1065 {
1066  unique_ptr<istream> in(new CNcbiIfstream(filename));
1067 
1069 
1070  if (uFormat == CFormatGuess::eUnknown) {
1071  string ext;
1072  CDirEntry::SplitPath(filename, nullptr, nullptr, &ext);
1073  NStr::ToLower(ext);
1074  if (ext == ".gff" || ext == ".gff3")
1075  uFormat = CFormatGuess::eGff3;
1076  else if (ext == ".gtf")
1077  uFormat = CFormatGuess::eGtf;
1078  else if (ext == ".tbl")
1080  else if (ext == ".asn" || ext == ".sqn" || ext == ".sap")
1081  uFormat = CFormatGuess::eTextASN;
1082 
1083  if (uFormat != CFormatGuess::eUnknown) {
1087  string("Presuming annotation format by filename suffix: ") + CFormatGuess::GetFormatName(uFormat));
1088  }
1089  } else {
1093  string("Recognized annotation format: ") + CFormatGuess::GetFormatName(uFormat));
1094  }
1095 
1096  TAnnots annots;
1097  switch (uFormat) {
1099  string seqid_prefix;
1100  if (!m_context.m_genome_center_id.empty())
1101  seqid_prefix = "gnl|" + m_context.m_genome_center_id + "|";
1102  return loader.Init(seqid_prefix, in, m_context.m_logger);
1103  } break;
1104  case CFormatGuess::eTextASN: {
1105  auto obj_stream = xCreateASNStream(uFormat, in);
1106  CRef<CSerialObject> obj = xReadASN1Text(*obj_stream);
1108  CRef<CSeq_entry> pEntry;
1109  GetSeqEntry(pEntry, unused, obj);
1110  if (pEntry && pEntry->IsSetAnnot()) {
1111  annots = pEntry->GetAnnot();
1112  }
1113  } break;
1114  case CFormatGuess::eGff3:
1115  annots = xReadGFF3(*in, true);
1116  break;
1117  case CFormatGuess::eGtf:
1119  annots = xReadGTF(*in);
1120  break;
1121 #ifdef FLATFILE_PARSER_ENABLED
1125  in.reset();
1126  auto pEntry = xReadFlatfile(uFormat, filename);
1127  if (pEntry && pEntry->IsSetAnnot()) {
1128  annots = pEntry->GetAnnot();
1129  }
1130  } break;
1131 #endif
1132 
1133  default:
1135  "Annotation file format not recognized. Run format validator on your annotation file", 1);
1136  }
1137 
1138  if (! annots.empty()) {
1139  loader.Init(annots);
1140  return true;
1141  }
1142  return false;
1143 }
1144 
1145 
1147 {
1148  CRef<CSeq_id> pAnnotId;
1149  if (annot.IsSetId()) {
1150  pAnnotId.Reset(new CSeq_id());
1151  const CAnnot_id& firstId = *(annot.GetId().front());
1152  if (firstId.IsLocal()) {
1153  pAnnotId->SetLocal().Assign(firstId.GetLocal());
1154  } else if (firstId.IsGeneral()) {
1155  pAnnotId->SetGeneral().Assign(firstId.GetGeneral());
1156  } else {
1157  return pAnnotId;
1158  }
1159  } else if (! annot.GetData().GetFtable().empty()) {
1160  // get a reference to CSeq_id instance, we'd need to update it recently
1161  // 5 column feature reader has a single shared instance for all features
1162  // update one at once would change all the features
1163  pAnnotId.Reset(const_cast<CSeq_id*>(annot.GetData().GetFtable().front()->GetLocation().GetId()));
1164  }
1165 
1166  return pAnnotId;
1167 }
1168 
1169 
1171 {
1172  auto pAnnotId = s_GetAnnotId(*pAnnot);
1173  if (! pAnnotId) {
1174  return;
1175  }
1176 
1177  auto idString = pAnnotId->GetSeqIdString();
1178  NStr::ToLower(idString);
1179  auto it = annotMap.find(idString);
1180  if (it == annotMap.end()) {
1181  annotMap.emplace(idString, list<CRef<CSeq_annot>>{ pAnnot });
1182  } else {
1183  it->second.push_back(pAnnot);
1184  }
1185 }
1186 
1187 
1188 void CMultiReader::LoadAnnotMap(const string& filename, TAnnotMap& annotMap)
1189 {
1190  CAnnotationLoader annot_loader;
1191  if (! xGetAnnotLoader(annot_loader, filename)) {
1192  return;
1193  }
1194 
1195  CRef<CSeq_annot> pAnnot;
1196  while ((pAnnot = annot_loader.GetNextAnnot()).NotEmpty()) {
1197  AddAnnotToMap(pAnnot, annotMap);
1198  }
1199 }
1200 
1201 
1203 {
1204  CRef<CSeq_annot> pBioseqAnnot;
1205 
1206  if (bioseq.IsSetAnnot()) {
1207  auto& bioseqAnnots = bioseq.SetAnnot();
1208  auto it = find_if(bioseqAnnots.begin(),
1209  bioseqAnnots.end(),
1210  [](CRef<CSeq_annot> pAnnot) {
1211  return (pAnnot && pAnnot->IsFtable());
1212  });
1213  if (it != bioseqAnnots.end()) {
1214  pBioseqAnnot = *it;
1215  }
1216  }
1217  return pBioseqAnnot;
1218 }
1219 
1221  list<CRef<CSeq_annot>>& annots,
1222  CBioseq& bioseq,
1223  CRef<CSeq_annot>& pBioseqAnnot)
1224 {
1225  if (pBioseqAnnot) {
1226  for (auto pAnnot : annots) {
1227  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1228  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
1229  }
1230  return;
1231  }
1232 
1233  pBioseqAnnot = s_GetBioseqAnnot(bioseq);
1234 
1235  if (! pBioseqAnnot) {
1236  pBioseqAnnot = annots.front();
1237  bioseq.SetAnnot().push_back(pBioseqAnnot);
1238  auto it = next(annots.begin());
1239  while (it != annots.end()) {
1240  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1241  featEdit.MergeFeatures((*it)->SetData().SetFtable());
1242  ++it;
1243  }
1244  } else {
1245  for (auto pAnnot : annots) {
1246  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
1247  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
1248  }
1249  }
1250 }
1251 
// Collects every entry of `annotMap` whose key starts with `idString`.
//
// idString  prefix to look for.
// annotMap  map that is searched; it is not modified here.
// matchMap  output (its parameter line is not present in this listing):
//           cleared first, then filled with key -> iterator-into-annotMap for
//           each matching entry.
//
// Returns true when at least one entry matched.
1252 static bool s_HasPrefixMatch(
1253  const string& idString,
1254  CMultiReader::TAnnotMap& annotMap,
1256 {
1257  matchMap.clear();
// Start at the first key not less than the prefix and walk forward while the
// prefix still matches.
// NOTE(review): assumes TAnnotMap orders its keys so that all keys sharing the
// prefix are contiguous from lower_bound() onwards - confirm the comparator.
1258  auto it = annotMap.lower_bound(idString);
1259  while (it != annotMap.end() && NStr::StartsWith(it->first, idString)) {
1260  matchMap.emplace(it->first, it);
1261  ++it;
1262  }
1263  return ! matchMap.empty();
1264 }
1265 
// CMultiReader::x_HasMatch(matchVersions, idString, annotMap, matchedAnnots, annots)
// (the first line of the signature and the declaration of the local
// `matchMap` are not present in this listing)
//
// Moves the annots that match `idString` out of `annotMap` into `annots`.
//
// matchVersions  when true, the work is delegated to x_HasExactMatch();
//                otherwise every key that starts with idString is a candidate
//                (see s_HasPrefixMatch()).
// matchedAnnots  ids that have already been claimed; a candidate is taken only
//                if its id can be newly inserted here.
// annots         receives the annot lists of the newly claimed entries
//                (spliced onto its end).
//
// Returns true when at least one entry was newly claimed.
1267  bool matchVersions,
1268  const string& idString,
1269  CMultiReader::TAnnotMap& annotMap,
1270  set<string>& matchedAnnots,
1271  list<CRef<CSeq_annot>>& annots) const
1272 {
1273  if (matchVersions) {
1274  return x_HasExactMatch(idString, annotMap, matchedAnnots, annots);
1275  }
1276 
1277  bool hasMatch = false;
// Phase 1: look up the candidates while holding m_Mutex in shared mode.
1279  shared_lock<shared_mutex> sLock{m_Mutex};
1280  if (! s_HasPrefixMatch(idString, annotMap, matchMap)) {
1281  return false;
1282  }
1283  sLock.unlock();
// Phase 2: re-acquire m_Mutex exclusively before changing matchedAnnots and
// annotMap.
// NOTE(review): the iterators stored in matchMap are used after the shared
// lock was dropped. This looks safe only if every erase from annotMap is
// preceded by a successful insert of the same id into the same matchedAnnots
// set under this mutex - confirm against all callers.
1284  {
1285  unique_lock<shared_mutex> uLock{ m_Mutex };
1286  for (auto match : matchMap) {
1287  const auto& annotId = match.first;
1288  auto it = match.second;
// insert().second is false when the id was already claimed; such entries are
// skipped without touching the iterator.
1289  if (matchedAnnots.insert(annotId).second) {
1290  hasMatch = true;
1291  annots.splice(annots.end(), it->second);
1292  annotMap.erase(it);
1293  }
1294  }
1295  }
1296 
1297  return hasMatch;
1298 }
1299 
// CMultiReader::x_HasExactMatch(idString, annotMap, matchedAnnots, annots)
// (the first line of the signature is not present in this listing)
//
// Looks up the entry of `annotMap` whose key equals `idString` and, if its id
// has not been claimed yet, moves its annot list into `annots` and erases the
// entry from the map.
//
// matchedAnnots  ids that have already been claimed; updated on success.
// annots         overwritten (move-assigned) with the entry's annot list.
//
// Returns true only when the entry exists and was newly claimed by this call.
1301  const string& idString,
1302  CMultiReader::TAnnotMap& annotMap,
1303  set<string>& matchedAnnots,
1304  list<CRef<CSeq_annot>>& annots) const
1305 {
// Phase 1: lookup while holding m_Mutex in shared mode.
1306  shared_lock<shared_mutex> sLock{ m_Mutex };
1307  auto it = annotMap.find(idString);
1308  if (it == annotMap.end()) {
1309  return false;
1310  }
// Copy the key while the shared lock is still held.
1311  string annotId = it->first;
1312  sLock.unlock();
1313 
// Phase 2: take m_Mutex exclusively for the modification.
// NOTE(review): `it` is dereferenced only after a successful insert into
// matchedAnnots; this presumably guarantees that no other caller erased the
// entry in the meantime - confirm that all callers share one matchedAnnots.
1314  {
1315  unique_lock<shared_mutex> uLock{m_Mutex};
1316  if (matchedAnnots.insert(annotId).second) {
1317  annots = move(it->second);
1318  annotMap.erase(it);
1319  return true;
1320  }
1321  }
1322 
1323  return false;
1324 }
1325 
1326 
// CMultiReader::AddAnnots(annotMap, matchedAnnots, bioseq)
// (the first line of the signature is not present in this listing)
//
// For each Seq-id of `bioseq`, takes the matching annots out of `annotMap`,
// rewrites their ids to that Seq-id and attaches them to the bioseq.
//
// annotMap       source of annots; matched entries are removed from it by
//                x_HasMatch().
// matchedAnnots  ids already claimed; passed through to x_HasMatch().
// bioseq         bioseq whose ids are matched and which receives the annots.
1328  set<string>& matchedAnnots,
1329  CBioseq& bioseq) const
1330 {
// Shared across all ids of this bioseq so that s_AddAnnotsToBioseq() keeps
// merging into the same annot.
1331  CRef<CSeq_annot> pBioseqAnnot;
1332  for (auto pSeqId : bioseq.GetId()) {
1333  list<CRef<CSeq_annot>> annots;
1334  bool hasMatch = false;
// Ids without a Textseq-id part are matched exactly; the others go through the
// prefix match in x_HasMatch().
// NOTE(review): presumably the prefix match lets an accession match regardless
// of its version suffix - confirm.
1335  bool matchVersions = (pSeqId->GetTextseq_Id() == nullptr);
1336  auto idString = pSeqId->GetSeqIdString();
// The id string is lower-cased before the lookup.
1337  NStr::ToLower(idString);
1338  hasMatch = x_HasMatch(matchVersions, idString, annotMap, matchedAnnots, annots);
1339 
// Fallback: for a general id whose db equals the configured genome center id
// and whose tag is a string, retry with an exact match on the lower-cased tag.
1340  if (! hasMatch &&
1341  pSeqId->IsGeneral() &&
1342  pSeqId->GetGeneral().IsSetDb() &&
1343  (pSeqId->GetGeneral().GetDb() == m_context.m_genome_center_id) &&
1344  pSeqId->GetGeneral().IsSetTag() && pSeqId->GetGeneral().GetTag().IsStr()) {
1345  matchVersions = true;
1346  idString = pSeqId->GetGeneral().GetTag().GetStr();
1347  NStr::ToLower(idString);
1348  hasMatch = x_HasMatch(matchVersions, idString, annotMap, matchedAnnots, annots);
1349  }
1350 
1351  if (! hasMatch) {
1352  continue;
1353  }
1354 
// Replace the id obtained from each annot with this bioseq id.
// NOTE(review): pAnnotId is dereferenced without a check; this assumes
// s_GetAnnotId() never returns an empty reference here - confirm.
1355  for (auto pAnnot : annots) {
1356  auto pAnnotId = s_GetAnnotId(*pAnnot);
1357  g_ModifySeqIds(*pAnnot, *pAnnotId, pSeqId);
1358  }
1359 
1360  s_AddAnnotsToBioseq(annots, bioseq, pBioseqAnnot);
1361  }
1362 }
1363 
1364 
// CMultiReader::xReadGTF(CNcbiIstream& instream)
// (the signature line, the lines that fill in `flags`, and the construction of
// `reader` are not present in this listing)
//
// Reads Seq-annots from `instream` and returns them after post-processing.
// A CReaderMessage thrown during reading or post-processing is not propagated;
// it is passed to m_context.m_logger and whatever `annots` holds at that point
// is returned.
1366 {
1367  int flags = 0;
1370 
// Wrap the input stream in a line reader for ReadSeqAnnots().
1373  CStreamLineReader lr(instream);
1374  TAnnots annots;
1375  try {
1376  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
1377  x_PostProcessAnnots(annots);
1378  } catch (CReaderMessage& msg) {
// Report the problem through the logger instead of rethrowing.
1379  m_context.m_logger->PutMessage(msg);
1380  }
1381 
1382  return annots;
1383 }
1384 
// Compiled only when FLATFILE_PARSER_ENABLED is defined.
1385 #ifdef FLATFILE_PARSER_ENABLED
// CMultiReader::xReadFlatfile(format, filename)
// (the signature line, the case labels, the throw statement of the default
// branch and the construction of `ffparser` are not present in this listing)
//
// Parses the flat file `filename` and returns a Seq-entry that carries only
// the annots found in the parsed records. Returns an empty reference when
// nothing was parsed, the result is not a CBioseq_set, or no annots were
// found.
1387 {
1388  unique_ptr<Parser> pp(new Parser);
// Configure the parser for the requested format.
1389  switch (format)
1390  {
// GenBank settings.
1392  pp->format = Parser::EFormat::GenBank;
1393  pp->source = Parser::ESource::GenBank;
1394  pp->seqtype = CSeq_id::e_Genbank;
1395  break;
// EMBL settings; this is the only branch that also sets an accession prefix.
1397  pp->format = Parser::EFormat::EMBL;
1398  pp->source = Parser::ESource::EMBL;
1399  pp->acprefix = ParFlat_EMBL_AC;
1400  pp->seqtype = CSeq_id::e_Embl;
1401  break;
// SPROT settings.
1403  pp->format = Parser::EFormat::SPROT;
1404  pp->source = Parser::ESource::SPROT;
1405  pp->seqtype = CSeq_id::e_Swissprot;
1406  break;
// Any other format is reported as unsupported.
1407  default:
1409  "This flat file format is not supported: " + filename, 0);
1410  break;
1411  }
1412 /*
1413 #ifdef WIN32
1414  pp->ifp = fopen(filename.c_str(), "rb");
1415 #else
1416  pp->ifp = fopen(filename.c_str(), "r");
1417 #endif
1418 */
1419  pp->output_format = Parser::EOutput::BioseqSet;
1420 
1422  auto obj = ffparser.Parse(*pp, filename);
1423  if (obj.NotEmpty()) {
// Only a CBioseq_set result is handled; any other type falls through to the
// empty return.
1424  if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
1425  auto bioseq_set = Ref(CTypeConverter<CBioseq_set>::SafeCast(obj.GetPointerOrNull()));
1426  auto entry = Ref(new CSeq_entry);
1427  entry->SetSeq();
1428  auto& annot = entry->SetAnnot();
// Move (splice) the annots of every member of the set into the new entry; the
// members themselves are not kept.
1429  for (auto& bioseq : bioseq_set->SetSeq_set()) {
1430  if (bioseq->IsSetAnnot())
1431  annot.splice(annot.end(), bioseq->SetAnnot());
1432  }
1433  if (entry->IsSetAnnot())
1434  return entry;
1435  }
1436  }
1437  return {};
1438 }
1439 #endif
1440 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
Definition: utils.cpp:41
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetAlphabet(const string &value)
Definition: aln_reader.hpp:371
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:722
void SetMissing(const string &value)
Definition: aln_reader.hpp:192
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Definition: aln_reader.hpp:433
CAnnot_id –.
Definition: Annot_id.hpp:66
TAnnots::iterator m_annot_iterator
CMultiReader::TAnnots TAnnots
ILineErrorListener * m_logger
CRef< CSeq_annot > GetNextAnnot()
CRef< ILineReader > m_line_reader
bool Init(const string &seqid_prefix, unique_ptr< istream > &instream, ILineErrorListener *logger)
bool Init(const TAnnots &annots)
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
_Stream & get()
Definition: utils.hpp:54
Definition: Date.hpp:53
@ ePrecision_day
Definition: Date.hpp:58
Modification of the CFastaReader class that allows for reading a single sequence as a degenerate mult...
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
Definition: readfeat.hpp:75
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
Definition: readfeat.hpp:74
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
Definition: readfeat.hpp:71
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
Definition: readfeat.hpp:76
CRef< CSeq_annot > ReadSequinFeatureTable(const TFlags flags=0, ITableFilter *filter=nullptr, const string &seqid_prefix=kEmptyStr)
Definition: readfeat.cpp:3715
CRef< CSerialObject > Parse(Parser &parseInfo)
Definition: ftamain.cpp:721
Wraps CFormatGuess, and if CFormatGuess's result is Unknown, it tries every file reader until one wor...
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
CFormatGuess::CFormatHints & GetFormatHints(void)
Get format hints.
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
CFormatGuess::EFormat GuessFormat()
CFormatHints & AddPreferredFormat(TFormat fmt)
Mark the format as preferred.
CFormatHints & DisableAllNonpreferred(void)
Disable all formats not marked as preferred.
Class implements different ad-hoc unreliable file format identifications.
CFormatHints & GetFormatHints(void)
Get format hints.
EFormat
The formats are checked in the same order as declared here.
@ eFiveColFeatureTable
Five-column feature table.
@ eBinaryASN
Binary ASN.1.
@ eGtf
New GTF, CGtfReader.
@ eGff3
GFF3, CGff3Reader.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eGffAugustus
GFFish output of Augustus Gene Prediction.
@ eTextASN
Text ASN.1.
EFormat GuessFormat(EMode)
static const char * GetFormatName(EFormat format)
void ConvertNs2Gaps(CSeq_entry &entry)
Definition: gaps_edit.cpp:403
bool AtSequenceData() const
void ReadSeqAnnots(TAnnotList &, CNcbiIstream &, ILineErrorListener *=nullptr) override
Read all objects from the given input stream, returning them as a vector of Seq-annots.
shared_ptr< CGff3LocationMerger > GetLocationMerger()
@ fGenerateChildXrefs
Definition: gtf_reader.hpp:218
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:42
string m_AnnotName
Definition: multireader.hpp:92
bool xGetAnnotLoader(CAnnotationLoader &loader, const string &filename)
void LoadGFF3Fasta(istream &in, TAnnots &annots)
static void GetSeqEntry(CRef< objects::CSeq_entry > &entry, CRef< objects::CSeq_submit > &submit, CRef< CSerialObject > obj)
string m_AnnotTitle
Definition: multireader.hpp:93
list< CRef< CSeq_annot > > TAnnots
Definition: multireader.hpp:43
TAnnots xReadGFF3(CNcbiIstream &instream, bool post_process)
void MergeDescriptors(objects::CSeq_descr &dest, const objects::CSeq_descr &source) const
CMultiReader(CTable2AsnContext &context)
shared_mutex m_Mutex
Definition: multireader.hpp:98
unique_ptr< CObjectIStream > xCreateASNStream(const string &filename) const
void LoadAnnotMap(const string &filename, TAnnotMap &annotMap)
CTable2AsnContext & m_context
Definition: multireader.hpp:94
void AddAnnots(TAnnotMap &annotMap, set< string > &matchedAnnots, CBioseq &bioseq) const
CRef< CSerialObject > ReadNextEntry()
bool x_HasMatch(bool matchVersions, const string &idString, TAnnotMap &annotMap, set< string > &matchedAnnots, list< CRef< CSeq_annot >> &annots) const
static const set< TTypeInfo > kSupportedTypes
Definition: multireader.hpp:46
CFormatGuess::EFormat xAnnotGetFormat(CNcbiIstream &) const
shared_ptr< objects::CGff3LocationMerger > m_gff3_merger
Definition: multireader.hpp:96
CRef< CSerialObject > xReadASN1Binary(CObjectIStream &pObjIstrm, const string &content_type) const
bool AtSeqenceData() const
Definition: multireader.hpp:70
CRef< CSerialObject > FetchEntry(const CFormatGuess::EFormat &format, const string &objectType, unique_ptr< CNcbiIstream > &pIstr, TAnnotMap &annotMap)
bool mAtSequenceData
Definition: multireader.hpp:97
void WriteObject(const CSerialObject &, ostream &)
void LoadDescriptors(const string &ifname, CRef< objects::CSeq_descr > &out_desc) const
bool x_HasExactMatch(const string &idString, TAnnotMap &annotMap, set< string > &matchedAnnots, list< CRef< CSeq_annot >> &annots) const
TAnnots xReadGTF(CNcbiIstream &instream)
void ApplyDescriptors(objects::CSeq_entry &obj, const objects::CSeq_descr &source) const
CRef< CSerialObject > xApplyTemplate(CRef< CSerialObject > obj, bool merge_template_descriptors) const
CRef< objects::CSeq_entry > xReadFasta(CNcbiIstream &instream)
CRef< objects::CSeq_entry > ReadAlignment(CNcbiIstream &instream, const CArgs &args)
void AddAnnotToMap(CRef< CSeq_annot > pAnnot, TAnnotMap &annotMap)
CFormatGuess::EFormat OpenFile(const string &filename, CRef< CSerialObject > &input_sequence, TAnnotMap &annotMap)
CRef< objects::CSeq_entry > xReadFlatfile(CFormatGuess::EFormat format, const string &filename)
void LoadTemplate(const string &ifname)
void x_PostProcessAnnots(TAnnots &annots) const
CFormatGuess::EFormat xInputGetFormat(CNcbiIstream &, CFileContentInfo *=nullptr) const
unique_ptr< CObjectIStream > m_obj_stream
Definition: multireader.hpp:95
CRef< CSerialObject > xReadASN1Text(CObjectIStream &pObjIstrm) const
CObjectIStream –.
Definition: objistr.hpp:93
Definition: Pub.hpp:56
static CRef< CSeq_id > AsSeqId(const string &rawId, long flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based in given customization flags.
Definition: read_util.cpp:89
@ fAllIdsAsLocal
all identifiers are local IDs
Definition: reader_base.hpp:78
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetAnnot(void) const
Definition: Seq_entry.cpp:165
void ResetParentEntry(void)
Definition: Seq_entry.cpp:61
void SetDescr(CSeq_descr &value)
Definition: Seq_entry.cpp:134
TAnnot & SetAnnot(void)
Definition: Seq_entry.cpp:195
void Parentize(void)
Definition: Seq_entry.cpp:71
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
Base class for all serializable objects.
Definition: serialbase.hpp:150
Simple implementation of ILineReader for i(o)streams.
CSubmit_block –.
objects::ILineErrorListener * m_logger
objects::CGapsEditor::TEvidenceSet m_DefaultEvidence
CRef< objects::CSeq_entry > m_entry_template
CRef< objects::CSeq_submit > m_submit_template
void MergeWithTemplate(objects::CSeq_entry &entry) const
objects::CGapsEditor::TCountToEvidenceMap m_GapsizeToEvidence
objects::CBioseq_set::TClass m_ClassValue
void MakeGenomeCenterId(objects::CSeq_entry &entry) const
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
bool IsDBLink() const
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static uch flags
#define ParFlat_EMBL_AC
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
Definition: fasta.hpp:112
@ fLetterGaps
Parse runs of Ns when splitting data.
Definition: fasta.hpp:105
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
Definition: fasta.hpp:115
@ fNoUserObjs
Don't save raw deflines in User-objects.
Definition: fasta.hpp:106
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fNoSplit
Don't split out ambiguous sequence regions.
Definition: fasta.hpp:99
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fValidate
Check (alphabetic) residue validity.
Definition: fasta.hpp:100
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
Definition: fasta.hpp:114
@ e_YES
SeqIds compared, but are different.
Definition: Seq_id.hpp:583
void Read(const CObjectInfo &object)
Read object of known type.
Definition: objistr.cpp:952
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
Definition: objectinfo.hpp:762
virtual string ReadFileHeader(void)
Read file header.
Definition: objistr.cpp:1121
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
TData & SetData(void)
Assign a value to Data data member.
TSub & SetSub(void)
Select the variant.
Definition: Pub_.cpp:195
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Annot_id_.cpp:112
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Annot_id_.hpp:351
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
Definition: Seq_annot_.hpp:861
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
Definition: Seq_annot_.hpp:721
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_annot_.hpp:733
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
bool IsFtable(void) const
Check if variant Ftable is selected.
Definition: Seq_annot_.hpp:615
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Annot_id_.hpp:318
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Annot_id_.cpp:134
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
static const CS_INT unused
Definition: long_binary.c:20
USING_SCOPE(objects)
const CharType(& source)[N]
Definition: pointer.h:1149
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
bool operator()(const CSeq_id *left, const CSeq_id *right) const
Definition: readfeat.cpp:3762
static CRef< CSeq_id > s_GetAnnotId(const CSeq_annot &annot)
static CRef< CSeq_annot > s_GetBioseqAnnot(CBioseq &bioseq)
static void s_RemoveGenBankDbxrefs(list< CRef< CSeq_feat >> &ftable)
void g_ModifySeqIds(CSeq_annot &annot, const CSeq_id &match, CRef< CSeq_id > new_id)
static bool s_HasPrefixMatch(const string &idString, CMultiReader::TAnnotMap &annotMap, map< string, CMultiReader::TAnnotMap::iterator > &matchMap)
static void s_AddAnnotsToBioseq(list< CRef< CSeq_annot >> &annots, CBioseq &bioseq, CRef< CSeq_annot > &pBioseqAnnot)
void g_LogDiagMessage(ILineErrorListener *logger, EDiagSev sev, const string &msg)
Definition: table2asn.cpp:188
else result
Definition: token2.c:20
CFileContentInfoGenbank mInfoGenbank
#define ftable
Definition: utilfeat.h:37
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Thu Apr 25 08:17:35 2024 by modify_doxy.py rev. 669887