NCBI C++ ToolKit
multireader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: multireader.cpp 103021 2024-08-22 18:38:05Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig, Sergiy Gotvyanskyy, NCBI
27  *
28  * File Description:
29  * Reader for selected data file formats
30  *
31  * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "fasta_ex.hpp"
37 
40 
46 
52 #include <objects/pub/Pub.hpp>
55 #include <objects/seq/Pubdesc.hpp>
57 #include <objects/seq/Bioseq.hpp>
59 #include <objects/general/Date.hpp>
61 
62 #include <corelib/ncbistre.hpp>
63 
64 #include <serial/iterator.hpp>
65 #include <serial/objistr.hpp>
66 #include <serial/objostr.hpp>
67 #include <serial/objostrasn.hpp>
68 #include <serial/serial.hpp>
69 #include <objects/seq/Annot_id.hpp>
73 
76 
77 #include "multireader.hpp"
78 #include "table2asn_context.hpp"
79 #include "descr_apply.hpp"
80 #include "annot_match.hpp"
81 
83 
84 #include <corelib/stream_utils.hpp>
85 #include <common/ncbi_revision.h>
86 #include "utils.hpp"
87 
88 #ifndef NCBI_SC_VERSION
89 # define FLATFILE_PARSER_ENABLED
90 #elif (NCBI_SC_VERSION == 0)
91 # define FLATFILE_PARSER_ENABLED
92 #endif
93 
94 #ifdef FLATFILE_PARSER_ENABLED
96 #endif
97 
98 #include <common/test_assert.h> /* This header must go last */
99 
100 
103 
104 
105 namespace
106 {
107 
108  void s_ModifySeqIds(CSeq_annot& annot, const CSeq_id& match, CRef<CSeq_id> new_id)
109  {
110  CTypeIterator<CSeq_loc> visitor(annot);
111 
112  CSeq_id& id = *new_id;
113  while (visitor)
114  {
115  CSeq_loc& loc = *visitor;
116 
117  if (loc.GetId()->Compare(match) == CSeq_id::e_YES)
118  {
119  loc.SetId(id);
120  }
121  ++visitor;
122  }
123  }
124 
125  CRef<CSeq_annot> s_GetBioseqAnnot(CBioseq& bioseq)
126  {
127 
128  CRef<CSeq_annot> pBioseqAnnot;
129 
130  if (bioseq.IsSetAnnot()) {
131  auto& bioseqAnnots = bioseq.SetAnnot();
132  auto it = find_if(bioseqAnnots.begin(),
133  bioseqAnnots.end(),
134  [](CRef<CSeq_annot> pAnnot)
135  {
136  return (pAnnot && pAnnot->IsFtable());
137  });
138  if (it != bioseqAnnots.end()) {
139  pBioseqAnnot = *it;
140  }
141  }
142  return pBioseqAnnot;
143  }
144 
145  void s_AddAnnotsToBioseq(
146  list<CRef<CSeq_annot>>& annots,
147  CBioseq& bioseq,
148  CRef<CSeq_annot>& pBioseqAnnot)
149  {
150  if (pBioseqAnnot) {
151  for (auto pAnnot : annots) {
152  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
153  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
154  }
155  return;
156  }
157 
158  pBioseqAnnot = s_GetBioseqAnnot(bioseq);
159 
160  if (!pBioseqAnnot) {
161  pBioseqAnnot = annots.front();
162  bioseq.SetAnnot().push_back(pBioseqAnnot);
163  auto it = next(annots.begin());
164  while (it != annots.end()) {
165  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
166  featEdit.MergeFeatures((*it)->SetData().SetFtable());
167  ++it;
168  }
169  }
170  else {
171  for (auto pAnnot : annots) {
172  objects::edit::CFeatTableEdit featEdit(*pBioseqAnnot);
173  featEdit.MergeFeatures(pAnnot->SetData().SetFtable());
174  }
175  }
176  }
177 }
178 
179 
181  CBioseq_set::GetTypeInfo(),
182  CBioseq::GetTypeInfo(),
183  CSeq_entry::GetTypeInfo(),
184  CSeq_submit::GetTypeInfo(),
185  CSeq_annot::GetTypeInfo(),
186 };
187 
188 
189 CRef<CSerialObject> CMultiReader::xReadASN1Binary(CObjectIStream& pObjIstrm, const string& content_type) const
190 {
191  if (content_type == "Bioseq-set") {
192  auto obj = Ref(new CSeq_entry);
193  auto& bioseq_set = obj->SetSet();
194  pObjIstrm.Read(ObjectInfo(bioseq_set));
195  return obj;
196  }
197 
198  if (content_type == "Seq-submit") {
199  auto seqsubmit = Ref(new CSeq_submit);
200  pObjIstrm.Read(ObjectInfo(*seqsubmit));
201  return seqsubmit;
202  }
203 
204  if (content_type == "Seq-entry") {
205  auto obj = Ref(new CSeq_entry);
206  pObjIstrm.Read(ObjectInfo(*obj));
207  return obj;
208  }
209 
210  if (content_type == "Bioseq") {
211  auto obj = Ref(new CSeq_entry);
212  pObjIstrm.Read(ObjectInfo(obj->SetSeq()));
213  return obj;
214  };
215 
216  return {};
217 }
218 
220 {
221  CRef<CSeq_entry> entry;
222  CRef<CSeq_submit> submit;
223 
224  // guess object type
225  string sType;
226  try {
227  sType = pObjIstrm.ReadFileHeader();
228  } catch (const CEofException&) {
229  sType.clear();
230  // ignore EOF exception
231  }
232 
233  // do the right thing depending on the input type
234  if (sType == CBioseq_set::GetTypeInfo()->GetName()) {
235  entry.Reset(new CSeq_entry);
236  pObjIstrm.Read(ObjectInfo(entry->SetSet()), CObjectIStream::eNoFileHeader);
237  } else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
238  submit.Reset(new CSeq_submit);
239  pObjIstrm.Read(ObjectInfo(*submit), CObjectIStream::eNoFileHeader);
240 
241  if (submit->GetData().GetEntrys().size() > 1) {
242  entry.Reset(new CSeq_entry);
243  entry->SetSet().SetSeq_set() = submit->GetData().GetEntrys();
244  }
245  else
246  entry = *submit->SetData().SetEntrys().begin();
247  } else if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
248  entry.Reset(new CSeq_entry);
249  pObjIstrm.Read(ObjectInfo(*entry), CObjectIStream::eNoFileHeader);
250  } else if (sType == CSeq_annot::GetTypeInfo()->GetName()) {
251  entry.Reset(new CSeq_entry);
252  do {
253  CRef<CSeq_annot> annot(new CSeq_annot);
254  pObjIstrm.Read(ObjectInfo(*annot), CObjectIStream::eNoFileHeader);
255  entry->SetSeq().SetAnnot().push_back(annot);
256  try {
257  sType = pObjIstrm.ReadFileHeader();
258  } catch (const CEofException&) {
259  sType.clear();
260  // ignore EOF exception
261  }
262  } while (sType == CSeq_annot::GetTypeInfo()->GetName());
263  } else {
264  return CRef<CSerialObject>();
265  }
266 
267  if (m_context.m_gapNmin > 0) {
268  CGapsEditor gap_edit(
274  gap_edit.ConvertNs2Gaps(*entry);
275  }
276 
277  if (submit.Empty())
278  return entry;
279  else
280  return submit;
281 }
282 
283 // ----------------------------------------------------------------------------
286 // ----------------------------------------------------------------------------
287 {
288  CAlnReader reader(instream);
289  reader.SetAllGap(args["aln-gapchar"].AsString());
290  reader.SetMissing(args["aln-gapchar"].AsString());
291  if (args["aln-alphabet"].AsString() == "nuc") {
293  } else {
295  }
296 
297  reader.Read(0, m_context.m_logger);
298  auto pSeqEntry =
299  reader.GetSeqEntry(
302 
303  if (pSeqEntry && args["a"]) {
305  s_StringToClass = {
307  { "s1", CBioseq_set::eClass_pop_set },
308  { "s2", CBioseq_set::eClass_phy_set },
309  { "s3", CBioseq_set::eClass_mut_set },
310  { "s4", CBioseq_set::eClass_eco_set },
312  };
313 
314  auto it = s_StringToClass.find(args["a"].AsString());
315  if (it != s_StringToClass.end()) {
316  pSeqEntry->SetSet().SetClass(it->second);
317  }
318  }
319 
320  return pSeqEntry;
321 }
322 
323 
324 // ----------------------------------------------------------------------------
327 {
328  if (m_context.m_gapNmin > 0) {
331  } else {
333 // | CFastaReader::fLeaveAsText;
334  }
335 
336  if (m_context.m_d_fasta) {
338  }
339 
344 
347 
348 
351 
352  unique_ptr<CFastaReaderEx> pReader(new CFastaReaderEx(m_context, instream, m_iFlags));
353  if (! pReader) {
355  "File format not supported", 0);
356  }
357  if (m_context.m_gapNmin > 0) {
358  pReader->SetMinGaps(m_context.m_gapNmin, m_context.m_gap_Unknown_length);
359  }
360 
361  // if (m_context.m_gap_evidences.size() > 0 || m_context.m_gap_type >= 0)
362  if (! m_context.m_GapsizeToEvidence.empty() ||
363  ! m_context.m_DefaultEvidence.empty() ||
364  m_context.m_gap_type >= 0) {
365  pReader->SetGapLinkageEvidence(
369  }
370 
371  int max_seqs = kMax_Int;
373  if (m_context.m_di_fasta)
374  result = pReader->ReadDeltaFasta(m_context.m_logger);
375  else if (m_context.m_d_fasta)
376  result = pReader->ReadDeltaFasta(m_context.m_logger);
377  else
378  result = pReader->ReadSet(max_seqs, m_context.m_logger);
379 
380  if (result.NotEmpty()) {
382  }
383 
384  if (result->IsSet() && ! m_context.m_HandleAsSet) {
387  "File " + m_context.m_current_file + " contains multiple sequences",
388  *(m_context.m_logger));
389  }
390  if (result->IsSet()) {
391  result->SetSet().SetClass(m_context.m_ClassValue);
392  }
393 
394  return result;
395 }
396 
397 // ----------------------------------------------------------------------------
399 {
400  CFormatGuessEx FG(istr);
402  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eFasta); // we wouldn't take "no" for an answer anyway
404  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eXml);
407 
408  if (! content_info) {
409  return FG.GuessFormat();
410  }
411 
413  return FG.GuessFormatAndContent(*content_info);
414 }
415 
416 // ----------------------------------------------------------------------------
// Guess the format of an annotation file and record it on the CHugeFile.
//
// Runs CFormatGuess over the stream held in file.m_stream and stores the
// result in file.m_format. For the two cases visible in the switch below it
// also sets file.m_serial_format (ASN.1 text or ASN.1 binary); every other
// guessed format leaves m_serial_format untouched.
//
// NOTE(review): the statements that register format hints and the `case`
// labels of the switch are absent from this listing. The labels are presumably
// CFormatGuess::eTextASN and CFormatGuess::eBinaryASN -- confirm against the
// repository copy.
417 void CMultiReader::xAnnotGetFormat(objects::edit::CHugeFile& file) const
418 {
419  auto* in = file.m_stream.get();
420  CFormatGuess FG(*in);
421  //FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eBinaryASN);
425  // FG.GetFormatHints().AddPreferredFormat(CFormatGuess::eGff2);
426  // RW-1591: Need at least GFF3 or GTF (plain or Augustus) to properly relate
427  // the features
430 #ifdef FLATFILE_PARSER_ENABLED
434 #endif
436 
// The guess is stored on the file object; callers read it back from there.
437  file.m_format = FG.GuessFormat();
438 
439  switch (file.m_format)
440  {
442  file.m_serial_format = eSerial_AsnText;
443  break;
445  file.m_serial_format = eSerial_AsnBinary;
446  break;
447  default:
// Non-ASN.1 formats: the serial format is left as it was.
448  break;
449  }
450 }
451 
452 // ----------------------------------------------------------------------------
454  const CSerialObject& object,
455  ostream& ostr)
456 {
458  //<< MSerial_VerifyNo
459  << object;
460  ostr.flush();
461 }
462 
464  m_context(context),
465  mAtSequenceData(false)
466 {
467 }
468 
// Read every descriptor object from the ASN.1 file `ifname` into `out_desc`.
//
// out_desc is first reset to a new, empty CSeq_descr. The file is then read
// object by object; three top-level types are accepted:
//   - Seq-descr : all contained descriptors are appended;
//   - Seqdesc   : the descriptor itself is appended;
//   - Pubdesc   : read into the Pub variant of a new Seqdesc and appended.
// Any other type raises std::runtime_error. That exception is not a
// CException, so it is not intercepted by the catch clause below.
//
// The loop ends when a CException whose message equals "end of file"
// (case-insensitive) is caught; any other CException is rethrown as
// std::runtime_error.
//
// NOTE(review): the second argument of each Read() call is absent from this
// listing.
// NOTE(review): the "must contain either Seq_descr or Seqdesc" message does
// not mention Pubdesc, which the code also accepts.
469 void CMultiReader::LoadDescriptors(const string& ifname, CRef<CSeq_descr>& out_desc) const
470 {
471  out_desc.Reset(new CSeq_descr);
472 
473  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
474 
475  // guess object type
476  //const string sType = pObjIstrm->ReadFileHeader();
477 
478  // do the right thing depending on the input type
479  while (true) {
480  try {
// The header of each successive object names its type.
481  const string sType = pObjIstrm->ReadFileHeader();
482  if (sType == CSeq_descr::GetTypeInfo()->GetName()) {
483  CRef<CSeq_descr> descr(new CSeq_descr);
484  pObjIstrm->Read(ObjectInfo(*descr),
486  out_desc->Set().insert(out_desc->Set().end(), descr->Get().begin(), descr->Get().end());
487  } else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
488  CRef<CSeqdesc> desc(new CSeqdesc);
489  pObjIstrm->Read(ObjectInfo(*desc),
491  out_desc->Set().push_back(desc);
492  } else if (sType == CPubdesc::GetTypeInfo()->GetName()) {
// A bare Pubdesc is wrapped in a Seqdesc before being stored.
493  CRef<CSeqdesc> desc(new CSeqdesc);
494  pObjIstrm->Read(ObjectInfo(desc->SetPub()),
496  out_desc->Set().push_back(desc);
497  } else {
498  throw runtime_error("Descriptor file must contain "
499  "either Seq_descr or Seqdesc elements");
500  }
501  } catch (CException& ex) {
// End of input is recognised by the exception text, not by its type.
502  if (! NStr::EqualNocase(ex.GetMsg(), "end of file")) {
503  throw runtime_error("Unable to read descriptor from file:" + ex.GetMsg());
504  }
505  break;
506  }
507  }
508 }
509 
// Load a submission template from the ASN.1 file `ifname` into m_context.
//
// The type name in the file header decides how the first object is handled:
// Seq-entry, Bioseq, Seq-submit, Submit-block and Seqdesc are accepted;
// anything else is rejected through NCBI_USER_THROW_FMT. The visible branches
// populate m_context.m_entry_template and/or m_context.m_submit_template.
// Afterwards any Seqdesc objects that follow in the same file are appended to
// the descriptors of the entry template.
//
// Throws std::runtime_error when a Seq-submit template does not hold exactly
// one Seq-entry, and when the resulting entry is a Bioseq-set (see the throw
// near the end).
//
// NOTE(review): many statements of this function are absent from this listing
// (gaps in the embedded line numbers), including several `if` headers and the
// declaration of `tmp`. The comments below describe only what is visible.
510 void CMultiReader::LoadTemplate(const string& ifname)
511 {
512  unique_ptr<CObjectIStream> pObjIstrm = xCreateASNStream(ifname);
513 
514  // guess object type
// This first header read is not wrapped in a try block, unlike the later one
// inside the Seqdesc loop below.
515  string sType = pObjIstrm->ReadFileHeader();
516 
517  // do the right thing depending on the input type
518  if (sType == CSeq_entry::GetTypeInfo()->GetName()) {
521  } else if (sType == CBioseq::GetTypeInfo()->GetName()) {
522  CRef<CBioseq> pBioseq( new CBioseq );
523  pObjIstrm->Read(ObjectInfo(*pBioseq), CObjectIStream::eNoFileHeader);
525  m_context.m_entry_template->SetSeq( *pBioseq );
526  } else if (sType == CSeq_submit::GetTypeInfo()->GetName()) {
// A Seq-submit template is only usable when it wraps exactly one Seq-entry.
529  if (! m_context.m_submit_template->GetData().IsEntrys()
530  || m_context.m_submit_template->GetData().GetEntrys().size() != 1) {
531  throw runtime_error("Seq-submit template must contain "
532  "exactly one Seq-entry");
533  }
534  } else if (sType == CSubmit_block::GetTypeInfo()->GetName()) {
535  // a Submit-block
536  CRef<CSubmit_block> submit_block(new CSubmit_block);
537  pObjIstrm->Read(ObjectInfo(*submit_block), CObjectIStream::eNoFileHeader);
538 
539  // Build a Seq-submit containing this plus a bogus Seq-entry
541  m_context.m_submit_template->SetSub(*submit_block);
// The placeholder entry is a raw DNA Bioseq with the local id "dummy_id".
542  CRef<CSeq_entry> ent(new CSeq_entry);
543  CRef<CSeq_id> dummy_id(new CSeq_id("lcl|dummy_id"));
544  ent->SetSeq().SetId().push_back(dummy_id);
545  ent->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
546  ent->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
547  m_context.m_submit_template->SetData().SetEntrys().push_back(ent);
548  } else if (sType == CSeqdesc::GetTypeInfo()->GetName()) {
549  // it's OK
550  } else {
551  NCBI_USER_THROW_FMT("Template must be Seq-entry, Seq-submit, Bioseq or "
552  "Submit-block. Object seems to be of type: " << sType);
553  }
554 
555  // for submit types, pull out the seq-entry inside and remember it
557  m_context.m_entry_template = m_context.m_submit_template->SetData().SetEntrys().front();
558  }
559 
560  // The template may contain a set rather than a seq.
561  // That's OK if it contains only one na entry, which we'll use.
// Only the first member of the set that has descriptors is used (note the
// `break` at the end of the `if (descr)` body).
564  for (auto ent_iter: m_context.m_entry_template->GetSet().GetSeq_set()) {
565  const CSeq_descr* descr = nullptr;
566  if (ent_iter->IsSetDescr()) {
567  descr = &ent_iter->GetDescr();
568  }
569  if (descr) {
570  //tmp->Assign(**ent_iter);
571  tmp->SetSeq().SetInst();
572  // Copy any descriptors from the set to the sequence
573  ITERATE(CBioseq_set::TDescr::Tdata, desc_iter, descr->Get())
574  {
// Only Pub and Source descriptors are carried over; all others are skipped.
575  switch ((*desc_iter)->Which()) {
576  case CSeqdesc::e_Pub:
577  case CSeqdesc::e_Source:
578  break;
579  default:
580  continue;
581  }
// Each kept descriptor is deep-copied with Assign() rather than shared.
582  CRef<CSeqdesc> desc(new CSeqdesc);
583  desc->Assign(**desc_iter);
584  tmp->SetSeq().SetDescr().Set().push_back(desc);
585  }
586  break;
587  }
588  }
589 
590  if (tmp->IsSetDescr() && !tmp->GetDescr().Get().empty())
592  }
593 
594  // incorporate any Seqdesc's that follow in the file
595  if (!pObjIstrm->EndOfData()) {
// When the first object was itself a Seqdesc its header has already been
// consumed, so a new header is read only for the other types.
596  if (sType != CSeqdesc::GetTypeInfo()->GetName())
597  sType = pObjIstrm->ReadFileHeader();
598 
599  while (sType == CSeqdesc::GetTypeInfo()->GetName()) {
600  CRef<CSeqdesc> desc(new CSeqdesc);
601  pObjIstrm->Read(ObjectInfo(*desc), CObjectIStream::eNoFileHeader);
602 
605 
// NOTE(review): as visible here, the result of SetData() is discarded for a
// DBLink user object -- confirm what effect is intended.
606  {
607  if (desc->IsUser() && desc->GetUser().IsDBLink()) {
608  CUser_object& user_obj = desc->SetUser();
609  if (user_obj.IsDBLink()) {
610  user_obj.SetData();
611  }
612  }
613  }
614 
615  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(desc);
616 
617  if (pObjIstrm->EndOfData())
618  break;
619 
// Reaching end of input while looking for the next header ends the loop.
620  try {
621  sType = pObjIstrm->ReadFileHeader();
622  } catch (CEofException&) {
623  break;
624  }
625  }
626  }
627 
// Disabled code, kept by the authors; it is not compiled.
628 #if 0
629  if (m_context.m_submit_template->IsEntrys()) {
630  // Take Seq-submit.sub.cit and put it in the Bioseq
631  CRef<CPub> pub(new CPub);
632  pub->SetSub().Assign(context.m_submit_template->GetSub().GetCit());
633  CRef<CSeqdesc> pub_desc(new CSeqdesc);
634  pub_desc->SetPub().SetPub().Set().push_back(pub);
635  m_context.m_entry_template->SetSeq().SetDescr().Set().push_back(pub_desc);
636  }
637 #endif
638 
640  throw runtime_error("The Seq-entry must be a Bioseq not a Bioseq-set.");
641  }
642 
// When the submit block carries a citation, its date is overwritten with
// `date`, whose declaration is not visible in this listing.
644  if (m_context.m_submit_template->IsSetSub() &&
645  m_context.m_submit_template->GetSub().IsSetCit()) {
647  m_context.m_submit_template->SetSub().SetCit().SetDate(*date);
648  }
649  }
650 
// Disabled code, kept by the authors; it is not compiled.
651 #if 0
652  if (args["output-type"].AsString() == "Seq-entry") {
653  // force Seq-entry by throwing out the Seq-submit
655  }
656 #endif
657 }
658 
659 namespace
660 {
661  class AllowedDuplicates : public set<CSeqdesc_Base::E_Choice>
662  {
663  public:
664  AllowedDuplicates()
665  {
666  insert(CSeqdesc_Base::e_User);
667  }
668  };
669  AllowedDuplicates m_allowed_duplicates;
670 
671  template <typename _which>
672  struct LocateWhich {
673  typename _which::E_Choice compare_to;
674  bool operator()(_which l) const
675  {
676  return l.Which() == compare_to;
677  }
678  bool operator()(const CRef<_which>& l) const
679  {
680  return l->Which() == compare_to;
681  }
682  };
683 }
684 
686 {
687  ITERATE(CSeq_descr::Tdata, it, source.Get())
688  {
689  MergeDescriptors(dest, **it);
690  }
691 }
692 
693 void CMultiReader::MergeDescriptors(CSeq_descr & dest, const CSeqdesc & source) const
694 {
695  bool duplicates = (m_allowed_duplicates.find(source.Which()) != m_allowed_duplicates.end());
696 
697  CAutoAddDesc desc(dest, source.Which());
698  desc.Set(duplicates).Assign(source);
699 }
700 
702 {
703  MergeDescriptors(entry.SetDescr(), source);
704  //g_ApplyDescriptors(source.Get(), entry);
705 }
706 
707 namespace
708 {
709  void CopyDescr(CSeq_entry& dest, const CSeq_entry& src) {
710  if (src.IsSetDescr() && ! src.GetDescr().Get().empty()) {
711  dest.SetDescr().Set().insert(dest.SetDescr().Set().end(),
712  src.GetDescr().Get().begin(),
713  src.GetDescr().Get().end());
714  }
715  }
716  void CopyAnnot(CSeq_entry& dest, const CSeq_entry& src) {
717  if (src.IsSetAnnot() && ! src.GetAnnot().empty()) {
718  dest.SetAnnot().insert(dest.SetAnnot().end(),
719  src.GetAnnot().begin(),
720  src.GetAnnot().end());
721  }
722  }
723 }
724 
// Read the GFF3 part of a combined GFF3+FASTA stream into `annots`.
//
// The annotations are read with post-processing switched off. The result of
// AtSeqenceData() is then checked: the branch taken when it is false carries
// the message "Specified GFF3 file does not include any sequence data".
// Otherwise the annotations are post-processed with x_PostProcessAnnots().
//
// NOTE(review): the logging call at the top and the opening of the
// error-raising statement inside the `if` are absent from this listing.
725 void CMultiReader::LoadGFF3Fasta(istream& in, TAnnots& annots)
726 {
730  string("Recognized input file as format: ") + CFormatGuess::GetFormatName(CFormatGuess::eGff3));
731 
// Post-processing is deferred until after the sequence-data check below.
732  bool post_process = false;
733  annots = xReadGFF3(in, post_process); // initializes m_gff3_reader!
734  if (! AtSeqenceData()) {
736  "Specified GFF3 file does not include any sequence data", 0);
737  }
738  x_PostProcessAnnots(annots);
739 }
740 
741 
743  const string& objectType,
744  unique_ptr<istream>& pIstr,
745  TAnnots& annots)
746 {
747  CRef<CSerialObject> pInputObject;
748  switch (format) {
751  pInputObject = xReadASN1Binary(*m_obj_stream, objectType);
752  break;
755  pInputObject = xReadASN1Text(*m_obj_stream);
756  break;
757  case CFormatGuess::eGff3:
758  LoadGFF3Fasta(*pIstr, annots);
759  case CFormatGuess::eFasta: // What about buffered input?
760  default:
762  pInputObject = xReadFasta(*pIstr);
763  }
764 
765  if (! pInputObject) {
767  "File format not recognized", 0);
768  }
769  // RW-617: apply template descriptors only if input is *not* ASN1:
770  // What about binary ASN.1?
771  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
772  return xApplyTemplate(pInputObject, merge_template_descriptors);
773 }
774 
775 
// Open `filename`, detect its format and read its sequence content.
//
// @param filename        path of the input file
// @param input_sequence  out: the object that was read, after
//                        xApplyTemplate() has been applied to it
// @param annots          out: filled by LoadGFF3Fasta() for GFF3 input
// @return the detected format
//
// The file is opened once in a nested scope solely for format detection and
// is opened again by whichever branch reads it. Template descriptors are
// merged for every format except CFormatGuess::eTextASN.
//
// NOTE(review): the declaration of `format`, the `case` labels of the two
// ASN.1 branches, the statements that set up m_obj_stream and the opening of
// the error-raising statement are absent from this listing.
776 CFormatGuess::EFormat CMultiReader::OpenFile(const string& filename, CRef<CSerialObject>& input_sequence, TAnnots& annots)
777 {
779  CFileContentInfo content_info;
780  {
// Scoped so that the detection stream is closed before the file is re-read.
781  unique_ptr<istream> istream(new CNcbiIfstream(filename));
782  format = xInputGetFormat(*istream, &content_info);
783  }
784 
785  switch (format)
786  {
789  input_sequence = xReadASN1Binary(*m_obj_stream, content_info.mInfoGenbank.mObjectType);
790  break;
793  input_sequence = xReadASN1Text(*m_obj_stream);
794  break;
795  case CFormatGuess::eGff3:
796  {
// The same stream is handed first to the GFF3 loader and then to the FASTA
// reader; presumably the FASTA part follows the GFF3 records in the file.
797  unique_ptr<istream> in(new CNcbiIfstream(filename));
798  LoadGFF3Fasta(*in, annots);
799  m_iFlags = 0;
801  input_sequence = xReadFasta(*in);
802  }
803  break;
804  default: // RW-616 - Assume FASTA
805  {
807  m_iFlags = 0;
809 
810  CBufferedInput istream;
811  istream.get().open(filename);
812  input_sequence = xReadFasta(istream);
813  }
814  break;
815  }
816  if (input_sequence.Empty())
818  "File format not recognized", 0);
819  //rw-617: apply template descriptors only if input is *not* ASN1:
// Only text ASN.1 is excluded by this test; binary ASN.1 still merges.
820  bool merge_template_descriptors = (format != CFormatGuess::eTextASN);
821  input_sequence = xApplyTemplate(input_sequence, merge_template_descriptors);
822 
823  return format;
824 }
825 
827 {
828  if (obj->GetThisTypeInfo() == CSeq_submit::GetTypeInfo()) {
829  submit.Reset(static_cast<CSeq_submit*>(obj.GetPointer()));
830  entry = submit->SetData().SetEntrys().front();
831  } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
832  entry.Reset(static_cast<CSeq_entry*>(obj.GetPointer()));
833  }
834 }
835 
// Normalise a freshly read object and optionally merge template descriptors.
//
// @param obj                         object returned by one of the readers
// @param merge_template_descriptors  when false, a warning is logged instead
//                                    (only if m_context.m_t and
//                                    m_context.m_logger are both set)
// @return the Seq-submit when `obj` was one, otherwise the Seq-entry (which
//         may be empty if `obj` was neither)
//
// For a bare Seq-entry that is a set with fewer than two members whose first
// member is a sequence, the set wrapper is dropped: its descriptors and
// annotations are copied onto the single sequence, which becomes the entry.
//
// NOTE(review): the statement executed when merge_template_descriptors is
// true and part of the CLineError construction are absent from this listing.
// NOTE(review): `size() < 2` also admits an empty set, for which front() is
// not valid -- confirm that an empty Bioseq-set cannot reach this point.
836 CRef<CSerialObject> CMultiReader::xApplyTemplate(CRef<CSerialObject> obj, bool merge_template_descriptors) const
837 {
838  CRef<CSeq_entry> entry;
839  CRef<CSeq_submit> submit;
840 
841  GetSeqEntry(entry, submit, obj);
842 
843  if (entry.NotEmpty()) // &&
844  {
// The brace-less `if (submit.Empty())` governs only the nested `if` that
// follows; ResetParentEntry()/Parentize() below run in either case.
845  if (submit.Empty())
846  if (entry->IsSet() && entry->GetSet().GetSeq_set().size() < 2 &&
847  entry->GetSet().GetSeq_set().front()->IsSeq())
848  {
849  CRef<CSeq_entry> seq = entry->SetSet().SetSeq_set().front();
850  CopyDescr(*seq, *entry);
851  CopyAnnot(*seq, *entry);
852  entry = seq;
853  }
854  entry->ResetParentEntry();
855  entry->Parentize();
856 
857  if (merge_template_descriptors) {
859  } else {
860  if (m_context.m_t && m_context.m_logger) {
861  string msg(
862  "Template file descriptors are ignored if input is ASN.1");
863  m_context.m_logger->PutError(
864  *unique_ptr<CLineError>(
866  eDiag_Warning, "", 0, "", "", "", msg)));
867  }
868  }
869  }
870 
871  if (submit.Empty())
872  return entry;
873  else
874  return submit;
875 }
876 
878 {
879  if (m_obj_stream)
880  return xReadASN1Text(*m_obj_stream);
881  else
882  return CRef<CSerialObject>();
883 }
884 
886 {
887  int flags = 0;
892 
893  CReaderListener readerListener;
894  CGff3Reader reader(flags, m_AnnotName, m_AnnotTitle, CReadUtil::AsSeqId, &readerListener);
895 
896  CStreamLineReader lr(instream);
897  TAnnots annots;
898 
899  try {
900  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
901  m_gff3_merger = reader.GetLocationMerger();
902  mAtSequenceData = reader.AtSequenceData();
903 
904  if (post_process) {
905  x_PostProcessAnnots(annots);
906  }
907 
908  for (const auto& msg : readerListener) {
909  m_context.m_logger->PutMessage(msg);
910  }
911  }
912  catch (const CReaderMessage& msg) {
913  m_context.m_logger->PutMessage(msg);
914  }
915 
916  return annots;
917 }
918 
919 
921 {
922  for (auto pFeat : ftable) {
923  if (pFeat->IsSetDbxref()) {
924  auto& dbxrefs = pFeat->SetDbxref();
925  auto it = remove_if(dbxrefs.begin(), dbxrefs.end(),
926  [](const CRef<CDbtag>& pDbtag) {
927  return(pDbtag && pDbtag->IsSetDb() &&
928  NStr::EqualNocase(pDbtag->GetDb(), "GenBank"));
929  });
930  dbxrefs.erase(it, dbxrefs.end());
931  if (dbxrefs.empty()) {
932  pFeat->ResetDbxref();
933  }
934  }
935  }
936 }
937 
938 
940 {
941  unsigned int startingLocusTagNumber = 1;
942  unsigned int startingFeatureId = 1;
943  for (auto it = annots.begin(); it != annots.end(); ++it) {
944 
945  auto& annot = **it;
946  auto& data = annot.SetData();
947  if (! data.IsFtable() || data.GetFtable().empty()) {
948  continue; // all that follows applies to feature tables only
949  }
950 
951  s_RemoveGenBankDbxrefs(data.SetFtable()); // RW-1861
952 
953  edit::CFeatTableEdit fte(
954  annot, 0, m_context.m_locus_tag_prefix, startingLocusTagNumber, startingFeatureId, m_context.m_logger);
955  //fte.InferPartials();
956  fte.GenerateMissingParentFeatures(m_context.m_eukaryote, m_gff3_merger.get());
958  if (m_context.m_locus_tag_prefix.empty() && !fte.AnnotHasAllLocusTags()) {
959  NCBI_THROW(CArgException, eNoArg,
960  "GFF annotation requires locus tags, which are missing from one or more genes, so the command line argument -locus-tag-prefix is needed");
961  }
962  fte.GenerateLocusTags();
963  }
964  fte.GenerateProteinAndTranscriptIds();
965  //fte.InstantiateProducts();
966  fte.ProcessCodonRecognized();
967  fte.EliminateBadQualifiers();
968  fte.SubmitFixProducts();
969 
970  startingLocusTagNumber = fte.PendingLocusTagNumber();
971  startingFeatureId = fte.PendingFeatureId();
972  }
973 }
974 
975 
976 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(const string& filename) const
977 {
978  unique_ptr<istream> instream(new CNcbiIfstream(filename));
979  return xCreateASNStream(CFormatGuess::eUnknown, instream);
980 }
981 
// Build a CObjectIStream over `instream` for the serial format of its content.
//
// @param format    format hint; it is overwritten with the result of
//                  xInputGetFormat() in the visible code
// @param instream  input stream; released by this function -- ownership
//                  passes to the returned object stream (eTakeOwnership)
// @return object stream opened with the serial data format selected by the
//         switch below
//
// NOTE(review): the line before the xInputGetFormat() call, the `case` labels
// of the two ASN.1 branches and the opening of the statement in `default:`
// are absent from this listing. That missing line may make the format
// detection conditional -- confirm against the repository copy.
982 unique_ptr<CObjectIStream> CMultiReader::xCreateASNStream(CFormatGuess::EFormat format, unique_ptr<istream>& instream) const
983 {
984  // guess format
985  ESerialDataFormat eSerialDataFormat = eSerial_None;
986  {
988  format = xInputGetFormat(*instream);
989 
990  switch(format) {
992  eSerialDataFormat = eSerial_AsnBinary;
993  break;
996  eSerialDataFormat = eSerial_AsnText;
997  break;
998  case CFormatGuess::eXml:
999  eSerialDataFormat = eSerial_Xml;
1000  break;
1001  default:
1003  "Descriptor file seems to be in an unsupported format: "
1005  break;
1006  }
1007 
1008  //instream.seekg(0);
1009  }
1010 
// release() empties `instream`; the caller's unique_ptr is null afterwards.
1011  unique_ptr<CObjectIStream> pObjIstrm(
1012  CObjectIStream::Open(eSerialDataFormat, *instream.release(), eTakeOwnership));
1013 
1014  return pObjIstrm;
1015 }
1016 
1018 {
1019 }
1020 
// Load an annotation file and expose it through an IIndexedFeatureReader.
//
// @param reader    out: receives the reader built for the file
// @param filename  path of the annotation file
//
// The format is guessed from the content via xAnnotGetFormat(). When the
// guess is eUnknown, the lower-cased filename extension is used instead
// (.gff/.gff3, .gtf, .tbl, .asn/.sqn/.sap). One branch of the switch builds a
// CFast5colReader directly; the others collect plain annotations, which are
// wrapped in a CWholeFileAnnotation at the end when no reader was produced
// and at least one annotation was read.
//
// NOTE(review): several statements are absent from this listing: the
// assignment for the ".tbl" suffix, the logging calls, some `case` labels, the
// value of `reader_flags`, the declaration of `unused` and the opening of the
// error-raising statement in `default:`.
1021 void CMultiReader::LoadIndexedAnnot(std::unique_ptr<IIndexedFeatureReader>& reader, const string& filename)
1022 {
1023  auto hugefile = std::make_unique<objects::edit::CHugeFile>();
1024  hugefile->OpenPlain(filename);
1025 
1026  xAnnotGetFormat(*hugefile);
1027  CFormatGuess::EFormat uFormat = hugefile->m_format;
1028 
1029  if (uFormat == CFormatGuess::eUnknown) {
// Content sniffing failed: fall back to the filename extension.
1030  string ext;
1031  CDirEntry::SplitPath(filename, nullptr, nullptr, &ext);
1032  NStr::ToLower(ext);
1033  if (ext == ".gff" || ext == ".gff3")
1034  uFormat = CFormatGuess::eGff3;
1035  else if (ext == ".gtf")
1036  uFormat = CFormatGuess::eGtf;
1037  else if (ext == ".tbl")
1039  else if (ext == ".asn" || ext == ".sqn" || ext == ".sap")
1040  uFormat = CFormatGuess::eTextASN;
1041 
1042  if (uFormat != CFormatGuess::eUnknown) {
1046  string("Presuming annotation format by filename suffix: ") + CFormatGuess::GetFormatName(uFormat));
1047  }
1048  } else {
1052  string("Recognized annotation format: ") + CFormatGuess::GetFormatName(uFormat));
1053  }
1054 
1055  TAnnots annots;
// Raw pointer to the stream held by hugefile, taken before the switch; the
// branch below that moves hugefile away does not use `in` afterwards.
1056  auto* in = hugefile->m_stream.get();
1057 
1058  switch (uFormat) {
1060  auto reader5col = std::make_unique<CFast5colReader>();
1061  //auto reader5col = std::make_unique<CWholeFileAnnotation>();
1062  long reader_flags =
1067 
1068  reader5col->Init(m_context.m_genome_center_id, reader_flags, m_context.m_logger);
// The reader takes over the CHugeFile; hugefile is empty after this call.
1069  reader5col->Open(std::move(hugefile));
1070  reader = std::move(reader5col);
1071  } break;
1072  case CFormatGuess::eTextASN: {
1073  auto obj_stream = hugefile->MakeObjStream(0);
1074  CRef<CSerialObject> obj = xReadASN1Text(*obj_stream);
1076  CRef<CSeq_entry> pEntry;
1077  GetSeqEntry(pEntry, unused, obj);
// Only the annotations attached to the entry are kept.
1078  if (pEntry && pEntry->IsSetAnnot()) {
1079  annots = pEntry->GetAnnot();
1080  }
1081  } break;
1082  case CFormatGuess::eGff3:
// `true` asks xReadGFF3 to post-process the annotations it reads.
1083  annots = xReadGFF3(*in, true);
1084  break;
1085  case CFormatGuess::eGtf:
1087  annots = xReadGTF(*in);
1088  break;
1089 #ifdef FLATFILE_PARSER_ENABLED
1093  auto pEntry = xReadFlatfile(uFormat, filename, *in);
1094  if (pEntry && pEntry->IsSetAnnot()) {
1095  annots = pEntry->GetAnnot();
1096  }
1097  } break;
1098 #endif
1099 
1100  default:
1102  "Annotation file format not recognized. Run format validator on your annotation file", 1);
1103  }
1104 
// No dedicated reader was created above: serve the collected annotations
// through a CWholeFileAnnotation instead.
1105  if (!reader.get() && !annots.empty()) {
1106  auto whole_file = std::make_unique<CWholeFileAnnotation>();
1107  whole_file->Init(m_context.m_genome_center_id, 0);
1108  whole_file->AddAnnots(annots);
1109  reader = std::move(whole_file);
1110  }
1111 }
1112 
1114 {
1115  if (!reader)
1116  return;
1117 
1118  CRef<CSeq_annot> pBioseqAnnot;
1119 
1120  std::vector<CRef<CSeq_id>> ids(bioseq.GetId().begin(), bioseq.GetId().end());
1121  std::sort(ids.begin(), ids.end(), [](CRef<CSeq_id> l, CRef<CSeq_id> r) -> bool
1122  {
1123  return CSeq_id::Score(l) < CSeq_id::Score(r);
1124  });
1125 
1126  //for (auto pSeqId : bioseq.GetId()) {
1127  for (auto pSeqId : ids) {
1128  auto annots = reader->GetAndUseAnnot(pSeqId);
1129 
1130  if (annots.empty()) {
1131 #ifdef _DEBUG
1132  //std::cerr << "Failed to find annot for:\n" << MSerial_AsnText << *pSeqId;
1133 #endif
1134  continue;
1135  }
1136 
1137  for (auto pAnnot : annots) {
1138  auto pAnnotId = IIndexedFeatureReader::GetAnnotId(*pAnnot);
1139  CRef<CSeq_id> matching_id = pSeqId;
1140 #ifdef _DEBUG
1141  //std::cerr << MSerial_AsnText << *pSeqId << *pAnnotId << *matching_id;
1142 #endif
1143 
1144  s_ModifySeqIds(*pAnnot, *pAnnotId, matching_id);
1145  }
1146 
1147  s_AddAnnotsToBioseq(annots, bioseq, pBioseqAnnot);
1148  }
1149 }
1150 
1151 
1153 {
1154  int flags = 0;
1158 
1160  CStreamLineReader lr(instream);
1161  TAnnots annots;
1162  try {
1163  reader.ReadSeqAnnots(annots, lr, m_context.m_logger);
1164  x_PostProcessAnnots(annots);
1165  } catch (CReaderMessage& msg) {
1166  m_context.m_logger->PutMessage(msg);
1167  }
1168 
1169  return annots;
1170 }
1171 
1172 #ifdef FLATFILE_PARSER_ENABLED
1174 {
1175  unique_ptr<Parser> pp(new Parser);
1176  switch (format)
1177  {
1179  pp->format = Parser::EFormat::GenBank;
1180  pp->source = Parser::ESource::GenBank;
1181  pp->seqtype = CSeq_id::e_Genbank;
1182  break;
1184  pp->format = Parser::EFormat::EMBL;
1185  pp->source = Parser::ESource::EMBL;
1186  pp->acprefix = ParFlat_EMBL_AC;
1187  pp->seqtype = CSeq_id::e_Embl;
1188  break;
1190  pp->format = Parser::EFormat::SPROT;
1191  pp->source = Parser::ESource::SPROT;
1192  pp->seqtype = CSeq_id::e_Swissprot;
1193  break;
1194  default:
1196  "This flat file format is not supported: " + filename, 0);
1197  break;
1198  }
1199 
1200  pp->output_format = Parser::EOutput::BioseqSet;
1201 
1203  auto obj = ffparser.Parse(*pp, instream);
1204  if (obj.NotEmpty()) {
1205  if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
1206  auto bioseq_set = Ref(CTypeConverter<CBioseq_set>::SafeCast(obj.GetPointerOrNull()));
1207  auto entry = Ref(new CSeq_entry);
1208  entry->SetSeq();
1209  auto& annot = entry->SetAnnot();
1210  for (auto& bioseq : bioseq_set->SetSeq_set()) {
1211  if (bioseq->IsSetAnnot())
1212  annot.splice(annot.end(), bioseq->SetAnnot());
1213  }
1214  if (entry->IsSetAnnot())
1215  return entry;
1216  }
1217  }
1218  return {};
1219 }
1220 #endif
1221 
1222 void CMultiReader::GetIndexedAnnot(std::unique_ptr<IIndexedFeatureReader>& reader, TAnnots& annots)
1223 {
1224  auto indexed_annots = std::make_unique<CWholeFileAnnotation>();
1225  indexed_annots->Init(m_context.m_genome_center_id, 0);
1226  indexed_annots->AddAnnots(annots);
1227  reader = std::move(indexed_annots);
1228 }
1229 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
Definition: utils.cpp:41
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
void SetAlphabet(const string &value)
Definition: aln_reader.hpp:371
CRef< objects::CSeq_entry > GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods, objects::ILineErrorListener *pErrorListener=nullptr)
Definition: aln_reader.cpp:722
void SetMissing(const string &value)
Definition: aln_reader.hpp:192
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Definition: aln_reader.hpp:433
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
_Stream & get()
Definition: utils.hpp:54
Definition: Date.hpp:53
@ ePrecision_day
Definition: Date.hpp:58
Modification of the CFastaReader class that allows for reading a single sequence as a degenerate mult...
@ fAllIdsAsLocal
= 0x100 (Do not attempt to parse accessions)
Definition: readfeat.hpp:75
@ fLeaveProteinIds
= 0x80 (Leave all protein_id as a qualifiers)
Definition: readfeat.hpp:74
@ fCreateGenesFromCDSs
= 0x10 (If a CDS has a gene xref, create a gene with the same intervals if one doesn't already exist....
Definition: readfeat.hpp:71
@ fPreferGenbankId
= 0x200 (Prefer Genbank accession ids)
Definition: readfeat.hpp:76
CRef< CSerialObject > Parse(Parser &parseInfo)
Definition: ftamain.cpp:721
Wraps CFormatGuess, and if CFormatGuess's result is Unknown, it tries every file reader until one wor...
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
CFormatGuess::CFormatHints & GetFormatHints(void)
Get format hints.
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
CFormatGuess::EFormat GuessFormat()
CFormatHints & AddPreferredFormat(TFormat fmt)
Mark the format as preferred.
CFormatHints & DisableAllNonpreferred(void)
Disable all formats not marked as preferred.
Class implements different ad-hoc unreliable file format identifications.
CFormatHints & GetFormatHints(void)
Get format hints.
EFormat
The formats are checked in the same order as declared here.
@ eFiveColFeatureTable
Five-column feature table.
@ eBinaryASN
Binary ASN.1.
@ eGtf
New GTF, CGtfReader.
@ eGff3
GFF3, CGff3Reader.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eGffAugustus
GFFish output of Augustus Gene Prediction.
@ eTextASN
Text ASN.1.
EFormat GuessFormat(EMode)
static const char * GetFormatName(EFormat format)
void ConvertNs2Gaps(CSeq_entry &entry)
Definition: gaps_edit.cpp:403
bool AtSequenceData() const
void ReadSeqAnnots(TAnnotList &, CNcbiIstream &, ILineErrorListener *=nullptr) override
Read all objects from the given input stream, returning them as a vector of Seq-annots.
shared_ptr< CGff3LocationMerger > GetLocationMerger()
@ fGenerateChildXrefs
Definition: gtf_reader.hpp:218
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:42
string m_AnnotName
Definition: multireader.hpp:92
CFormatGuess::EFormat OpenFile(const string &filename, CRef< CSerialObject > &input_sequence, TAnnots &annots)
void LoadGFF3Fasta(istream &in, TAnnots &annots)
static void GetSeqEntry(CRef< objects::CSeq_entry > &entry, CRef< objects::CSeq_submit > &submit, CRef< CSerialObject > obj)
string m_AnnotTitle
Definition: multireader.hpp:93
TAnnots xReadGFF3(CNcbiIstream &instream, bool post_process)
void MergeDescriptors(objects::CSeq_descr &dest, const objects::CSeq_descr &source) const
CMultiReader(CTable2AsnContext &context)
list< CRef< objects::CSeq_annot > > TAnnots
Definition: multireader.hpp:48
unique_ptr< CObjectIStream > xCreateASNStream(const string &filename) const
void LoadIndexedAnnot(std::unique_ptr< IIndexedFeatureReader > &reader, const string &filename)
void AddAnnots(IIndexedFeatureReader *reader, CBioseq &bioseq) const
CTable2AsnContext & m_context
Definition: multireader.hpp:94
CRef< CSerialObject > ReadNextEntry()
void xAnnotGetFormat(objects::edit::CHugeFile &file) const
static const set< TTypeInfo > kSupportedTypes
Definition: multireader.hpp:50
TAnnots xReadGTF(CNcbiIstream &instream) const
shared_ptr< objects::CGff3LocationMerger > m_gff3_merger
Definition: multireader.hpp:96
CRef< CSerialObject > xReadASN1Binary(CObjectIStream &pObjIstrm, const string &content_type) const
bool AtSeqenceData() const
Definition: multireader.hpp:75
bool mAtSequenceData
Definition: multireader.hpp:97
void WriteObject(const CSerialObject &, ostream &)
void LoadDescriptors(const string &ifname, CRef< objects::CSeq_descr > &out_desc) const
void ApplyDescriptors(objects::CSeq_entry &obj, const objects::CSeq_descr &source) const
CRef< CSerialObject > xApplyTemplate(CRef< CSerialObject > obj, bool merge_template_descriptors) const
CRef< objects::CSeq_entry > xReadFasta(CNcbiIstream &instream)
CRef< CSerialObject > FetchEntry(const CFormatGuess::EFormat &format, const string &objectType, unique_ptr< CNcbiIstream > &pIstr, TAnnots &annots)
CRef< objects::CSeq_entry > ReadAlignment(CNcbiIstream &instream, const CArgs &args)
void LoadTemplate(const string &ifname)
void GetIndexedAnnot(std::unique_ptr< IIndexedFeatureReader > &reader, TAnnots &annots)
void x_PostProcessAnnots(TAnnots &annots) const
CFormatGuess::EFormat xInputGetFormat(CNcbiIstream &, CFileContentInfo *=nullptr) const
unique_ptr< CObjectIStream > m_obj_stream
Definition: multireader.hpp:95
CRef< objects::CSeq_entry > xReadFlatfile(CFormatGuess::EFormat format, const string &filename, CNcbiIstream &instream)
CRef< CSerialObject > xReadASN1Text(CObjectIStream &pObjIstrm) const
CObjectIStream –.
Definition: objistr.hpp:93
Definition: Pub.hpp:56
static CRef< CSeq_id > AsSeqId(const string &rawId, long flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based on the given customization flags.
Definition: read_util.cpp:89
@ fAllIdsAsLocal
all identifiers are local IDs
Definition: reader_base.hpp:78
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetAnnot(void) const
Definition: Seq_entry.cpp:165
void ResetParentEntry(void)
Definition: Seq_entry.cpp:61
void SetDescr(CSeq_descr &value)
Definition: Seq_entry.cpp:134
TAnnot & SetAnnot(void)
Definition: Seq_entry.cpp:195
void Parentize(void)
Definition: Seq_entry.cpp:71
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
Base class for all serializable objects.
Definition: serialbase.hpp:150
Simple implementation of ILineReader for i(o)streams.
CSubmit_block –.
objects::ILineErrorListener * m_logger
objects::CGapsEditor::TEvidenceSet m_DefaultEvidence
CRef< objects::CSeq_entry > m_entry_template
CRef< objects::CSeq_submit > m_submit_template
void MergeWithTemplate(objects::CSeq_entry &entry) const
objects::CGapsEditor::TCountToEvidenceMap m_GapsizeToEvidence
objects::CBioseq_set::TClass m_ClassValue
void MakeGenomeCenterId(objects::CSeq_entry &entry) const
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
bool IsDBLink() const
virtual std::list< CRef< objects::CSeq_annot > > GetAndUseAnnot(CRef< objects::CSeq_id > seqid)=0
static CRef< objects::CSeq_id > GetAnnotId(const objects::CSeq_annot &annot)
Definition: annot_match.cpp:52
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
static uch flags
#define ParFlat_EMBL_AC
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_USER_THROW_FMT(message)
Throw a "user exception" with message processed as output to ostream.
Definition: ncbiexpt.hpp:724
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
@ fHyphensIgnoreAndWarn
When a hyphen is encountered in seq data, ignore it but warn.
Definition: fasta.hpp:112
@ fLetterGaps
Parse runs of Ns when splitting data.
Definition: fasta.hpp:105
@ fIgnoreMods
Ignore mods entirely. Incompatible with fAddMods.
Definition: fasta.hpp:115
@ fNoUserObjs
Don't save raw deflines in User-objects.
Definition: fasta.hpp:106
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fNoSplit
Don't split out ambiguous sequence regions.
Definition: fasta.hpp:99
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fValidate
Check (alphabetic) residue validity.
Definition: fasta.hpp:100
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
Definition: fasta.hpp:114
@ e_YES
SeqIds compared, but are different.
Definition: Seq_id.hpp:583
void Read(const CObjectInfo &object)
Read object of known type.
Definition: objistr.cpp:952
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
Definition: objectinfo.hpp:762
virtual string ReadFileHeader(void)
Read file header.
Definition: objistr.cpp:1121
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5355
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
TData & SetData(void)
Assign a value to Data data member.
TSub & SetSub(void)
Select the variant.
Definition: Pub_.cpp:195
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
void SetPub(TPub &value)
Assign a value to Pub data member.
Definition: Pubdesc_.cpp:72
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
FILE * file
static const CS_INT unused
Definition: long_binary.c:20
USING_SCOPE(objects)
constexpr auto sort(_Init &&init)
const CharType(& source)[N]
Definition: pointer.h:1149
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins lr(sljit_gpr dst, sljit_gpr src)
static void s_RemoveGenBankDbxrefs(list< CRef< CSeq_feat >> &ftable)
void g_LogDiagMessage(ILineErrorListener *logger, EDiagSev sev, const string &msg)
Definition: table2asn.cpp:185
else result
Definition: token2.c:20
CFileContentInfoGenbank mInfoGenbank
#define ftable
Definition: utilfeat.h:37
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Wed Sep 04 14:59:54 2024 by modify_doxy.py rev. 669887