NCBI C++ ToolKit
asn2flat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: asn2flat.cpp 102531 2024-05-24 19:31:38Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aaron Ucko, Mati Shomrat, Mike DiCuccio, Jonathan Kans, NCBI
27  *
28  * File Description:
29  * flat-file generator application
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbistr.hpp>
38 #include <corelib/ncbi_signal.hpp>
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/serial.hpp>
44 
51 
53 
55 #include <objmgr/scope.hpp>
56 #include <objmgr/seq_entry_ci.hpp>
57 #include <objmgr/util/sequence.hpp>
59 
66 
67 #include <util/compress/zlib.hpp>
68 #include <util/compress/stream.hpp>
69 
70 #include <objmgr/util/objutil.hpp>
71 
76 #include <future>
77 #include "fileset.hpp"
79 
80 #define USE_THREAD_POOL1
81 
82 #ifdef USE_THREAD_POOL
83 #include "threadpool.hpp"
84 #endif
85 
86 
87 #define USE_CDDLOADER
88 
89 #if defined(HAVE_LIBGRPC) && defined(HAVE_NCBI_VDB)
90 # define USE_SNPLOADER
91 #endif
92 
93 #ifdef USE_SNPLOADER
94 // ID-5865 : For SNP retrieval in PSG mode via SNP data loader
96 # include <grpc++/grpc++.h>
99 #endif
100 
101 #ifdef USE_CDDLOADER
104 #endif
105 
106 // For command-line app, the URL paths need to be absolute
107 #define NCBI_URL_BASE "https://www.ncbi.nlm.nih.gov"
108 
111 
112 
113 // all  - all sequence output stream (opened for -o, or on stdout when no -o* option is given)
114 // nuc  - nucleotide output stream (-on)
115 // gen  - genomic output stream (-og)
116 // rna  - RNA output stream (-or)
117 // prot - protein output stream (-op)
118 // unk  - unknown output stream (-ou)
119 enum class eFlatFileCodes { all, nuc, gen, rna, prot, unk, };
120 
128 
129 class CWrapINSDSet;
130 
132 {
133 public:
134  CAsn2FlatApp();
135  ~CAsn2FlatApp();
136 
137  void Init() override;
138  int Run() override;
139  bool WrapINSDSet(bool unwrap);
140 
141  // Each thread should have its own context
142  struct TFFContext {
145  CRef<CFlatFileGenerator> m_FFGenerator; // Flat-file generator
146  CFFMultiSourceFileSet::fileset_type m_streams; // multiple streams for each of eFlatFileCodes
147  };
148 
150 
151 protected:
152  bool SetFlatfileOstream(eFlatFileCodes _code, const string& name);
153 
154 private:
155  // types
157 
158  using TThreadStatePool = TResourcePool<TFFContext>;
159 
163  bool HandleSeqSubmit(TFFContext& context, CSeq_submit& sub) const;
164  void HandleTextId(TFFContext& context, const string& id) const;
165  bool HandleSeqId(TFFContext& context, const edit::CHugeAsnReader* reader, CConstRef<CSeq_id> seqid) const;
166 
170 
171  [[nodiscard]] unique_ptr<CObjectIStream> x_OpenIStream(const CArgs& args) const;
172 
173  void x_CreateFlatFileGenerator(TFFContext& context, const CArgs& args) const;
174  TSeqPos x_GetFrom(const CArgs& args) const;
175  TSeqPos x_GetTo(const CArgs& args) const;
176  ENa_strand x_GetStrand(const CArgs& args) const;
177  void x_GetLocation(const CSeq_entry_Handle& entry, const CArgs& args, CSeq_loc& loc) const;
178  CBioseq_Handle x_DeduceTarget(const CSeq_entry_Handle& entry) const;
181  int x_GenerateBatchMode(unique_ptr<CObjectIStream> is);
182  int x_GenerateTraditionally(unique_ptr<CObjectIStream> is, TFFContext& context, const CArgs& args) const;
183  int x_GenerateHugeMode();
184  template<typename _TMethod, typename... TArgs>
185  bool x_OneShot(_TMethod method, TArgs&& ... args);
186  template<typename _TMethod, typename... TArgs>
187  static void x_OneShotMethod(_TMethod method, const CAsn2FlatApp* app, CAsn2FlatApp::TThreadStatePool::TUniqPointer thread_state, TArgs ... args);
188 
191 
192  //[[nodiscard]] TGenbankBlockCallback* x_GetGenbankCallback(const CArgs& args) const;
193  //[[nodiscard]] ICanceled* x_CreateCancelBenchmarkCallback() const;
194 
195  // data
196  CRef<CObjectManager> m_Objmgr; // Object Manager can be mutable
198 #ifdef USE_THREAD_POOL
199  TThreadPool m_thread_pool;
200 #endif
201 
202  // Everything else should be unchangeable within CFlatfileGenerator runs
203 
206  bool m_use_mt{false};
208  unique_ptr<ICanceled> m_pCanceledCallback;
210  mutable std::atomic<bool> m_Exception;
212  bool m_PSGMode;
215  mutable std::atomic<bool> m_stopit{false};
216  edit::CHugeFileProcess m_huge_process;
217 
218 #ifdef USE_SNPLOADER
219  CRef<CSNPDataLoader> m_SNPDataLoader;
220  unique_ptr<ncbi::grpcapi::dbsnp::primary_track::DbSnpPrimaryTrack::Stub> m_SNPTrackStub;
221 #endif
222 #ifdef USE_CDDLOADER
224 #endif
225 };
226 
227 // only print <INSDSet> ... </INSDSet> wrappers if single output stream
229 {
230 public:
232  {
233  if (app)
234  if (app->WrapINSDSet(false))
235  m_app = app;
236  }
238  {
239  if (m_app)
240  m_app->WrapINSDSet(true);
241  }
242 
243 private:
244  CAsn2FlatApp* m_app = nullptr;
245 };
246 
247 // constructor
249 {
251  SetVersion(vers);
252 }
253 
254 // destructor
256 {
257 }
258 
260 {
261  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
262  arg_desc->SetUsageContext("", "Convert an ASN.1 Seq-entry into a flat report");
263 
264  // input
265  {
266  arg_desc->SetCurrentGroup("Input/Output Options");
267  // name
268  arg_desc->AddOptionalKey("i", "InputFile", "Input file name", CArgDescriptions::eInputFile);
269 
270  // input file serial format (AsnText\AsnBinary\XML, default: AsnText)
271  arg_desc->AddOptionalKey("serial", "SerialFormat", "Input file format", CArgDescriptions::eString);
272  arg_desc->SetConstraint("serial",
273  &(*new CArgAllow_Strings, "text", "binary", "XML"));
274  arg_desc->AddFlag("sub", "Submission");
275  // id
276  arg_desc->AddOptionalKey("id", "ID", "Specific ID to display", CArgDescriptions::eString);
277  arg_desc->AddOptionalKey("ids", "IDFile", "FIle of IDs to display, one per line", CArgDescriptions::eInputFile);
278  // accn
279  arg_desc->AddOptionalKey("accn", "AccnFilter", "Limit to specific accession", CArgDescriptions::eString);
280 
281  // input type:
282  arg_desc->AddDefaultKey("type", "AsnType", "ASN.1 object type", CArgDescriptions::eString, "any");
283  arg_desc->SetConstraint("type",
284  &(*new CArgAllow_Strings, "any", "seq-entry", "bioseq", "bioseq-set", "seq-submit"));
285 
286  // single output name
287  arg_desc->AddOptionalKey("o", "SingleOutputFile", "Single output file name", CArgDescriptions::eOutputFile);
288 
289  // file names
290  arg_desc->AddOptionalKey("on", "NucleotideOutputFile", "Nucleotide output file name", CArgDescriptions::eOutputFile);
291  arg_desc->SetDependency("on", CArgDescriptions::eExcludes, "o");
292 
293  arg_desc->AddOptionalKey("og", "GenomicOutputFile", "Genomic output file name", CArgDescriptions::eOutputFile);
294  arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "o");
295  arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "on");
296 
297  arg_desc->AddOptionalKey("or", "RNAOutputFile", "RNA output file name", CArgDescriptions::eOutputFile);
298  arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "o");
299  arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "on");
300 
301  arg_desc->AddOptionalKey("op", "ProteinOutputFile", "Protein output file name", CArgDescriptions::eOutputFile);
302  arg_desc->SetDependency("op", CArgDescriptions::eExcludes, "o");
303 
304  arg_desc->AddOptionalKey("ou", "UnknownOutputFile", "Unknown output file name", CArgDescriptions::eOutputFile);
305  arg_desc->SetDependency("ou", CArgDescriptions::eExcludes, "o");
306  }
307 
308  // batch processing
309  {
310  arg_desc->SetCurrentGroup("Batch Processing Options");
311  arg_desc->AddFlag("batch", "Process NCBI release file");
312  // compression
313  arg_desc->AddFlag("c", "Compressed file");
314  // propagate top descriptors
315  arg_desc->AddFlag("p", "Propagate top descriptors");
316  }
317 
318  // in flat_file_config.cpp
320 
321  // debugging options
322  {
323  arg_desc->SetCurrentGroup("Debugging Options - Subject to change or removal without warning");
324 
325  arg_desc->AddFlag("huge", "Use Huge files mode");
326  arg_desc->AddFlag("disable-huge", "Explicitly disable huge-files mode");
327  arg_desc->SetDependency("disable-huge",
329  "huge");
330  arg_desc->AddFlag("use_mt", "Use multiple threads when possible");
331 
332 #if 0
333  // benchmark cancel-checking
334  arg_desc->AddFlag(
335  "benchmark-cancel-checking",
336  "Check statistics on how often the flatfile generator checks if "
337  "it should be canceled. This also sets up SIGUSR1 to trigger "
338  "cancellation.");
339 #endif
340  }
341 
343 
344  arg_desc->SetCurrentGroup(kEmptyStr);
345  SetupArgDescriptions(arg_desc.release());
346 }
347 
348 
349 bool CAsn2FlatApp::SetFlatfileOstream(eFlatFileCodes _code, const string& name)
350 {
351  const CArgs& args = GetArgs();
352  if (args[name]) {
353  auto filename = args[name].AsString();
354  m_writers.Open(_code, filename);
355  return true;
356  } else
357  return false;
358 }
359 
361 {
363  if (! expires.IsEmpty()) {
364  expires.AddYear();
365  if (CTime(CTime::eCurrent) > expires) {
366  NcbiCerr << "This copy of " << GetProgramDisplayName()
367  << " is more than 1 year old. Please download the current version if it is newer." << endl;
368  }
369  }
370 
371  m_Exception = false;
372  m_FetchFail = false;
373  m_HugeFileMode = false;
374  m_AccessionFilter.clear();
375 
376  // initialize conn library
378 
379  const CArgs& args = GetArgs();
380 
382 
383  // create object manager
385  if (! m_Objmgr) {
386  NCBI_THROW(CException, eUnknown, "Could not create object manager");
387  }
388  if (args["gbload"] || args["id"] || args["ids"]) {
392 
393  if ((args["enable-external"] && ! args["no-external"]) || args["policy"].AsString() == "external") {
394  CGBDataLoader* gb_loader = dynamic_cast<CGBDataLoader*>(CObjectManager::GetInstance()->FindDataLoader("GBLOADER"));
395  if (gb_loader) {
396  // needed to find remote features when reading local ASN.1 file
397  gb_loader->CGBDataLoader::SetAlwaysLoadExternal(true);
398  }
399  }
400  }
401 
402  //const CNcbiRegistry& cfg = CNcbiApplication::Instance()->GetConfig();
403  const CNcbiRegistry& cfg = GetConfig();
404  m_PSGMode = cfg.GetBool("genbank", "loader_psg", false, 0, CNcbiRegistry::eReturn);
405  if (m_PSGMode) {
406 #ifdef USE_SNPLOADER
407  string host = cfg.GetString("SNPAccess", "host", "");
408  string port = cfg.GetString("SNPAccess", "port", "");
409  string hostport = host + ":" + port;
410 
411  auto channel =
412  grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());
413  m_SNPTrackStub =
415  m_SNPDataLoader.Reset(CSNPDataLoader::RegisterInObjectManager(*m_Objmgr).GetLoader());
416 #endif
417 #ifdef USE_CDDLOADER
418  bool use_mongo_cdd =
419  cfg.GetBool("genbank", "vdb_cdd", false, 0, CNcbiRegistry::eReturn) &&
420  cfg.GetBool("genbank", "always_load_external", false, 0, CNcbiRegistry::eReturn);
421  if (use_mongo_cdd) {
423  }
424 #endif
425  }
426 
427  m_state_pool.SetReserved(20);
428  m_state_pool.SetInitFunc(
429  [this](TFFContext& ctx) {
431  },
432  [this](TFFContext& ctx) {
434  });
435 
436  m_HugeFileMode = ! args["disable-huge"] && (args["huge"] || cfg.GetBool("asn2flat", "UseHugeFiles", false));
437 
438  if (m_HugeFileMode && ! args["i"]) {
439  NcbiCerr << "Use of -huge mode also requires use of the -i argument. Disabling -huge mode." << endl;
440  m_HugeFileMode = false;
441  }
442  if (m_HugeFileMode && args["i"].AsString() == "/dev/stdin") {
443  NcbiCerr << "Use of -huge mode is incompatible with -i /dev/stdin. Disabling -huge mode." << endl;
444  m_HugeFileMode = false;
445  }
446  if (m_HugeFileMode && args["batch"]) {
447  NcbiCerr << "Use of -huge cannot be combined with -batch. Disabling -huge mode." << endl;
448  m_HugeFileMode = false;
449  }
450  if (m_HugeFileMode && args["c"]) {
451  NcbiCerr << "Use of -huge cannot be combined with -c. Disabling -huge mode." << endl;
452  m_HugeFileMode = false;
453  }
454 
455  m_use_mt = args["use_mt"] && (args["batch"] || m_HugeFileMode);
456 
458  m_writers.SetDepth(20);
459 
460  // open the output streams
461  bool has_o_arg = SetFlatfileOstream(eFlatFileCodes::all, "o");
462  bool has_on_arg = SetFlatfileOstream(eFlatFileCodes::nuc, "on");
463  bool has_og_arg = SetFlatfileOstream(eFlatFileCodes::gen, "og");
464  bool has_or_arg = SetFlatfileOstream(eFlatFileCodes::rna, "or");
465  bool has_op_arg = SetFlatfileOstream(eFlatFileCodes::prot, "op");
466  bool has_ou_arg = SetFlatfileOstream(eFlatFileCodes::unk, "ou");
467 
468  bool has_o_args = has_o_arg || has_on_arg || has_og_arg || has_or_arg || has_op_arg || has_ou_arg;
469  if (! has_o_args) {
470  // No output (-o*) argument given - default to stdout
471  m_writers.Open(eFlatFileCodes::all, std::cout);
472  }
473 
474  m_do_cleanup = ! args["nocleanup"];
475  m_OnlyNucs = false;
476  m_OnlyProts = false;
477  if (args["o"]) {
478  auto view = args["view"].AsString();
479  m_OnlyNucs = (view == "nuc");
480  m_OnlyProts = (view == "prot");
481  }
482 
483  if (args["accn"]) {
484  m_AccessionFilter = args["accn"].AsString();
485  }
486 
487  CWrapINSDSet wrap(this);
488 
489  if (args["id"]) {
490  auto thread_state = m_state_pool.Allocate(); // this can block and wait if too many writers already allocated
491  TFFContext& context = *thread_state;
492  HandleTextId(context, args["id"].AsString());
493  if (m_Exception)
494  return -1;
495  return 0;
496  }
497 
498  if (args["ids"]) {
499  auto thread_state = m_state_pool.Allocate(); // this can block and wait if too many writers already allocated
500  TFFContext& context = *thread_state;
501 
502  CNcbiIstream& istr = args["ids"].AsInputFile();
503  string id_str;
504  while (NcbiGetlineEOL(istr, id_str)) {
505  id_str = NStr::TruncateSpaces(id_str);
506  if (id_str.empty() || id_str[0] == '#') {
507  continue;
508  }
509 
510  try {
511  LOG_POST(Error << "id = " << id_str);
512  HandleTextId(context, id_str);
513  } catch (CException& e) {
514  ERR_POST(Error << e);
515  }
516  }
517  if (m_Exception)
518  return -1;
519  return 0;
520  }
521 
522  if (args["i"]) {
523  m_huge_process.OpenFile(args["i"].AsString(), args["c"] ? nullptr : &edit::CHugeFileProcess::g_supported_types);
524  }
525 
526  if (m_HugeFileMode) {
527  m_huge_process.OpenReader();
528  return x_GenerateHugeMode();
529  }
530 
531  // only uncompressed files take this faster shortcut
532  if (args["i"] && args["batch"] && !args["c"] &&
533  m_huge_process.GetFile().IsOpen() &&
534  (m_huge_process.GetFile().m_serial_format == eSerial_AsnBinary ||
535  m_huge_process.GetFile().m_serial_format == eSerial_AsnText)
536  ) {
537  auto content = m_huge_process.GetFile().RecognizeContent(0);
538  if (content == CBioseq_set::GetTypeInfo()) {
539  auto istr = m_huge_process.GetFile().MakeObjStream();
540  return x_GenerateBatchMode(std::move(istr));
541  }
542  }
543 
544  // m_huge_process.GetFile() would be still used if it is already open
545  unique_ptr<CObjectIStream> is = x_OpenIStream(args);
546  if (! is) {
547  string msg = args["i"] ? "Unable to open input file" + args["i"].AsString() : "Unable to read data from stdin";
549  }
550 
551  // traditional way of opening batch mode
552  if (args["batch"]) {
553  // batch mode can be multi-threaded, so it has its own TFFContext(s)
554  return x_GenerateBatchMode(std::move(is));
555  }
556 
557  // temporarily, only one context exists
558  auto thread_state = m_state_pool.Allocate(); // this can block and wait if too many writers already allocated
559  TFFContext& context = *thread_state;
560 
561  if (args["sub"]) {
562  HandleSeqSubmit(context, *is);
563  if (m_Exception)
564  return -1;
565  return 0;
566  }
567 
568  return x_GenerateTraditionally(std::move(is), context, args);
569 }
570 
571 template<typename _TMethod, typename... TArgs>
572 void CAsn2FlatApp::x_OneShotMethod(_TMethod method, const CAsn2FlatApp* app, CAsn2FlatApp::TThreadStatePool::TUniqPointer thread_state, TArgs ... args)
573 { // this could called from main or processing threads
574  if (app->m_stopit)
575  return;
576 
577  CAsn2FlatApp::TFFContext& context = *thread_state;
578 
579  auto success = method(app, context, std::forward<TArgs>(args)...);
580  if (!success) app->m_stopit = true;
581 }
582 
// Allocates a context from m_state_pool and runs `method` (a pointer to a
// CAsn2FlatApp member function) through x_OneShotMethod, either on another
// thread (m_use_mt) or directly on the calling thread.
//
// Returns false when m_stopit is already set on entry, or - on the
// single-threaded path - when m_stopit is set after the call returns.
// On the multi-threaded path it returns true right after handing the work off.
583 template<typename _TMethod, typename... TArgs>
584 bool CAsn2FlatApp::x_OneShot(_TMethod method, TArgs&& ... args)
585 {
587  "CAsn2FlatApp::x_OneShot should be used with class member function pointers");
588 
589  if (m_stopit) // faster exit if other threads already failed
590  return false;
591 
592  // each thread needs to have its own context
593  // the writers queue controls how many of them can run simultaneously
594  // these resources must be allocated sequentially, in the order of the incoming entries
595  auto thread_state = m_state_pool.Allocate(); // this can block and wait if too many writers already allocated
596 
597  // method is just a pointer to a member function, so wrap it into a callable
598  auto member = std::mem_fn(method);
599  // select the x_OneShotMethod instantiation matching the wrapped member and the decayed argument types
600  auto calleable = x_OneShotMethod<decltype(member), std::decay_t<TArgs>...>;
601 
602  if (m_use_mt) {
    // NOTE(review): this file defines USE_THREAD_POOL1, not USE_THREAD_POOL, so as far as
    // this file shows the #else branch (a detached std::thread per call) is the one compiled.
603 #ifdef USE_THREAD_POOL
604  m_thread_pool.OneShot(calleable, member, this, std::move(thread_state), std::forward<TArgs>(args)...);
605 #else
606  std::thread(calleable, member, this, std::move(thread_state), std::forward<TArgs>(args)...).detach();
607 #endif
608  } else {
    // single-threaded: run in place and report a failure flagged by the callee
609  calleable(member, this, std::move(thread_state), std::forward<TArgs>(args)...);
610  if (m_stopit)
611  return false;
612  }
613 
614  return true;
615 }
616 
617 bool CAsn2FlatApp::HandleSeqId(TFFContext& context, const edit::CHugeAsnReader* reader, CConstRef<CSeq_id> seqid) const
618 {
619  if (reader && seqid) {
620  auto entry = reader->LoadSeqEntry(seqid);
621  if (!entry) {
622  return false;
623  }
624  if (auto pSubmitBlock = reader->GetSubmitBlock(); pSubmitBlock) {
625  auto pSeqSubmit = Ref(new CSeq_submit());
626  pSeqSubmit->SetSub().Assign(*pSubmitBlock);
627  pSeqSubmit->SetData().SetEntrys().push_back(entry);
628  return HandleSeqSubmit(context, *pSeqSubmit);
629  }
630  else {
631  return HandleSeqEntry(context, entry);
632  }
633  }
634  return false;
635 }
636 
637 int CAsn2FlatApp::x_GenerateBatchMode(unique_ptr<CObjectIStream> is)
638 {
639  bool propagate = GetArgs()[ "p" ];
640  CGBReleaseFile in( *is.release(), propagate ); // CGBReleaseFile will delete the input stream
641 
642  // for multi-threading processing TFFContext instance is associated with thread
643  // thread can run detached, entry and context will be associated with the thread
644  in.RegisterHandler( [this] (CRef<CSeq_entry>& entry)->bool
645  {
646  return x_OneShot(&CAsn2FlatApp::HandleSeqEntry, std::move(entry));
647  });
648 
649  in.Read(); // registered handlers will be called from this function
650 
651  m_writers.FlushAll(); // this will wait until all writers quit
652 
653  if (m_Exception) return -1;
654  return 0;
655 }
656 
658 {
659  CRef<CSeq_id> seqid;
660  if (! m_AccessionFilter.empty()) {
661  CBioseq::TId ids;
663  if (! ids.empty()) {
664  seqid = ids.front();
665  }
666  }
667 
668  bool all_success = m_huge_process.Read([this, seqid](edit::CHugeAsnReader* reader, const std::list<CConstRef<CSeq_id>>& idlist)
669  {
670  bool success = true;
671  if (seqid) {
672  success = x_OneShot(&CAsn2FlatApp::HandleSeqId, reader, seqid);
673  }
674  else {
675  for (auto id: idlist) {
676  if (!x_OneShot(&CAsn2FlatApp::HandleSeqId, reader, id)) {
677  success = false;
678  break;
679  }
680  }
681  }
682  m_writers.FlushAll(); // this will wait until all writers quit
683  return success;
684  });
685 
686  if (! all_success || m_Exception)
687  return -1;
688  return 0;
689 }
690 
// Reads objects from `is` one at a time until EndOfData() and formats each of them
// using `context`.
//
// The ASN.1 type to read is the content type recorded by m_huge_process when its
// file is open; only when that is null is the user's -type argument consulted.
// With -type "any" and no recognized type, the stream is re-opened via
// x_OpenIStream between attempts at the different object types.
//
// Returns -1 when m_Exception is set, 0 otherwise.
// Throws CException (eUnknown) when an object cannot be turned into a Seq-entry.
691 int CAsn2FlatApp::x_GenerateTraditionally(unique_ptr<CObjectIStream> is, TFFContext& context, const CArgs& args) const
692 {
693  TTypeInfo asn_info = nullptr;
694  string asn_type = args["type"].AsString();
695 
    // prefer the content type already recorded for the opened input file
696  if (m_huge_process.GetConstFile().IsOpen()) {
697  asn_info = m_huge_process.GetConstFile().m_content;
698  }
699 
700  if (asn_info == nullptr)
701  { // if we failed to recognize the content type, fall back to the user's -type argument, which is otherwise ignored
702  if (asn_type == "seq-entry") {
703  asn_info = CSeq_entry::GetTypeInfo();
704  } else if (asn_type == "bioseq") {
705  asn_info = CBioseq::GetTypeInfo();
706  } else if (asn_type == "bioseq-set") {
707  asn_info = CBioseq_set::GetTypeInfo();
708  } else if (asn_type == "seq-submit") {
709  asn_info = CSeq_submit::GetTypeInfo();
710  }
711  }
712 
713  if (asn_info == CSeq_entry::GetTypeInfo()) {
714  //
715  // Straight through processing: Read a seq_entry, then process
716  // a seq_entry:
717  //
718  while (! is->EndOfData()) {
720  if (! seh) {
721  NCBI_THROW(CException, eUnknown, "Unable to construct Seq-entry object");
722  }
    // drop the entry from the scope again once it has been handled
724  context.m_Scope->RemoveTopLevelSeqEntry(seh);
725  }
726  } else if (asn_info == CBioseq::GetTypeInfo()) {
727  //
728  // Read object as a bioseq, wrap it into a seq_entry, then process
729  // the wrapped bioseq as a seq_entry:
730  //
731  while (! is->EndOfData()) {
733  if (! seh) {
734  NCBI_THROW(CException, eUnknown, "Unable to construct Seq-entry object");
735  }
737  context.m_Scope->RemoveTopLevelSeqEntry(seh);
738  }
739  } else if (asn_info == CBioseq_set::GetTypeInfo()) {
740  //
741  // Read object as a bioseq_set, wrap it into a seq_entry, then
742  // process the wrapped bioseq_set as a seq_entry:
743  //
744  while (! is->EndOfData()) {
746  if (! seh) {
747  NCBI_THROW(CException, eUnknown, "Unable to construct Seq-entry object");
748  }
750  context.m_Scope->RemoveTopLevelSeqEntry(seh);
751  }
752  } else if (asn_info == CSeq_submit::GetTypeInfo()) {
    // Seq-submit objects are read and handled by the stream overload of HandleSeqSubmit
753  while (! is->EndOfData()) {
754  HandleSeqSubmit(context, *is);
755  }
756  } else if (asn_type == "any") {
757  //
758  // Try the first four in turn:
759  //
760  while (! is->EndOfData()) {
761  string strNextTypeName = is->PeekNextTypeName();
762 
764  if (! seh) {
    // each failed attempt closes and re-opens the input before trying the next type
765  is->Close();
766  is = x_OpenIStream(args);
767  seh = ObtainSeqEntryFromBioseqSet(context, *is, false);
768  if (! seh) {
769  is->Close();
770  is = x_OpenIStream(args);
771  seh = ObtainSeqEntryFromBioseq(context, *is, false);
772  if (! seh) {
773  is->Close();
774  is = x_OpenIStream(args);
776  *is >> *sub;
777  if (sub->IsSetSub() && sub->IsSetData()) {
    // a Seq-submit ends processing here: the function returns right after handling it
778  HandleSeqSubmit(context, *sub);
779  if (m_Exception)
780  return -1;
781  return 0;
782  } else {
783  NCBI_THROW(
784  CException, eUnknown, "Unable to construct Seq-entry object");
785  }
786  }
787  }
788  }
790  context.m_Scope->RemoveTopLevelSeqEntry(seh);
791  }
792  }
793 
    // m_Exception is set by the handlers when flat-file generation failed
794  if (m_Exception)
795  return -1;
796  return 0;
797 }
798 
799 
801 {
802  if (! sub.IsSetSub() || ! sub.IsSetData() || ! sub.GetData().IsEntrys() || sub.GetData().GetEntrys().empty()) {
803  return false;
804  }
805 
806  if (m_do_cleanup) {
807  context.m_cleanup.BasicCleanup(sub);
808  }
809  // NB: though the spec specifies a submission may contain multiple entries
810  // this is not the case. A submission should only have a single Top-level
811  // Seq-entry
812  CConstRef<CSeq_entry> e(sub.GetData().GetEntrys().front());
813  CSeq_entry_Handle seh;
814  try {
815  seh = context.m_Scope->GetSeq_entryHandle(*e);
816  } catch (CException&) {
817  }
818 
819  if (! seh) { // add to scope if not already in it
820  seh = context.m_Scope->AddTopLevelSeqEntry(*e);
821  }
822  // "remember" the submission block
823  context.m_FFGenerator->SetSubmit(sub.GetSub());
824 
825  try {
826  x_FFGenerate(seh, context);
827  } catch (const CException& exc) {
828  ERR_POST(Error << exc);
829  m_Exception = true;
830  }
831  context.m_Scope->RemoveTopLevelSeqEntry(seh);
832  return true;
833 }
834 
835 
836 // ============================================================================
838 // ============================================================================
839 {
841  is >> *sub;
842  return HandleSeqSubmit(context, *sub);
843 }
844 
845 // ============================================================================
846 void CAsn2FlatApp::HandleTextId(TFFContext& context, const string& strId) const
847 // ============================================================================
848 {
849  CSeq_entry_Handle seh;
850 
851  // This C++-scope gets a raw CSeq_entry that has no attachment
852  // to any CScope and puts it into entry.
853  {
854  CSeq_id id(strId);
855  CBioseq_Handle bsh = context.m_Scope->GetBioseqHandle(id);
856  if (! bsh) {
857  NCBI_THROW(
858  CException, eUnknown, "Unable to retrieve data for the given ID");
859  }
860  seh = bsh.GetParentEntry();
861  /*
862  CConstRef<CSeq_entry> ser = seh.GetTopLevelEntry().GetCompleteSeq_entry();
863  if (ser) {
864  cout << MSerial_AsnText << *ser << endl;
865  }
866  */
867  }
868 
869  //
870  // ... and use that to generate the flat file:
871  //
872  HandleSeqEntryHandle(context, seh);
873 }
874 
875 // ============================================================================
877 // ============================================================================
878 {
879  //const CArgs& args = GetArgs();
880 
881  if (m_do_cleanup) {
884  CBioseq_EditHandle bseqh;
885  CRef<CSeq_entry> tmp_se(new CSeq_entry);
886  if (tseh.IsSet()) {
887  bseth = tseh.SetSet();
889  bseth.Remove(bseth.eKeepSeq_entry);
890  tmp_se->SetSet(const_cast<CBioseq_set&>(*bset));
891  } else {
892  bseqh = tseh.SetSeq();
893  CConstRef<CBioseq> bseq = bseqh.GetCompleteObject();
894  bseqh.Remove(bseqh.eKeepSeq_entry);
895  tmp_se->SetSeq(const_cast<CBioseq&>(*bseq));
896  }
897 
898  context.m_cleanup.BasicCleanup(*tmp_se);
899 
900  if (tmp_se->IsSet()) {
901  tseh.SelectSet(bseth);
902  } else {
903  tseh.SelectSeq(bseqh);
904  }
905  }
906 
907  try {
908  x_FFGenerate(seh, context);
909  } catch (CException& e) {
910  ERR_POST(Error << e);
911  m_Exception = true;
912  }
913 
914  return true;
915 }
916 
918 {
919  try {
921  is >> *se;
922  if (se->Which() == CSeq_entry::e_not_set) {
923  NCBI_THROW(CException, eUnknown, "provided Seq-entry is empty");
924  }
925  return context.m_Scope->AddTopLevelSeqEntry(*se);
926  } catch (CException& e) {
927  if (report) {
928  ERR_POST(Error << e);
929  }
930  }
931  return CSeq_entry_Handle();
932 }
933 
935 {
936  try {
937  CRef<CBioseq> bs(new CBioseq);
938  is >> *bs;
939  CBioseq_Handle bsh = context.m_Scope->AddBioseq(*bs);
940  return bsh.GetTopLevelEntry();
941  } catch (CException& e) {
942  if (report) {
943  ERR_POST(Error << e);
944  }
945  }
946  return CSeq_entry_Handle();
947 }
948 
950 {
951  try {
952  CRef<CSeq_entry> entry(new CSeq_entry);
953  is >> entry->SetSet();
954  return context.m_Scope->AddTopLevelSeqEntry(*entry);
955  } catch (CException& e) {
956  if (report) {
957  ERR_POST(Error << e);
958  }
959  }
960  return CSeq_entry_Handle();
961 }
962 
964 {
965  if (! se) {
966  return false;
967  }
968 
969  // add entry to scope
970  CSeq_entry_Handle entry = context.m_Scope->AddTopLevelSeqEntry(*se);
971  if (! entry) {
972  NCBI_THROW(CException, eUnknown, "Failed to insert entry to scope.");
973  }
974 
975  bool ret = HandleSeqEntryHandle(context, entry);
976  // Needed because we can really accumulate a lot of junk otherwise,
977  // and we end up with significant slowdown due to repeatedly doing
978  // linear scans on a growing CScope.
979  context.m_Scope->ResetDataAndHistory();
980  return ret;
981 }
982 
// Builds the object input stream for the run.
//
// Serial format: binary when -batch is given, text otherwise, unless -serial
// overrides it. Source: the file already opened by m_huge_process if there is
// one, else the -i file, else std::cin. With -c the source is wrapped in a
// decompressing stream first.
//
// Returns the stream wrapped in a unique_ptr; the pointer is null when
// CObjectIStream::Open yields null.
983 unique_ptr<CObjectIStream> CAsn2FlatApp::x_OpenIStream(const CArgs& args) const
984 {
985  // determine the file serialization format.
986  // default for batch files is binary, otherwise text.
987  ESerialDataFormat serial = args["batch"] ? eSerial_AsnBinary : eSerial_AsnText;
988  if (args["serial"]) {
989  const string& val = args["serial"].AsString();
990  if (val == "text") {
991  serial = eSerial_AsnText;
992  } else if (val == "binary") {
993  serial = eSerial_AsnBinary;
994  } else if (val == "XML") {
995  serial = eSerial_Xml;
996  }
997  }
998 
999  // pick the underlying input stream. If -i was given on the command line
1000  // then the input comes from a file. Otherwise, it comes from stdin:
1001  CNcbiIstream* pInputStream = nullptr;
1002  bool bDeleteOnClose = false;
1003 
1004  if (m_huge_process.GetConstFile().IsOpen()) {
    // an already opened, uncompressed file provides its own object stream; note that
    // the `serial` value computed above is not used on this path
1005  if (!args["c"])
1006  return m_huge_process.GetConstFile().MakeObjStream();
1007 
    // compressed input: borrow the raw stream still held by m_huge_process
1008  pInputStream = m_huge_process.GetConstFile().m_stream.get();
1009  } else {
1010  if (args["i"]) {
1011  pInputStream = new CNcbiIfstream(args["i"].AsString(), ios::binary);
1012  bDeleteOnClose = true;
1013  } else
1014  pInputStream = &std::cin;
1015  }
1016 
1017  // if -c was specified then wrap the input stream into a gzip decompressor before
1018  // turning it into an object stream:
1019  CObjectIStream* pI = nullptr;
1020  if (args["c"]) {
1021  CZipStreamDecompressor* pDecompressor =
    // NOTE(review): only fOwnProcessor is passed below and bDeleteOnClose is not consulted
    // on this path, so a CNcbiIfstream allocated above for -i appears never to be deleted
    // when -c is used - confirm against the CCompressionIStream ownership flags.
1023  CCompressionIStream* pUnzipStream =
1024  new CCompressionIStream(*pInputStream, pDecompressor, CCompressionIStream::fOwnProcessor);
1025  pI = CObjectIStream::Open(serial, *pUnzipStream, eTakeOwnership);
1026  } else {
    // the object stream takes ownership only of a stream allocated in this function
1027  pI = CObjectIStream::Open(
1028  serial, *pInputStream, (bDeleteOnClose ? eTakeOwnership : eNoOwnership));
1029  }
1030 
1031  if (pI) {
1032  pI->UseMemoryPool();
1033  }
1034 
1035  return unique_ptr<CObjectIStream>{pI};
1036 }
1037 
1038 
1040 {
1041  CFlatFileConfig cfg;
1042  cfg.FromArguments(args);
1043  cfg.BasicCleanup(false);
1044 
1045  if (args["html"]) {
1046  CHTMLFormatterEx* html_fmt_ex = new CHTMLFormatterEx(context.m_Scope);
1047  html_fmt_ex->SetNcbiURLBase(NCBI_URL_BASE);
1048  CRef<IHTMLFormatter> html_fmt(html_fmt_ex);
1049  cfg.SetHTMLFormatter(html_fmt);
1050  }
1051 
1052 #if 0
1053  // temporarily disabled because they are never used
1054  CRef<TGenbankBlockCallback> genbank_callback( x_GetGenbankCallback(args) );
1055 
1056  if( args["benchmark-cancel-checking"] ) {
1057  m_pCanceledCallback.reset(x_CreateCancelBenchmarkCallback());
1058  }
1059 #endif
1060 
1061  {
1062  bool nuc = args["og"] || args["or"] || args["on"];
1063  bool prot = args["op"];
1064  if (nuc && prot) {
1065  cfg.SetViewAll();
1066  } else {
1067  if (nuc) {
1068  cfg.SetViewNuc();
1069  } else if (prot) {
1070  cfg.SetViewProt();
1071  }
1072  }
1073  }
1074 
1075  // CFlatFileConfig cfg(
1076  // format, mode, style, flags, view, gff_options, genbank_blocks,
1077  // genbank_callback.GetPointerOrNull(), m_pCanceledCallback.get(),
1078  // args["cleanup"]);
1079  context.m_FFGenerator.Reset(new CFlatFileGenerator(cfg));
1080 
1081  // ID-5865 : SNP annotations must be explicitly added to the annot selector
1082  if (! m_PSGMode && cfg.ShowSNPFeatures()) {
1083  cfg.SetHideSNPFeatures(false);
1084  context.m_FFGenerator->SetAnnotSelector().IncludeNamedAnnotAccession("SNP");
1085  }
1086 
1087  if (args["no-external"] || args["policy"].AsString() == "internal") {
1088  context.m_FFGenerator->SetAnnotSelector().SetExcludeExternal(true);
1089  }
1090  // else if (!m_Scope->GetKeepExternalAnnotsForEdit()) {
1091  // m_Scope->SetKeepExternalAnnotsForEdit();
1092  // }
1093  if (args["resolve-all"] || args["policy"].AsString() == "external") {
1094  context.m_FFGenerator->SetAnnotSelector().SetResolveAll();
1095  }
1096  if (args["depth"]) {
1097  context.m_FFGenerator->SetAnnotSelector().SetResolveDepth(args["depth"].AsInteger());
1098  }
1099  if (args["max_search_segments"]) {
1100  context.m_FFGenerator->SetAnnotSelector().SetMaxSearchSegments(args["max_search_segments"].AsInteger());
1101  }
1102  if (args["max_search_time"]) {
1103  context.m_FFGenerator->SetAnnotSelector().SetMaxSearchTime(float(args["max_search_time"].AsDouble()));
1104  }
1105 
1106 }
1107 
1109 {
1110  return args["from"] ? static_cast<TSeqPos>(args["from"].AsInteger() - 1) : CRange<TSeqPos>::GetWholeFrom();
1111 }
1112 
1113 
1115 {
1116  return args["to"] ? static_cast<TSeqPos>(args["to"].AsInteger() - 1) : CRange<TSeqPos>::GetWholeTo();
1117 }
1118 
1120 {
1121  return static_cast<ENa_strand>(args["strand"].AsInteger());
1122 }
1123 
1124 
1126 public:
1128  : m_scope(scope) { }
1129 
1131  const CSeq_loc& loc1,
1132  const CSeq_loc& loc2,
1134  {
1135  return sequence::Seq_loc_Add( loc1, loc2, flags, m_scope );
1136  }
1137 
1138 private:
1140 };
1141 
1143  const CArgs& args,
1144  CSeq_loc& loc) const
1145 {
1146  _ASSERT(entry);
1147 
1148  CBioseq_Handle h = x_DeduceTarget(entry);
1149  if (! h) {
1150  NCBI_THROW(CFlatException, eInvalidParam, "Cannot deduce target bioseq.");
1151  }
1152 
1153  if (args["location"]) {
1154  vector<string> location;
1155  const string& locn = args["location"].AsString();
1157  CRef<CSeq_loc> lc = GetSeqLocFromString(locn, h.GetSeqId(), &helper);
1158  if (lc) {
1159  loc.Assign(*lc);
1160  }
1161  return;
1162  }
1163 
1164  TSeqPos length = h.GetInst_Length();
1165  TSeqPos from = x_GetFrom(args);
1166  TSeqPos to = min(x_GetTo(args), length - 1);
1167  ENa_strand strand = eNa_strand_unknown;
1168  if (args["strand"]) {
1169  strand = x_GetStrand(args);
1170  }
1171 
1172  if (from == CRange<TSeqPos>::GetWholeFrom() && to == length - 1 && strand == eNa_strand_unknown) {
1173  // whole
1174  loc.SetWhole().Assign(*h.GetSeqId());
1175  } else {
1176  // interval
1177  loc.SetInt().SetId().Assign(*h.GetSeqId());
1178  loc.SetInt().SetFrom(from);
1179  loc.SetInt().SetTo(to);
1180  if (strand > 0) {
1181  loc.SetInt().SetStrand(strand);
1182  }
1183  }
1184 }
1185 
1186 
1187 // if the 'from' or 'to' flags are specified try to guess the bioseq.
1189 {
1190  if (entry.IsSeq()) {
1191  return entry.GetSeq();
1192  }
1193 
1194  _ASSERT(entry.IsSet());
1195  CBioseq_set_Handle bsst = entry.GetSet();
1196  if (! bsst.IsSetClass()) {
1197  NCBI_THROW(CFlatException, eInvalidParam, "Cannot deduce target bioseq.");
1198  }
1199  _ASSERT(bsst.IsSetClass());
1200  switch (bsst.GetClass()) {
1202  // return the nucleotide
1203  for (CSeq_entry_CI it(entry); it; ++it) {
1204  if (it->IsSeq()) {
1205  CBioseq_Handle h = it->GetSeq();
1206  if (h && CSeq_inst::IsNa(h.GetInst_Mol())) {
1207  return h;
1208  }
1209  }
1210  }
1211  break;
1213  // return the genomic
1214  for (CSeq_entry_CI it(bsst); it; ++it) {
1215  if (it->IsSeq() &&
1216  it->GetSeq().GetInst_Mol() == CSeq_inst::eMol_dna) {
1217  return it->GetSeq();
1218  }
1219  }
1220  break;
1222  // return the segmented bioseq
1223  for (CSeq_entry_CI it(bsst); it; ++it) {
1224  if (it->IsSeq()) {
1225  return it->GetSeq();
1226  }
1227  }
1228  break;
1230  CBioseq_CI bi(bsst, CSeq_inst::eMol_na);
1231  if (bi) {
1232  return *bi;
1233  }
1234  } break;
1235  default:
1236  break;
1237  }
1238  NCBI_THROW(CFlatException, eInvalidParam, "Cannot deduce target bioseq.");
1239 }
1240 
1242 {
1243  int rc = 0;
1244 
1245  // SNP annotations can be available only for nucleotide human RefSeq records
1246  if (bsh.GetInst_Mol() == CSeq_inst::eMol_aa ||
1247  sequence::GetTaxId(bsh) != TAX_ID_CONST(9606))
1248  return 0;
1249 
1250  // Also skip large scaffolds and chromosomes
1251  CConstRef<CSeq_id> accid =
1253 
1254  bool skip = (accid->Which() != CSeq_id::e_Other);
1255  if (! skip) {
1256  string acc;
1257  accid->GetLabel(&acc, CSeq_id::eContent);
1258  string acc_prefix = acc.substr(0, 2);
1259  if (acc_prefix == "NC" || acc_prefix == "AC" ||
1260  acc_prefix == "NT" || acc_prefix == "NW") {
1261  skip = true;
1262  }
1263  }
1264  if (skip)
1265  return 0;
1266 
1267  // If GenBank loader is connecting to PubSeqOS, it's sufficient to add the 'SNP'
1268  // named annot type to the scope.
1269  // Otherwise (in PSG mode), use a separate SNP data loader. For that to work,
1270  // it is necessary to find the actual NA accession corresponding to this record's
1271  // SNP annotation and add it to the SAnnotSelector used by the flatfile generator.
1272 #ifdef USE_SNPLOADER
1273  TGi gi = FindGi(bsh.GetBioseqCore()->GetId());
1274  if (gi > ZERO_GI) {
1276  request.set_gi(GI_TO(::google::protobuf::uint64, gi));
1278 
1280  auto snp_status = m_SNPTrackStub->ForSeqId(&context, request, &reply);
1281  if (snp_status.ok()) {
1282  string na_acc = reply.na_track_acc_with_filter();
1283  if (! na_acc.empty())
1285  }
1286  }
1287 #endif
1288 
1289  return rc;
1290 }
1291 
1293 {
1294  const CArgs& args = GetArgs();
1295  if (args["from"] || args["to"] || args["strand"] || args["location"]) {
1296  CSeq_loc loc;
1297  x_GetLocation(seh, args, loc);
1298  auto* flatfile_os = context.m_streams[eFlatFileCodes::all].get();
1299  context.m_FFGenerator->Generate(loc, seh.GetScope(), *flatfile_os, { flatfile_os });
1300  } else {
1301  auto* all = context.m_streams[eFlatFileCodes::all].get();
1302  auto* nuc = context.m_streams[eFlatFileCodes::nuc].get();
1303  auto* gen = context.m_streams[eFlatFileCodes::gen].get();
1304  auto* rna = context.m_streams[eFlatFileCodes::rna].get();
1305  auto* prot = context.m_streams[eFlatFileCodes::prot].get();
1306  auto* unk = context.m_streams[eFlatFileCodes::unk].get();
1307  context.m_FFGenerator->Generate(seh, *all, { all, nuc, gen, rna, prot, unk });
1308  }
1309 }
1310 
1312 {
1313  context.m_streams.Reset();
1314 
1315  if (context.m_FFGenerator.NotEmpty()) {
1316  context.m_FFGenerator->SetFeatTree(nullptr);
1317  }
1318 
1319  if (context.m_Scope.NotEmpty()) {
1320  context.m_Scope->ResetDataAndHistory();
1321  }
1322 }
1323 
1325 {
1326  if (context.m_Scope.Empty()) {
1327  context.m_Scope.Reset(new CScope(*m_Objmgr));
1328  context.m_Scope->AddDefaults();
1329 
1330 #ifdef USE_SNPLOADER
1331  if (m_SNPDataLoader) {
1332  context.m_Scope->AddDataLoader(m_SNPDataLoader->GetLoaderNameFromArgs());
1333  }
1334 #endif
1335 #ifdef USE_CDDLOADER
1336  if (m_CDDDataLoader) {
1337  context.m_Scope->AddDataLoader(m_CDDDataLoader->GetLoaderNameFromArgs());
1338  }
1339 #endif
1340  }
1341 
1342  // create the flat-file generator
1343  if (context.m_FFGenerator.Empty())
1345 
1346 
1347  context.m_streams = m_writers.MakeNewFileset();
1348 }
1349 
1350 bool CAsn2FlatApp::WrapINSDSet(bool unwrap)
1351 {
1352  auto& args = GetArgs();
1353  if (args["o"] && args["format"] && args["format"].AsString() == "insdseq") {
1354  auto streams = m_writers.MakeNewFileset();
1355  auto os = streams[eFlatFileCodes::all].get();
1356  if (os)
1357  *os << (unwrap ? "</INSDSet>" : "<INSDSet>") << endl;
1358  return true;
1359  }
1360  return false;
1361 }
1362 
1363 
1365 
1367 
1368 
1369 /////////////////////////////////////////////////////////////////////////////
1370 //
1371 // Main
1372 
1373 int main(int argc, const char** argv)
1374 {
1375  // this code converts single argument into multiple, just to simplify testing
1376  list<string> split_args;
1377  vector<const char*> new_argv;
1378 
1379  if (argc==2 && argv && argv[1] && strchr(argv[1], ' '))
1380  {
1381  NStr::Split(argv[1], " ", split_args);
1382 
1383  auto it = split_args.begin();
1384  while (it != split_args.end())
1385  {
1386  auto next = it; ++next;
1387  if (next != split_args.end() &&
1388  ((it->front() == '"' && it->back() != '"') ||
1389  (it->front() == '\'' && it->back() != '\'')))
1390  {
1391  it->append(" "); it->append(*next);
1392  next = split_args.erase(next);
1393  } else it = next;
1394  }
1395  for (auto& rec: split_args)
1396  {
1397  if (rec.front()=='\'' && rec.back()=='\'')
1398  rec=rec.substr(1, rec.length()-2);
1399  }
1400  argc = 1 + int(split_args.size());
1401  new_argv.reserve(argc);
1402  new_argv.push_back(argv[0]);
1403  for (const string& s : split_args)
1404  {
1405  new_argv.push_back(s.c_str());
1406  std::cerr << s.c_str() << " ";
1407  }
1408  std::cerr << "\n";
1409 
1410 
1411  argv = new_argv.data();
1412  }
1413 
1415  return CAsn2FlatApp().AppMain(argc, argv);
1416 }
User-defined methods of the data storage class.
static const CDataLoadersUtil::TLoaders default_loaders
Definition: annotwriter.cpp:76
USING_SCOPE(objects)
int main(int argc, const char **argv)
Definition: asn2flat.cpp:1373
USING_NCBI_SCOPE
Definition: asn2flat.cpp:1366
#define NCBI_URL_BASE
Definition: asn2flat.cpp:107
eFlatFileCodes
Definition: asn2flat.cpp:119
uint64_t uint64
Definition: city.h:71
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
void Init() override
Initialize the application.
Definition: asn2flat.cpp:259
void x_GetLocation(const CSeq_entry_Handle &entry, const CArgs &args, CSeq_loc &loc) const
Definition: asn2flat.cpp:1142
static void x_OneShotMethod(_TMethod method, const CAsn2FlatApp *app, CAsn2FlatApp::TThreadStatePool::TUniqPointer thread_state, TArgs ... args)
Definition: asn2flat.cpp:572
int x_GenerateTraditionally(unique_ptr< CObjectIStream > is, TFFContext &context, const CArgs &args) const
Definition: asn2flat.cpp:691
bool m_do_cleanup
Definition: asn2flat.cpp:209
bool HandleSeqSubmit(TFFContext &context, CObjectIStream &is) const
Definition: asn2flat.cpp:837
TResourcePool< TFFContext > TThreadStatePool
Definition: asn2flat.cpp:158
bool m_HugeFileMode
Definition: asn2flat.cpp:213
TThreadStatePool m_state_pool
Definition: asn2flat.cpp:197
void x_FFGenerate(CSeq_entry_Handle seh, TFFContext &context) const
Definition: asn2flat.cpp:1292
int x_AddSNPAnnots(CBioseq_Handle &bsh, TFFContext &context) const
Definition: asn2flat.cpp:1241
bool HandleSeqId(TFFContext &context, const edit::CHugeAsnReader *reader, CConstRef< CSeq_id > seqid) const
Definition: asn2flat.cpp:617
TSeqPos x_GetTo(const CArgs &args) const
Definition: asn2flat.cpp:1114
unique_ptr< CObjectIStream > x_OpenIStream(const CArgs &args) const
Definition: asn2flat.cpp:983
void HandleTextId(TFFContext &context, const string &id) const
Definition: asn2flat.cpp:846
edit::CHugeFileProcess m_huge_process
Definition: asn2flat.cpp:216
CSeq_entry_Handle ObtainSeqEntryFromBioseq(TFFContext &context, CObjectIStream &is, bool report) const
Definition: asn2flat.cpp:934
string m_AccessionFilter
Definition: asn2flat.cpp:214
void x_CreateFlatFileGenerator(TFFContext &context, const CArgs &args) const
Definition: asn2flat.cpp:1039
std::atomic< bool > m_stopit
Definition: asn2flat.cpp:215
unique_ptr< ICanceled > m_pCanceledCallback
Definition: asn2flat.cpp:208
CRef< CCDDDataLoader > m_CDDDataLoader
Definition: asn2flat.cpp:223
void x_ResetContext(TFFContext &context)
Definition: asn2flat.cpp:1311
TSeqPos x_GetFrom(const CArgs &args) const
Definition: asn2flat.cpp:1108
int x_GenerateHugeMode()
Definition: asn2flat.cpp:657
CBioseq_Handle x_DeduceTarget(const CSeq_entry_Handle &entry) const
Definition: asn2flat.cpp:1188
bool m_OnlyNucs
Definition: asn2flat.cpp:204
bool HandleSeqEntryHandle(TFFContext &context, CSeq_entry_Handle seh) const
Definition: asn2flat.cpp:876
bool m_PSGMode
Definition: asn2flat.cpp:212
std::atomic< bool > m_Exception
Definition: asn2flat.cpp:210
bool WrapINSDSet(bool unwrap)
Definition: asn2flat.cpp:1350
ENa_strand x_GetStrand(const CArgs &args) const
Definition: asn2flat.cpp:1119
bool m_FetchFail
Definition: asn2flat.cpp:211
int Run() override
Run the application.
Definition: asn2flat.cpp:360
void x_InitNewContext(TFFContext &context)
Definition: asn2flat.cpp:1324
CSeq_entry_Handle ObtainSeqEntryFromBioseqSet(TFFContext &context, CObjectIStream &is, bool report) const
Definition: asn2flat.cpp:949
bool x_OneShot(_TMethod method, TArgs &&... args)
Definition: asn2flat.cpp:584
CSeq_entry_Handle ObtainSeqEntryFromSeqEntry(TFFContext &context, CObjectIStream &is, bool report) const
Definition: asn2flat.cpp:917
CFFMultiSourceFileSet m_writers
Definition: asn2flat.cpp:207
bool SetFlatfileOstream(eFlatFileCodes _code, const string &name)
Definition: asn2flat.cpp:349
CFlatFileConfig::CGenbankBlockCallback TGenbankBlockCallback
Definition: asn2flat.cpp:156
decltype(TFFContext::m_streams) fileset_type
Definition: asn2flat.cpp:149
int x_GenerateBatchMode(unique_ptr< CObjectIStream > is)
Definition: asn2flat.cpp:637
CRef< CObjectManager > m_Objmgr
Definition: asn2flat.cpp:196
bool HandleSeqEntry(TFFContext &context, CRef< CSeq_entry > se) const
Definition: asn2flat.cpp:963
bool m_OnlyProts
Definition: asn2flat.cpp:205
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_EditHandle –.
CBioseq_Handle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
static string GetLoaderNameFromArgs(void)
Definition: cdd_loader.cpp:177
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: cdd_loader.cpp:134
static void SetupObjectManager(const CArgs &args, objects::CObjectManager &obj_mgr, TLoaders loaders=fDefault)
Set up the standard object manager data loaders according to the arguments provided above.
static void AddArgumentDescriptions(CArgDescriptions &arg_desc, TLoaders loaders=fDefault)
Add a standard set of arguments used to configure the object manager.
static void AddArgumentDescriptions(CArgDescriptions &args)
void SetHTMLFormatter(CRef< IHTMLFormatter > html_fmt)
bool ShowSNPFeatures(void) const
void FromArguments(const CArgs &args)
CFlatFileConfig & SetHideSNPFeatures(bool val=true)
bool BasicCleanup(void) const
SAnnotSelector & SetAnnotSelector(void)
CGBReleaseFile is a utility class to ease the processing of Genbank release files one Seq-entry at a ...
CGRPCClientContext – client context for NCBI gRPC services.
CGetSeqLocFromStringHelper_ReadLocFromText(CScope *scope)
Definition: asn2flat.cpp:1127
virtual CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags)
Definition: asn2flat.cpp:1130
void SetNcbiURLBase(const string &path)
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CObjectIStream –.
Definition: objistr.hpp:93
CRange –.
Definition: Range.hpp:68
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const SLoaderParams &params, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: snploader.cpp:144
CScope –.
Definition: scope.hpp:92
CSeq_entry_CI –.
CSeq_entry_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
bool IsNa(void) const
Definition: Seq_inst.hpp:106
CTime –.
Definition: ncbitime.hpp:296
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
CVersionInfo –.
CAsn2FlatApp * m_app
Definition: asn2flat.cpp:244
CWrapINSDSet(CAsn2FlatApp *app)
Definition: asn2flat.cpp:231
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
void SetDepth(size_t depth)
Definition: fileset.hpp:168
void SetUseMT(bool use_mt)
Definition: fileset.hpp:143
fileset_type MakeNewFileset()
Definition: fileset.hpp:186
void Open(enum_type _enum, const std::string &filename)
Definition: fileset.hpp:147
static std::unique_ptr< Stub > NewStub(const std::shared_ptr< ::grpc::ChannelInterface > &channel, const ::grpc::StubOptions &options=::grpc::StubOptions())
const std::string & na_track_acc_with_filter() const
Definition: dbsnp.pb.h:655
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static uch flags
CS_CONTEXT * ctx
Definition: t0006.c:12
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int lc
Definition: getdata.c:30
static const char location[]
Definition: config.c:97
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
const string & GetProgramDisplayName(void) const
Get the application's "display" name.
const CVersionAPI & GetFullVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1202
#define ZERO_GI
Definition: ncbimisc.hpp:1088
void SetVersion(const CVersionInfo &version)
Set the version number for the program.
Definition: ncbiapp.cpp:1168
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ fCheckFileHeader
Check (and skip) gzip file header on decompression stage.
Definition: zlib.hpp:104
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
TGi FindGi(const container &ids)
Return gi from id list if exists, return 0 otherwise.
Definition: Seq_id.hpp:1041
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2613
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
int TOpFlags
Definition: Seq_loc.hpp:336
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
void UseMemoryPool(void)
Definition: objistr.cpp:775
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
TTaxId GetTaxId(const CBioseq_Handle &handle)
return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
TLoader * GetLoader(void) const
Get pointer to the loader.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CDataLoader * FindDataLoader(const string &loader_name) const
Try to find a registered data loader by name.
static void SetDefaultKeepExternalAnnotsForEdit(bool keep=true)
Set new application-wide KeepExternalAnnotsForEdit flag.
Definition: scope_impl.cpp:153
TClass GetClass(void) const
TSet SelectSet(TClass set_class=CBioseq_set::eClass_not_set) const
Convert the empty Seq-entry to Bioseq-set.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
CConstRef< TObject > GetCompleteObject(void) const
TSet GetSet(void) const
TInst_Mol GetInst_Mol(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
Remove current seqset-entry from its location.
TSeq GetSeq(void) const
CConstRef< TObject > GetCompleteObject(void) const
TSeq SelectSeq(CBioseq &seq) const
Make the empty Seq-entry be in seq state with specified Bioseq object.
CSeq_entry_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
TInst_Length GetInst_Length(void) const
bool IsSetClass(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSet(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
bool IsSeq(void) const
SAnnotSelector & IncludeNamedAnnotAccession(const string &acc, int zoom_level=0)
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
virtual bool GetBool(const string &section, const string &name, bool default_value, TFlags flags=0, EErrAction err_action=eThrow) const
Get boolean value of specified parameter name.
Definition: ncbireg.cpp:391
virtual string GetString(const string &section, const string &name, const string &default_value, TFlags flags=0) const
Get the parameter string value.
Definition: ncbireg.cpp:321
@ eReturn
Return default value.
Definition: ncbireg.hpp:203
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NcbiCerr
Definition: ncbistre.hpp:544
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
Definition: ncbistr.cpp:3177
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
void CONNECT_Init(const IRWRegistry *reg=0, CRWLock *lock=0, TConnectInitFlags flag=eConnectInit_OwnNothing, FSSLSetup ssl=0)
Init [X]CONNECT library with the specified "reg" and "lock" (ownership for either or both can be deta...
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
static void TrapSignals(TSignalMask signals)
Sets interrupt signal handling.
@ eSignal_USR1
User defined signal 1.
Definition: ncbi_signal.hpp:79
const SBuildInfo & GetBuildInfo() const
Get build info (date and tag, if set)
Definition: version.cpp:705
CTime GetBuildTime(void) const
Converts 'date' parameter to CTime.
Definition: version.cpp:606
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_entry_.hpp:228
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_genbank
converted genbank
@ eClass_segset
segmented sequence + parts
@ e_not_set
No variant selected.
Definition: Seq_entry_.hpp:88
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TSub & GetSub(void) const
Get the Sub member data.
bool IsSetSub(void) const
Check if a value has been assigned to Sub data member.
bool IsEntrys(void) const
Check if variant Entrys is selected.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
const struct ncbi::grid::netcache::search::fields::EXPIRES expires
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Setup interrupt signal handling.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
CRef< CSeq_loc > GetSeqLocFromString(const string &text, const CSeq_id *id, CGetSeqLocFromStringHelper *helper)
Utility macros and typedefs for exploring NCBI objects from seq.asn.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
CFFMultiSourceFileSet::fileset_type m_streams
Definition: asn2flat.cpp:146
CRef< CScope > m_Scope
Definition: asn2flat.cpp:144
CRef< CFlatFileGenerator > m_FFGenerator
Definition: asn2flat.cpp:145
#define _ASSERT
static CS_CONTEXT * context
Definition: will_convert.c:21
ZLib Compression API.
Modified on Fri Sep 20 14:58:13 2024 by modify_doxy.py rev. 669887