NCBI C++ ToolKit
prime_cache.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: prime_cache.cpp 101298 2023-11-28 17:15:47Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  * Command-line application that populates an ASN cache from Seq-ids, FASTA, cSRA or Seq-entry input.
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36 #include <corelib/request_ctx.hpp>
37 #include <corelib/ncbi_signal.hpp>
38 
39 #include <util/static_map.hpp>
40 #include <util/stream_source.hpp>
41 #include <util/compress/stream.hpp>
42 #include <util/compress/zlib.hpp>
43 
44 #include <serial/serial.hpp>
45 #include <serial/objistr.hpp>
46 #include <serial/objostr.hpp>
47 
49 #include <objects/seq/Bioseq.hpp>
51 #include <objects/seq/Seq_inst.hpp>
52 #include <objects/seq/Seq_ext.hpp>
55 #include <objects/seq/Seqdesc.hpp>
56 #include <objects/seq/MolInfo.hpp>
62 
63 #ifdef HAVE_NCBI_VDB
65 #endif
66 
69 
71 #include <objmgr/bioseq_handle.hpp>
72 #include <objmgr/bioseq_ci.hpp>
73 #include <objmgr/scope.hpp>
74 #include <objmgr/util/sequence.hpp>
75 
81 
83 
86 
88 
// Name -> CMolInfo biomol value pairs, one per supported molecule type.
// NOTE(review): the declaration line of this table is not visible in this
// listing; presumably this initializes sm_BiomolTypes (TBiomolTypeMap), whose
// keys Init() offers as the allowed values of -molinfo -- confirm.
// The keys are listed in ascending order; presumably the static map type
// requires that -- TODO confirm before inserting new entries.
90  { "crna", CMolInfo::eBiomol_cRNA },
91  { "genomic", CMolInfo::eBiomol_genomic },
92  { "genomic-mrna", CMolInfo::eBiomol_genomic_mRNA },
93  { "mrna", CMolInfo::eBiomol_mRNA },
94  { "ncrna", CMolInfo::eBiomol_ncRNA },
95  { "other", CMolInfo::eBiomol_other },
96  { "other-genetic", CMolInfo::eBiomol_other_genetic },
97  { "peptide", CMolInfo::eBiomol_peptide },
98  { "pre-rna", CMolInfo::eBiomol_pre_RNA },
99  { "rrna", CMolInfo::eBiomol_rRNA },
100  { "scrna", CMolInfo::eBiomol_scRNA },
101  { "snorna", CMolInfo::eBiomol_snoRNA },
102  { "snrna", CMolInfo::eBiomol_snRNA },
103  { "tmrna", CMolInfo::eBiomol_tmRNA },
104  { "transcribed-rna", CMolInfo::eBiomol_transcribed_RNA },
105  { "trna", CMolInfo::eBiomol_tRNA }
106 };
107 
110 
111 
113 
// Name -> CBioSource genome value pairs.
// NOTE(review): the declaration line is not visible in this listing;
// presumably this initializes sm_GenomeTypes (TGenomeTypeMap), whose keys
// Init() offers as the allowed values of -biosource -- confirm.
115  { "chromosome", CBioSource::eGenome_chromosome },
116  { "genomic", CBioSource::eGenome_genomic },
117  { "mitochondrion", CBioSource::eGenome_mitochondrion },
118  { "plasmid", CBioSource::eGenome_plasmid },
119  { "plastid", CBioSource::eGenome_plastid }
120 };
121 
124 
125 
127 
// Name -> CSeq_inst mol value pairs.
// NOTE(review): the declaration line is not visible in this listing;
// presumably this initializes sm_InstMolTypes (TInstMolTypeMap), whose keys
// Init() offers as the allowed values of -inst-mol -- confirm.
129  { "aa", CSeq_inst::eMol_aa },
130  { "dna", CSeq_inst::eMol_dna },
131  { "na", CSeq_inst::eMol_na },
132  { "other", CSeq_inst::eMol_other },
133  { "rna", CSeq_inst::eMol_rna }
134 };
135 
138 
139 
140 
141 /////////////////////////////////////////////////////////////////////////////
142 // CPrimeCacheApplication::
143 
144 
// Body of the application class (CPrimeCacheApplication, per the section
// comment above). It declares the Init/Run/Exit entry points, one x_Process_*
// reader per input format, the shared indexing helper, and the state those
// share.
// NOTE(review): the class header line, the constructor signature and several
// member declarations are not visible in this listing.
146 {
147 public:
  // Constructor initializer list: indices are bound to their index kinds, the
  // genome defaults to unknown and the delta recursion limit to UINT_MAX.
149  : m_MainIndex(CAsnIndex::e_main)
150  , m_SeqIdIndex(CAsnIndex::e_seq_id)
151  , m_Genome (CBioSource::eGenome_unknown)
153  , m_MaxDeltaLevel(UINT_MAX)
154  {
155  }
156 
157 private:
158  virtual void Init(void);
159  virtual int Run(void);
160  virtual void Exit(void);
161 
  // Read one Seq-id per line from istr into ids (blank and '#' lines skipped).
162  void x_Read_Ids(CNcbiIstream& istr,
163  set<CSeq_id_Handle> &ids);
164 
  // Fetch, cache and index each id; delta_level is the current recursion
  // depth and count the number of entries cached so far (passed by value).
165  void x_Process_Ids(const set<CSeq_id_Handle> &ids,
166  CNcbiOstream& ostr_seqids,
167  unsigned delta_level,
168  size_t count);
169 
170 #ifdef HAVE_NCBI_VDB
  // Cache the short reads of every cSRA accession listed in istr.
171  void x_Process_SRA(CNcbiIstream& istr,
172  CNcbiOstream& ostr_seqids);
173 #endif
174 
  // Cache every sequence of a FASTA stream.
175  void x_Process_Fasta(CNcbiIstream& istr,
176  CNcbiOstream& ostr_seqids);
177 
  // Dispatch Seq-entry input to x_SplitAndCacheSeqEntry or x_CacheSeqEntry
  // depending on the split-sequences flag.
178  void x_Process_SeqEntry(CNcbiIstream& istr,
179  CNcbiOstream& ostr_seqids,
180  ESerialDataFormat serial_fmt,
181  set<CSeq_id_Handle> &delta_ids,
182  size_t &count);
183 
  // Index every Bioseq found in entry against an already-written blob.
  // NOTE(review): the remaining parameters are elided in this listing.
184  void x_ExtractAndIndex(const CSeq_entry& entry,
185  CAsnIndex::TTimestamp timestamp,
186  CAsnIndex::TChunkId chunk_id,
189 
  // Strip annotations/descriptors and drop Bioseqs whose mol is in
  // m_StripInstMol; returns false when nothing of the entry remains.
190  bool x_StripSeqEntry(CScope& scope, CSeq_entry& entry, set<CSeq_id_Handle>& trimmed_bioseqs);
191 
192  // x_CacheSeqEntry: cache group of sequences packaged together
193  // as a single blob.
194  void x_CacheSeqEntry(CNcbiIstream& istr,
195  CNcbiOstream& ostr_seqids,
196  ESerialDataFormat serial_fmt,
197  set<CSeq_id_Handle> &delta_ids,
198  size_t &count);
199 
200  // Split group of sequences packaged together
201  // and cache each sequence separately from the others.
203  CNcbiOstream& ostr_seqids,
204  ESerialDataFormat serial_fmt);
205 
207  set<CSeq_id_Handle>& delta_ids);
  // Replace an existing descriptor of a singleton type, otherwise append.
208  void x_UpsertDescriptor(list<CRef<CSeqdesc> >& descs, CRef<CSeqdesc> new_desc);
209 
  // Nested functor (CCacheBioseq, per the friend declaration below) that
  // caches one Bioseq per call.
  // NOTE(review): its header line and some members are elided here.
211  {
215  time_t timestamp_;
216  unsigned int count_;
217  public:
219  CNcbiOstream* ostr);
220  void operator () (CBioseq& bseq);
221  };
222  friend class CCacheBioseq;
223 
224 private: // data
  // Per-taxid organism data; the member functions use its biosource and
  // orgref fields (field declarations elided in this listing).
225  struct SOrgData {
228  };
229 
230  string m_CachePath;
241  list< CRef<CSeqdesc> > m_other_descs;
245  unsigned m_MaxDeltaLevel;
249 };
250 
// Adapter template: reads one object of type T from an object stream and
// passes it to a Consumer callable.
// NOTE(review): the class line and the read-method signature are elided in
// this listing; presumably this derives from a serial skip/read hook class
// and is the hook installed by x_SplitAndCacheSeqEntry -- confirm.
251 template <typename T, typename Consumer>
253 {
254 public:
  // The consumer is stored by value.
255  explicit CObjectEnum(Consumer consumer)
256  : m_Consumer(consumer)
257  {
258  }
259 
  // Deserialize a T using the supplied type info, then hand it to the consumer.
261  const CObjectTypeInfo& info)
262  {
263  T record;
264  istr.ReadObject(&record, info.GetTypeInfo());
265  m_Consumer(record);
266  }
267 
268 private:
272 
273  Consumer m_Consumer;
274 };
275 
276 
277 /////////////////////////////////////////////////////////////////////////////
278 // Init: describe all command-line arguments accepted by the application
279 
280 
// Registers every command-line argument of the application together with its
// allowed values and inter-argument dependencies, then passes the description
// object to SetupArgDescriptions().
// NOTE(review): the function signature and most argument-type lines
// (the CArgDescriptions::e... arguments) are not visible in this listing.
282 {
283  // Create command-line argument descriptions class
284  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
285 
286  // Specify USAGE context
287  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
288  "Seq-id-to-ASN-cache converter");
289 
  // Input selection: a single input (default "-") or a manifest of inputs.
290  arg_desc->AddDefaultKey("i", "InputFile",
291  "FASTA file to process",
293  "-");
294  arg_desc->AddOptionalKey("input-manifest", "Manifest",
295  "Manifest file listing FASTA files to process, "
296  "one per line",
298 
  // NOTE(review): the synopsis "Manifest" looks copied from input-manifest;
  // Run() reads this file as ASN.1 text (a Submit-block followed by
  // Seqdescs) -- confirm the intended wording.
299  arg_desc->AddOptionalKey("submit-block-template", "Manifest",
300  "Manifest file with template",
302 
303  arg_desc->AddDefaultKey("ifmt", "InputFormat",
304  "Format of input data",
306  "ids");
307 
  // "csra" is only accepted when the toolkit is built with VDB support.
308  arg_desc->SetConstraint("ifmt",
309  &(*new CArgAllow_Strings,
310  "ids", "fasta",
311 #ifdef HAVE_NCBI_VDB
312  "csra",
313 #endif
314  "asnb-seq-entry",
315  "asn-seq-entry"));
316 
  // Taxonomy metadata: one taxid for all sequences and/or per-sequence tables.
317  arg_desc->AddOptionalKey("taxid", "Taxid",
318  "Taxid of input FASTA sequences",
320 
321  arg_desc->AddOptionalKey("taxid-table", "TaxidTable",
322  "Table of taxids for individual sequences",
324  arg_desc->AddOptionalKey("taxid-table-manifest", "TaxidTableManifest",
325  "Manifest of taxid tables",
  // A single table and a manifest of tables are mutually exclusive.
327  arg_desc->SetDependency("taxid-table-manifest",
328  CArgDescriptions::eExcludes, "taxid-table");
329  arg_desc->AddDefaultKey("taxid-column", "TaxidColumn",
330  "column in taxid table with taxid",
332 
333  CLocalTaxon::AddArguments(*arg_desc);
334 
335  arg_desc->AddOptionalKey("molinfo", "Molinfo",
336  "Type of molecule that sequences represent",
338 
  // Allowed -molinfo values are exactly the keys of sm_BiomolTypes.
339  CArgAllow_Strings *molinfo_options = new CArgAllow_Strings;
340  ITERATE (TBiomolTypeMap, it, sm_BiomolTypes) {
341  molinfo_options->Allow(it->first);
342  }
343  arg_desc->SetConstraint("molinfo", molinfo_options);
344 
345  arg_desc->AddOptionalKey("biosource", "Biosource",
346  "genome source of sequences",
348 
  // biosource and submit-block-template cannot be combined with either form
  // of per-sequence taxid table.
349  arg_desc->SetDependency("biosource", CArgDescriptions::eExcludes,
350  "taxid-table");
351  arg_desc->SetDependency("biosource", CArgDescriptions::eExcludes,
352  "taxid-table-manifest");
353  arg_desc->SetDependency("submit-block-template", CArgDescriptions::eExcludes,
354  "taxid-table");
355  arg_desc->SetDependency("submit-block-template", CArgDescriptions::eExcludes,
356  "taxid-table-manifest");
357 
358 
  // Allowed -biosource values are exactly the keys of sm_GenomeTypes.
359  CArgAllow_Strings *biosource_options = new CArgAllow_Strings;
360  ITERATE (TGenomeTypeMap, it, sm_GenomeTypes) {
361  biosource_options->Allow(it->first);
362  }
363  arg_desc->SetConstraint("biosource", biosource_options);
364 
365  arg_desc->AddDefaultKey("inst-mol", "InstMol",
366  "Value for Seq.inst.mol",
368 
  // Allowed -inst-mol values are exactly the keys of sm_InstMolTypes.
369  CArgAllow_Strings *inst_mol_options = new CArgAllow_Strings;
370  ITERATE (TInstMolTypeMap, it, sm_InstMolTypes) {
371  inst_mol_options->Allow(it->first);
372  }
373  arg_desc->SetConstraint("inst-mol", inst_mol_options);
374 
  // Uniprot source metadata: single table or manifest, mutually exclusive.
375  arg_desc->AddOptionalKey("uniprot-source-table", "UniprotSourceTable",
376  "Table of uniprot source for individual sequences",
378  arg_desc->AddOptionalKey("uniprot-source-table-manifest",
379  "UniprotSourceTableManifest",
380  "Manifest of uniprot source tables",
382  arg_desc->SetDependency("uniprot-source-table-manifest",
383  CArgDescriptions::eExcludes, "uniprot-source-table");
384  arg_desc->AddDefaultKey("uniprot-source-column", "UniprotSourceColumn",
385  "column in uniprot source table with uniprot source",
387 
  // Outputs: the mandatory cache directory and the list of cached Seq-ids.
388  arg_desc->AddKey("cache", "OutputFile",
389  "Path to the cache directory",
391 
392  arg_desc->AddDefaultKey("oseq-ids", "OutputFile",
393  "Seq-ids that were added to the cache",
395  "-");
396 
397  arg_desc->AddDefaultKey("seq-id-type", "SeqIdType",
398  "If sequence has several seq-ids, which one to choose",
399  CArgDescriptions::eString, "canonical");
400  arg_desc->SetConstraint("seq-id-type",
401  &(*new CArgAllow_Strings,
402  "canonical",
403  "best"));
404 
  // FASTA-specific options.
405  arg_desc->AddFlag("no-title",
406  "For FASTA input, don't put a title on the Bioseq");
407 
408  arg_desc->AddOptionalKey("max-fasta-id", "MaxFastaIDLength",
409  "For FASTA input, maximum ID size, overriding "
410  "CSeq_id-defined limits",
412 
413  arg_desc->AddOptionalKey("id-prefix", "FASTAIdPrefix",
414  "For FASTA input with local ids, add this prefix to each id",
416 
417  arg_desc->AddOptionalKey("strip-annots-and-inst-mol", "StripAnnotsAndInstMol",
418  "Comma-separated list of molecule classes of instances - Seq.inst.mol - to strip.",
420 
421  arg_desc->AddFlag("split-sequences",
422  "Split group of sequences packaged together. Applicable to asn(b)-seq-entry format.");
423 
  // Delta extraction cannot be combined with split-sequences, and
  // delta-level is only meaningful together with extract-delta.
424  arg_desc->AddFlag("extract-delta",
425  "Extract and index delta-seq far-pointers");
426  arg_desc->SetDependency("split-sequences",
427  CArgDescriptions::eExcludes, "extract-delta");
428 
429  arg_desc->AddOptionalKey("delta-level", "RecursionLevel",
430  "Number of levels to descend when retrieving "
431  "items in delta sequences",
433  arg_desc->SetDependency("delta-level",
434  CArgDescriptions::eRequires, "extract-delta");
435 
436  arg_desc->AddFlag("resume", "Resume interrupted previous execution");
437 
438  arg_desc->AddFlag("non-exclusive",
439  "Can run this cache process in parallel with other "
440  "tasks; use this if writing to a dedicated cache rather "
441  "than the build's standard cache. Ignored by "
442  "application, but provides information to action node");
443 
444  // Setup arg.descriptions for this application
445  arg_desc->SetCurrentGroup("Default application arguments");
  // release() hands the raw pointer over; presumably SetupArgDescriptions
  // takes ownership of it -- confirm against the CNcbiApplication docs.
446  SetupArgDescriptions(arg_desc.release());
447 }
448 
449 
450 
// x_ExtractAndIndex: walks a Seq-entry and indexes every Bioseq in it against
// an already-written blob identified by (chunk_id, offset, size).
// NOTE(review): the first signature line, the trailing parameters (used below
// as offset and size) and the loop header at listing line 458 are elided.
452  CAsnIndex::TTimestamp timestamp,
453  CAsnIndex::TChunkId chunk_id,
456 {
457  if (entry.IsSet()) {
  // Recurse into every member of the set; all members share the same blob
  // location.
459  entry.GetSet().GetSeq_set()) {
460  x_ExtractAndIndex(**iter, timestamp, chunk_id, offset, size);
461  }
462  } else if (entry.IsSeq()) {
463  const objects::CBioseq& bioseq = entry.GetSeq();
  // Main index entry: where the blob containing this Bioseq lives.
464  IndexABioseq( bioseq, m_MainIndex, timestamp, chunk_id, offset, size );
  // Seq-id index entry: the id list is appended to m_SeqIdChunk and indexed
  // by its start offset and written length, with chunk id 0.
465  Int8 seq_id_offset = m_SeqIdChunk.GetOffset();
466  m_SeqIdChunk.Write( bioseq.GetId() );
467  IndexABioseq( bioseq, m_SeqIdIndex, timestamp, 0,
468  seq_id_offset, m_SeqIdChunk.GetOffset() - seq_id_offset );
  // Remember every id of this Bioseq; x_Process_Ids consults m_CachedIds to
  // skip ids that are already cached.
469  ITERATE (CBioseq::TId, id_it, bioseq.GetId()) {
470  m_CachedIds.insert(CSeq_id_Handle::GetHandle(**id_it));
471  }
472  }
473 }
474 
// x_UpsertDescriptor: adds new_desc to descs. If descs already holds
// descriptors of the same choice and that choice is one of the singleton
// types listed in the switch, each such descriptor is overwritten with a copy
// of new_desc; otherwise new_desc is appended.
// NOTE(review): the signature line and two case labels (listing lines
// 490-491) are elided in this listing.
476 {
477  bool updated = false;
478  // some descriptors must be singletons some not
479  for(auto& orig_desc: descs) {
  // Only descriptors of the same choice as new_desc are candidates.
480  if(new_desc->Which() != orig_desc->Which() ) continue;
481  switch ( orig_desc->Which() ) {
482  case CSeqdesc::e_Source:
483  case CSeqdesc::e_Molinfo:
484  case CSeqdesc::e_Name:
485  case CSeqdesc::e_Title:
486  case CSeqdesc::e_Pir:
487  case CSeqdesc::e_Genbank:
488  case CSeqdesc::e_Sp:
489  case CSeqdesc::e_Embl:
492  case CSeqdesc::e_Prf:
493  case CSeqdesc::e_Pdb:
  // Singleton type: replace contents in place rather than adding a duplicate.
494  updated = true;
495  orig_desc->Assign(*new_desc);
496  break;
497  default:
  // Any other type may occur several times; fall through to the append below.
498  break;
499  }
500  }
501  if(!updated) {
502  descs.push_back(new_desc);
503  }
504 }
505 
// x_Process_Fasta: reads FASTA records one at a time from istr, decorates each
// Bioseq with the configured mol type and descriptors, packs it into a cache
// blob appended to m_MainChunk, indexes it, and writes its id to ostr_seqids.
// NOTE(review): the first signature line, the reader-flag setup, the throw
// statement in the signal check, the idh definition and the chunk-opening
// line are elided in this listing.
507  CNcbiOstream& ostr_seqids)
508 {
510 
  // One timestamp is shared by every blob written in this call.
511  time_t timestamp = CTime(CTime::eCurrent).GetTimeT();
512  CStopWatch sw;
513  sw.Start();
514  size_t count = 0;
  // Reader flags depend on whether the configured mol is amino acid
  // (the flag assignments themselves are elided here).
521  switch(m_InstMol) {
522  case CSeq_inst::eMol_aa:
524  break;
525  default:
527  break;
528  }
529  CFastaReader reader(istr, flags);
530  if (GetArgs()["max-fasta-id"]) {
531  reader.SetMaxIDLength(GetArgs()["max-fasta-id"].AsInteger());
532  }
533  objects::CGPipeMessageListener messageListener;
534  while ( !reader.AtEOF() ) {
535 
  // On a trapped signal, write the "#Clean wrapup" marker first; Run()'s
  // resume logic only trusts ids that precede such a marker.
536  if (CSignal::IsSignaled()) {
537  ostr_seqids << "#Clean wrapup\n";
539  "trapped signal, exiting");
540  }
541 
542  CRef<CSeq_entry> entry = reader.ReadOneSeq(&messageListener);
543 
  // Optional prefix for local ids: string ids get the prefix inserted in
  // front, numeric ids are converted to prefixed string ids.
544  if (GetArgs()["id-prefix"]) {
545  NON_CONST_ITERATE (CBioseq::TId, id_it, entry->SetSeq().SetId()) {
546  if ((*id_it)->IsLocal()) {
547  if ((*id_it)->GetLocal().IsStr()) {
548  (*id_it)->SetLocal().SetStr()
549  .insert(0, GetArgs()["id-prefix"].AsString());
550  } else {
551  string str_id = NStr::NumericToString(
552  (*id_it)->GetLocal().GetId());
553  (*id_it)->SetLocal().SetStr(
554  GetArgs()["id-prefix"].AsString() + str_id);
555  }
556  }
557  }
558  }
559 
560  // extract canonical ID
562  if (m_PreviousExecutionIds.count(idh)) {
563  // This is a resumption of a previous task, and we already
564  // cached this sequence
565  continue;
566  }
567 
  // Taxid lookup order: per-sequence entry, then the default stored under
  // the empty key, then ZERO_TAX_ID. m_Orgs[taxid] default-inserts an entry
  // when the taxid has no organism data yet.
568  TTaxId taxid = m_SequenceTaxids.count(idh.AsString())
569  ? m_SequenceTaxids[idh.AsString()]
570  : (m_SequenceTaxids.count("")
571  ? m_SequenceTaxids[""] : ZERO_TAX_ID);
572  SOrgData &org_data = m_Orgs[taxid];
573 
574  entry->SetSeq().SetInst().SetMol(m_InstMol);
575  CSeq_descr::Tdata& descs = entry->SetSeq().SetDescr().Set();
576  bool molinfo_found=false;
577  bool source_found=false;
  // Descriptors already present on the record are kept; only fields they
  // leave unset are filled in from the configured defaults.
578  for(CRef<CSeqdesc>& desc: descs) {
579  switch ( desc->Which() ) {
580  case CSeqdesc::e_Molinfo:
581  molinfo_found=true;
582  if (m_MolInfo) {
583  if( ! desc->GetMolinfo().IsSetCompleteness()
585  desc->SetMolinfo().SetCompleteness( m_MolInfo->GetMolinfo().GetCompleteness() );
586  }
587  if( ! desc->GetMolinfo().IsSetBiomol()
588  && m_MolInfo->GetMolinfo().IsSetBiomol() ) {
589  desc->SetMolinfo().SetBiomol( m_MolInfo->GetMolinfo().GetBiomol() );
590  }
591  }
592  break;
593  case CSeqdesc::e_Source:
594  source_found=true;
596  (
597  ! desc->GetSource().IsSetGenome() || desc->GetSource().GetGenome() == CBioSource::eGenome_unknown
598  )
599  ) {
600  desc->SetSource().SetGenome(m_Genome );
601  }
602  if(org_data.orgref &&
603  (
604  ! desc->GetSource().IsSetOrg() || ! desc->GetSource().GetOrg().IsSetOrgname()
605  )
606  ) {
607  desc->SetSource().SetOrg().Assign(*org_data.orgref);
608  }
609  break;
610  default:
611  break;
612  }
613  }
  // No Molinfo/Source on the record: attach the configured descriptor
  // object itself (the same CRef is pushed for every record).
614  if(!molinfo_found && m_MolInfo) {
615  descs.push_back(m_MolInfo);
616  }
617  if(!source_found && org_data.biosource) {
618  descs.push_back(org_data.biosource);
619  }
620 
  // Remaining template descriptors are merged via x_UpsertDescriptor
  // (loop header elided).
621  if (m_other_descs.size()>0) {
623  x_UpsertDescriptor(descs, *desc);
624  }
625  }
626 
  // Only the first Title descriptor is removed; the loop stops right after
  // the erase, so the invalidated iterator is not used again.
627  if (GetArgs()["no-title"]) {
628  NON_CONST_ITERATE (list<CRef<CSeqdesc> >, desc, descs) {
629  if ((*desc)->IsTitle()) {
630  descs.erase(desc);
631  break;
632  }
633  }
634  }
635 
636  if (m_SequenceUniprotSources.count(idh.AsString())) {
637  CRef<CSeqdesc> uniprot_source_comment(new CSeqdesc);
638  uniprot_source_comment->SetComment("Uniprot Source: "
640  descs.push_back(uniprot_source_comment);
641  }
642 
  // Do not store an empty descriptor set.
643  if (entry->IsSetDescr() && entry->GetDescr().Get().empty()) {
644  entry->SetSeq().ResetDescr();
645  }
646  CCache_blob blob;
647  blob.SetTimestamp(timestamp);
648  blob.Pack(*entry);
649 
  // The blob's location is its start offset plus the number of bytes the
  // write advanced the chunk by.
651  size_t offset = m_MainChunk.GetOffset();
652  m_MainChunk.Write(blob);
653  size_t size = m_MainChunk.GetOffset() - offset;
654  Uint4 chunk_id = m_MainChunk.GetChunkSerialNum();
655 
656  entry->Parentize();
657  x_ExtractAndIndex(*entry, timestamp, chunk_id, offset, size);
658 
  // endl flushes after every id; presumably intentional so the id list on
  // disk stays current for a later resume -- TODO confirm.
659  ostr_seqids << idh << endl;
660 
661  ++count;
  // Progress messages are posted at Error severity.
662  if (count % 100000 == 0) {
663  LOG_POST(Error << " processed " << count << " entries...");
664  }
665  }
666 
667  LOG_POST(Error << "done, dumped " << count << " items");
668 
669 }
670 
671 #ifdef HAVE_NCBI_VDB
// x_Process_SRA: reads one cSRA accession per line from istr and caches every
// short read of each run as its own blob, writing each read's id to
// ostr_seqids and a "#Completed run <acc>" marker after each run.
// NOTE(review): the line preprocessing at listing line 682, the idh
// definition, the descriptor loop header, the throw statement and the
// chunk-opening line are elided in this listing.
672 void CPrimeCacheApplication::x_Process_SRA(CNcbiIstream& istr,
673  CNcbiOstream& ostr_seqids)
674 {
675  time_t timestamp = CTime(CTime::eCurrent).GetTimeT();
676  CStopWatch sw;
677  sw.Start();
678  size_t count = 0;
679 
680  string acc;
681  while (NcbiGetlineEOL(istr, acc)) {
  // Skip blank lines, comment lines and runs that a previous execution
  // already completed.
683  if (acc.empty() || acc[0] == '#' || m_PreviousExecutionRuns.count(acc))
684  {
685  continue;
686  }
687 
  // A fresh VDB manager and database handle are created for every run.
688  CVDBMgr mgr;
689  CCSraDb sra_db(mgr, acc);
690  CCSraShortReadIterator iter(sra_db);
691 
692  for ( ; iter; ++iter) {
693  CRef<CBioseq> bs = iter.GetShortBioseq();
695  if (m_PreviousExecutionIds.count(idh)) {
696  // This is a resumption of a previous task, and we already
697  // cached this read
698  continue;
699  }
700  CRef<CSeq_entry> entry(new CSeq_entry);
701  entry->SetSeq(*bs);
702  entry->SetSeq().SetInst().SetMol(m_InstMol);
703 
704  if (m_MolInfo) {
705  entry->SetSeq().SetDescr().Set().push_back(m_MolInfo);
706  }
  // m_Orgs.begin() is dereferenced without an emptiness check; this relies
  // on Run() having created a catch-all organism entry when no taxid was
  // given -- NOTE(review): confirm that always happens before this call.
707  if (m_Orgs.begin()->second.biosource) {
708  entry->SetSeq().SetDescr().Set().push_back(
709  m_Orgs.begin()->second.biosource);
710  }
  // Unlike the FASTA path, template descriptors are appended directly
  // rather than merged through x_UpsertDescriptor.
711  if (m_other_descs.size()>0) {
713  entry->SetSeq().SetDescr().Set().push_back(*desc);
714  }
715  }
716 
  // Write the "#Clean wrapup" marker before the (elided) statement that
  // reports the trapped signal; Run()'s resume logic keys on this marker.
717  if (CSignal::IsSignaled()) {
718  ostr_seqids << "#Clean wrapup\n";
720  "trapped signal, exiting");
721  }
722 
723  CCache_blob blob;
724  blob.SetTimestamp(timestamp);
725  blob.Pack(*entry);
726 
728  size_t offset = m_MainChunk.GetOffset();
729  m_MainChunk.Write(blob);
730  size_t size = m_MainChunk.GetOffset() - offset;
731  Uint4 chunk_id = m_MainChunk.GetChunkSerialNum();
732 
733  entry->Parentize();
734  x_ExtractAndIndex(*entry, timestamp, chunk_id, offset, size);
735  ostr_seqids << idh << endl;
736 
  // NOTE(review): the two comment lines below describe the id extraction
  // that already happened above; nothing follows them here.
737  // extract canonical IDs
738  // note that we do this without the object manager, for performance
739 
740  ++count;
741  if (count % 100000 == 0) {
742  LOG_POST(Error << " processed " << count << " reads...");
743  }
744  }
  // Run marker: lets a resumed execution skip this accession entirely.
745  ostr_seqids << "#Completed run " << acc << endl;
746  }
747 
748  LOG_POST(Error << "done, dumped " << count << " items");
749 
750 }
751 #endif
752 
// x_Process_SeqEntry: dispatches Seq-entry input. With the split-sequences
// flag each Bioseq is cached separately (delta_ids and count are not used on
// that path); otherwise each Seq-entry is cached as a single blob.
// NOTE(review): the first signature line is elided in this listing.
754  CNcbiOstream& ostr_seqids,
755  ESerialDataFormat serial_fmt,
756  set<CSeq_id_Handle> &delta_ids,
757  size_t &count)
758 {
759  const CArgs& args = GetArgs();
760 
761  if ( args["split-sequences"] ) {
762  x_SplitAndCacheSeqEntry(istr, ostr_seqids, serial_fmt);
763  }
764  else {
765  x_CacheSeqEntry(istr, ostr_seqids, serial_fmt, delta_ids, count);
766  }
767 }
// x_SplitAndCacheSeqEntry: streams Seq-entries from istr using Skip() instead
// of fully reading them.
// NOTE(review): the first signature line, the hook installation at listing
// lines 775-776 and the throw statement are elided. Presumably a local hook
// wrapping cache_bioseq is installed so each Bioseq met while skipping is
// cached individually -- confirm.
769  CNcbiOstream& ostr_seqids,
770  ESerialDataFormat serial_fmt)
771 {
772  CPrimeCacheApplication::CCacheBioseq cache_bioseq(this, &ostr_seqids);
773  unique_ptr<CObjectIStream> is(CObjectIStream::Open(serial_fmt, istr));
774 
777 
778  while ( !is->EndOfData() ) {
779  if (CSignal::IsSignaled()) {
781  "trapped signal, exiting");
782  }
783  is->Skip(CType<CSeq_entry>());
784  }
  // Remove the stream-local hooks once the input is exhausted.
785  is->ResetLocalHooks();
786 }
787 
// x_CacheSeqEntry: reads Seq-entries from istr one at a time, optionally
// strips them, writes each as one blob to m_MainChunk, indexes all contained
// Bioseqs and reports their ids on ostr_seqids. count is incremented once per
// cached Seq-entry; delta_ids collects far-pointer ids when m_ExtractDelta is
// set.
// NOTE(review): the first signature line, the definition of om, the throw
// statement and the chunk-opening line are elided in this listing.
789  CNcbiOstream& ostr_seqids,
790  ESerialDataFormat serial_fmt,
791  set<CSeq_id_Handle> &delta_ids,
792  size_t &count)
793 {
795 
796  time_t timestamp = CTime(CTime::eCurrent).GetTimeT();
797  CStopWatch sw;
798  sw.Start();
799 
800  unique_ptr<CObjectIStream> is(CObjectIStream::Open(serial_fmt, istr));
801  while ( !is->EndOfData() ) {
802 
803  if (CSignal::IsSignaled()) {
805  "trapped signal, exiting");
806  }
807 
808  CRef<CSeq_entry> entry(new CSeq_entry);
809  *is >> *entry;
810 
811  // Private scope that uses no data loaders.
812  CScope scope(*om);
813  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
814 
815  // Trim seq-entry of annotations and instances of Bioseq of specified classes.
816  // Store ids of trimmed instances of type Bioseq in the @trimmed_bioseqs.
817  set<CSeq_id_Handle> trimmed_bioseqs;
818  if (!m_StripInstMol.empty()) {
  // NOTE(review): when an entry is stripped down to nothing this returns
  // from the whole function, so any Seq-entries still unread in the stream
  // are not processed and the final log line is skipped. If the intent is
  // to skip only this entry, continue would be needed -- confirm.
819  if (false == x_StripSeqEntry(scope, *entry, trimmed_bioseqs)) {
820  return;
821  }
822  }
823 
824  CCache_blob blob;
825  blob.SetTimestamp(timestamp);
826  blob.Pack(*entry);
827 
829  size_t offset = m_MainChunk.GetOffset();
830  m_MainChunk.Write(blob);
831  size_t size = m_MainChunk.GetOffset() - offset;
832  Uint4 chunk_id = m_MainChunk.GetChunkSerialNum();
833 
834  entry->Parentize();
835  x_ExtractAndIndex(*entry, timestamp, chunk_id, offset, size);
836 
837  // extract canonical IDs
838  // note that we do this in a private scope and use no data loaders
839  for (CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it) {
840  CSeq_id_Handle idh = sequence::GetId(*bioseq_it, m_id_type);
  // Ids recorded as trimmed by x_StripSeqEntry are not reported.
841  if ( trimmed_bioseqs.empty() || !trimmed_bioseqs.count(idh) ) {
842  ostr_seqids << idh << '\n';
843  if (m_ExtractDelta) {
844  x_ExtractDelta(*bioseq_it, delta_ids);
845  }
846  }
847  }
848 
849  ++count;
850  if (count % 100000 == 0) {
851  LOG_POST(Error << "Cache Seq-entry: processed " << count << " entries...");
852  }
853  }
854 
855  LOG_POST(Error << "Cache Seq-entry: done, cached " << count << " items");
856 }
857 
// x_Read_Ids: reads istr line by line, skipping empty lines and lines that
// start with '#'.
// NOTE(review): the first signature line, a per-line preprocessing statement
// (listing line 863) and the statement that stores the id (listing line 867)
// are elided; presumably each remaining line is converted to a CSeq_id_Handle
// and inserted into ids -- confirm.
859  set<CSeq_id_Handle> &ids)
860 {
861  string line;
862  while (NcbiGetlineEOL(istr, line)) {
864  if (line.empty() || line[0] == '#') {
865  continue;
866  }
868  }
869 }
870 
// x_Process_Ids: for every id not yet in m_CachedIds, resolves the sequence
// through a scope, optionally strips its Seq-entry, writes it as a blob to
// m_MainChunk and indexes it. Ids referenced by delta sequences are gathered
// and processed by a recursive call while the recursion depth stays below
// m_MaxDeltaLevel. count is passed by value and carried into the recursion.
// NOTE(review): the first signature line, the scope construction, the throw
// statement, the definitions of seh and entry, and the chunk-opening line are
// elided in this listing.
872  CNcbiOstream& ostr_seqids,
873  unsigned delta_level,
874  size_t count)
875 {
878  scope.AddDefaults();
879 
880  time_t timestamp = CTime(CTime::eCurrent).GetTimeT();
881  CStopWatch sw;
882  sw.Start();
883 
884  set<CSeq_id_Handle> delta_ids;
885  ITERATE (set<CSeq_id_Handle>, id_it, ids) {
886  CSeq_id_Handle idh = *id_it;
887  if (m_CachedIds.count(idh)) {
888  /// ID already cached
889  continue;
890  }
891  CBioseq_Handle bsh = scope.GetBioseqHandle(idh);
892  if ( !bsh ) {
894  "failed to retrieve sequence for id: " + idh.AsString());
895  }
896 
899  // Trim seq-entry of annotations and instances of Bioseq of specified classes.
900  // Store ids of trimmed instances of type Bioseq in the @trimmed_bioseqs.
901  set<CSeq_id_Handle> trimmed_bioseqs;
902  if (!m_StripInstMol.empty()) {
  // The entry is modified in place through const_cast.
  // NOTE(review): as in x_CacheSeqEntry, a fully stripped entry makes the
  // whole function return, so the remaining ids and the delta recursion
  // are skipped -- confirm this is intended rather than continue.
903  if (false == x_StripSeqEntry(scope, const_cast<CSeq_entry&>(*entry), trimmed_bioseqs)) {
904  return;
905  }
906  }
907 
908  CCache_blob blob;
909  blob.SetTimestamp(timestamp);
910  blob.Pack(*entry);
911 
913  size_t offset = m_MainChunk.GetOffset();
914  m_MainChunk.Write(blob);
915  size_t size = m_MainChunk.GetOffset() - offset;
916  Uint4 chunk_id = m_MainChunk.GetChunkSerialNum();
917 
918  x_ExtractAndIndex(*entry, timestamp, chunk_id, offset, size);
919 
920  // extract canonical IDs
921  for (CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it) {
922  idh = sequence::GetId(*bioseq_it, m_id_type);
923  if ( trimmed_bioseqs.empty() || !trimmed_bioseqs.count(idh) ) {
  // Only ids from the top-level request are reported; ids pulled in
  // through delta recursion are cached silently.
924  if (delta_level == 0) {
925  ostr_seqids << idh << '\n';
926  }
927  if (m_ExtractDelta) {
928  x_ExtractDelta(*bioseq_it, delta_ids);
929  }
930  }
931  }
932 
933  ++count;
934  if (count % 100000 == 0) {
935  LOG_POST(Error << " processed " << count << " entries...");
936  }
937  }
938 
  // The comparison uses the level before the post-increment; the recursive
  // call then receives the incremented level.
939  if (!delta_ids.empty() && delta_level++ < m_MaxDeltaLevel) {
940  x_Process_Ids(delta_ids, ostr_seqids, delta_level, count);
941  } else {
942  LOG_POST(Error << "done, cached " << count << " items");
943  }
944 }
945 
// x_StripSeqEntry: recursively strips a Seq-entry in place.
//  - Set: members for which the recursive call returns false are erased; the
//    set's own annotations and descriptors are reset. Returns false when no
//    member remains.
//  - Seq: when the Bioseq's mol is in m_StripInstMol its id is recorded in
//    trimmed_bioseqs and false is returned; otherwise its annotations and
//    descriptors are reset and true is returned.
//  - Anything else: returns true unchanged.
// NOTE(review): the signature line is elided in this listing; the body names
// the entry parameter seq_entry.
947 {
948  if ( seq_entry.IsSet() ) {
949  CBioseq_set& bset = seq_entry.SetSet();
950  list<CRef<CSeq_entry> >& coll = bset.SetSeq_set();
  // erase() returns the next valid iterator, so the loop advances only when
  // nothing was erased.
951  for ( list<CRef<CSeq_entry> >::iterator i = coll.begin(); i != coll.end(); ) {
952  if ( false == x_StripSeqEntry(scope, **i, trimmed_bioseqs) ) {
953  i = coll.erase(i);
954  }
955  else {
956  ++i;
957  }
958  }
959 
960  if ( 0 == bset.GetSeq_set().size() ) {
961  return false;
962  }
963 
964  bset.ResetAnnot();
965  bset.ResetDescr();
966  return true;
967  }
968  else if ( seq_entry.IsSeq() ) {
969  CBioseq const& bioseq = seq_entry.GetSeq();
970  if ( bioseq.CanGetInst() ) {
971  if (m_StripInstMol.count(bioseq.GetInst().GetMol()) > 0) {
  // The id is resolved through the scope so it is stored in the same id
  // flavour (m_id_type) that callers later compare against.
972  CBioseq_Handle bsh = scope.GetBioseqHandle(*bioseq.GetFirstId());
973  trimmed_bioseqs.insert(sequence::GetId(bsh, m_id_type));
974  return false;
975  }
976  }
977 
978  seq_entry.SetSeq().ResetAnnot();
979  seq_entry.SetSeq().ResetDescr();
980  return true;
981  }
982  else {
983  return true;
984  }
985 }
986 
// CCacheBioseq constructor: stores the owning application and the id output
// stream, takes the object manager instance, fixes the timestamp used for
// every blob this functor writes to the construction time, and zeroes the
// processed-entry counter.
// NOTE(review): the first signature line is elided in this listing.
988  CNcbiOstream* ostr)
989 : parent_(p),
990  ostr_seqids_(ostr),
991  om_(CObjectManager::GetInstance()),
992  timestamp_(CTime(CTime::eCurrent).GetTimeT()),
993  count_(0)
994 {
995 }
996 
// CCacheBioseq::operator(): wraps a single Bioseq in its own Seq-entry,
// optionally strips it, writes it as one blob to the parent's main chunk,
// indexes it and writes its id to the output stream.
// NOTE(review): the signature line is elided in this listing; the body names
// the parameter bioseq.
998 {
999  CRef<CSeq_entry> entry(new CSeq_entry);
1000  entry->SetSeq(bioseq);
1001  CScope scope(*om_);
1002  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
1003  // Trim seq-entry of annotations and instances of Bioseq of specified classes.
1004  // Store ids of trimmed instances of type Bioseq in the @trimmed_bioseqs.
1005  set<CSeq_id_Handle> trimmed_bioseqs;
1006  if (!parent_->m_StripInstMol.empty()) {
  // Returning here skips only this Bioseq; the functor is invoked again for
  // the next one.
1007  if (false == parent_->x_StripSeqEntry(scope, *entry, trimmed_bioseqs)) {
1008  return;
1009  }
1010  }
1011 
1012  CCache_blob blob;
1013  blob.SetTimestamp(timestamp_);
1014  blob.Pack(*entry);
1015 
  // The chunk is opened for writing before every blob.
1016  parent_->m_MainChunk.OpenForWrite(parent_->m_CachePath);
1017  size_t offset = parent_->m_MainChunk.GetOffset();
1018  parent_->m_MainChunk.Write(blob);
1019  size_t size = parent_->m_MainChunk.GetOffset() - offset;
1020  Uint4 chunk_id = parent_->m_MainChunk.GetChunkSerialNum();
1021 
1022  entry->Parentize();
1023  parent_->x_ExtractAndIndex(*entry, timestamp_, chunk_id, offset, size);
1024 
1025  // extract canonical IDs
1026  // note that we do this in a private scope and use no data loaders
1027  for (CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it) {
1028  CSeq_id_Handle idh = sequence::GetId(*bioseq_it, parent_->m_id_type);
1029  (*ostr_seqids_) << idh << '\n';
1030  }
1031 
1032  ++count_;
1033  if (count_ % 100000 == 0) {
1034  LOG_POST(Error << " processed " << count_ << " entries...");
1035  }
1036 }
1037 
1038 
1040 {
1041  // Get arguments
1042  const CArgs& args = GetArgs();
1043  {{
1046  }}
1047 
1052 
1053  string ifmt = args["ifmt"].AsString();
1054 
1055  if ((args["taxid"] || args["taxid-table"] || args["taxid-table-manifest"] ||
1056  args["molinfo"] || args["biosource"] || args["submit-block-template"])
1057  && ifmt != "fasta" && ifmt != "csra")
1058  {
1060  "metadata parameters only allowed with fasta or SRA input");
1061  }
1062  if ((args["uniprot-source-table"] || args["uniprot-source-table-manifest"])
1063  && ifmt != "fasta")
1064  {
1066  "uniprot source parameters only allowed with fasta input");
1067  }
1068  if (args["resume"] && ifmt != "fasta" && ifmt != "csra")
1069  {
1071  "Resume only supported with fasta or SRA input");
1072  }
1073 
1074  if (args["taxid"] || args["taxid-table"] || args["taxid-table-manifest"]) {
1075  if (args["taxid"]) {
1076  m_SequenceTaxids[""] = TAX_ID_FROM(TIntId, args["taxid"].AsIntId());
1077  }
1078  CInputStreamSource taxids_source;
1079  if (args["taxid-table"]) {
1080  taxids_source.InitStream(args["taxid-table"].AsInputFile());
1081  } else if (args["taxid-table-manifest"]) {
1082  taxids_source.InitManifest(args["taxid-table-manifest"].AsString());
1083  }
1084  unsigned col = args["taxid-column"].AsInteger() - 1;
1085  for (; taxids_source; ++taxids_source) {
1086  string line;
1087  while (NcbiGetlineEOL(*taxids_source, line)) {
1088  if (line.empty() || line[0] == '#') {
1089  continue;
1090  }
1091  vector<string> tokens;
1092  NStr::Split(line, "\t", tokens);
1094  = TAX_ID_FROM(int, NStr::StringToInt(tokens[col]));
1095  }
1096  }
1097 
1098  CLocalTaxon taxon(args);
1099  for (const pair<const string, TTaxId> &seq_taxid : m_SequenceTaxids)
1100  {
1101  TTaxId taxid = seq_taxid.second;
1102  if (m_Orgs.count(taxid)) {
1103  continue;
1104  }
1105 
1106  CConstRef<COrg_ref> ref = taxon.GetOrgRef(taxid);
1107  if ( !ref ) {
1109  "failed to find Org-ref for taxid " +
1110  NStr::NumericToString(taxid));
1111  }
1112 
1113  m_Orgs[taxid].biosource.Reset(new CSeqdesc);
1114  m_Orgs[taxid].biosource->SetSource().SetOrg().Assign(*ref);
1115  m_Orgs[taxid].orgref.Reset(new COrg_ref);
1116  m_Orgs[taxid].orgref->Assign(*ref);
1117  }
1118  }
1119 
1120  if (args["uniprot-source-table"] || args["uniprot-source-table-manifest"]) {
1121  unsigned col = args["uniprot-source-column"].AsInteger() - 1;
1122  CInputStreamSource uniprot_sources_source;
1123  if (args["uniprot-source-table"]) {
1124  uniprot_sources_source.InitStream(args["uniprot-source-table"].AsInputFile());
1125  } else if (args["uniprot-source-table-manifest"]) {
1126  uniprot_sources_source.InitManifest(args["uniprot-source-table-manifest"].AsString());
1127  }
1128  for (; uniprot_sources_source; ++uniprot_sources_source) {
1129  string line;
1130  while (NcbiGetlineEOL(*uniprot_sources_source, line)) {
1131  if (line.empty() || line[0] == '#') {
1132  continue;
1133  }
1134  vector<string> tokens;
1135  NStr::Split(line, "\t", tokens);
1137  . AsString()] = tokens[col];
1138  }
1139  }
1140  }
1141 
1142  if (m_Orgs.empty()) {
1143  /// No taxid specified in arguments; create catch-all organism entry
1145  }
1146 
1147  m_InstMol = sm_InstMolTypes.find(args["inst-mol"].AsString().c_str())->second;
1148 
1149  if (args["biosource"]) {
1150  if (!m_Orgs.begin()->second.biosource) {
1151  m_Orgs.begin()->second.biosource.Reset(new CSeqdesc);
1152  }
1153  m_Orgs.begin()->second.biosource->SetSource().SetGenome(
1154  sm_GenomeTypes.find(args["biosource"].AsString().c_str())->second);
1155  m_Genome = sm_GenomeTypes.find(args["biosource"].AsString().c_str())->second;
1156 
1157  }
1158 
1159  if (args["molinfo"]) {
1160  m_MolInfo.Reset(new CSeqdesc);
1162  sm_BiomolTypes.find(args["molinfo"].AsString().c_str())->second);
1163  }
1164  if (args["submit-block-template"]) {
1165  CRef<CSubmit_block> submit_block;
1166  CNcbiIstream& istr_manifest = args["submit-block-template"].AsInputFile();
1167  unique_ptr<CObjectIStream> is
1168  (CObjectIStream::Open(eSerial_AsnText, istr_manifest));
1169  while ( !is->EndOfData() ) {
1170  if ( !submit_block ) {
1171  submit_block.Reset(new CSubmit_block);
1172  *is >> *submit_block;
1173  }
1174  else {
1175  CRef<CSeqdesc> desc(new CSeqdesc);
1176  *is >> *desc;
1177 
1178  switch (desc->Which()) {
1179  case CSeqdesc::e_Source:
1180  {{
1181  SOrgData &org_data = m_Orgs.begin()->second;
1182  org_data.biosource = desc;
1183  if(!org_data.orgref) {
1184  org_data.orgref.Reset(new COrg_ref);
1185  }
1186  org_data.orgref->Assign(desc->GetSource().GetOrg() );
1187  break;
1188  }}
1189 
1190  case CSeqdesc::e_Molinfo:
1191  m_MolInfo = desc;
1192  break;
1193 
1194  default:
1195  m_other_descs.push_back(desc);
1196  break;
1197  }
1198  }
1199  }
1200  }
1201 
1203 
1204  {{
1205  m_CachePath = args["cache"].AsString();
1206  CDir dir(m_CachePath);
1207  if ( !dir.Exists() ) {
1208  dir.CreatePath();
1209  }
1211 
1212  m_MainIndex.SetCacheSize(1 * 1024 * 1024 * 1024);
1214  m_SeqIdIndex.SetCacheSize(1 * 1024 * 1024 * 1024);
1216  }}
1217 
1218  bool resuming_from_clean_wrapup = false;
1219  if (args["resume"]) {
1220  CFile output_file(args["oseq-ids"].AsString());
1221  if (!output_file.Exists()) {
1223  "Can't resums; " + output_file.GetPath() + " not found");
1224  }
1225  CNcbiIfstream istr(output_file.GetPath());
1226  string line;
1227  set<CSeq_id_Handle> previous_execution_ids;
1228  set<string> previous_execution_runs;
1229  while (NcbiGetlineEOL(istr, line)) {
1230  if (!line.empty() && line[0] != '#') {
1231  previous_execution_ids.insert(CSeq_id_Handle::GetHandle(line));
1232  } else if (NStr::StartsWith(line, "#Completed run ")) {
1233  previous_execution_runs.insert(line.substr(15));
1234  }
1235  if ((resuming_from_clean_wrapup = (line == "#Clean wrapup"))) {
1236  /// Execution ended in a clean wrapup, so the ids and runs that
1237  /// it completed have been successfully cached
1238  m_PreviousExecutionIds = previous_execution_ids;
1239  m_PreviousExecutionRuns = previous_execution_runs;
1240  }
1241  }
1242  }
1243 
1244  CNcbiOstream &ostr = args["oseq-ids"].AsOutputFile(
1245  resuming_from_clean_wrapup ? CArgValue::fAppend : 0);
1246  if (!resuming_from_clean_wrapup && !m_PreviousExecutionIds.empty()) {
1247  /// Had a clean wrapup at some point, but last execution ended
1248  /// uncleanly; need to re-write output file with those ids that were
1249  /// cached cleanly, then we'll redo the rest
1250  ostr << "#interrupted-execution\n";
1251  for (const string &run : m_PreviousExecutionRuns) {
1252  ostr << "#Completed run " << run << '\n';
1253  }
1254  for (const CSeq_id_Handle &id : m_PreviousExecutionIds) {
1255  if (id.Which() == CSeq_id::e_General &&
1256  id.GetSeqId()->GetGeneral().GetTag().IsStr())
1257  {
1258  string tag = id.GetSeqId()->GetGeneral().GetTag().GetStr();
1259  if (m_PreviousExecutionRuns.count(tag.substr(0, tag.find('.'))))
1260  {
1261  continue;
1262  }
1263  }
1264  ostr << id << '\n';
1265  }
1266  ostr << "#Clean wrapup\n";
1267  }
1268  ostr << "#" << args["seq-id-type"].AsString() << "-id" << endl;
1269  m_id_type = args["seq-id-type"].AsString() == "canonical"
1271 
1272  if (args["strip-annots-and-inst-mol"]) {
1273  list<string> mol_types;
1274  NStr::Split(args["strip-annots-and-inst-mol"].AsString(), string(","), mol_types, 0);
1275  vector<string> unknown_mols;
1276  for (list<string>::const_iterator mol = mol_types.cbegin(); mol != mol_types.cend(); ++mol) {
1277  string key = NStr::TruncateSpaces(*mol);
1278  TInstMolTypeMap::const_iterator record = sm_InstMolTypes.find(key.c_str());
1279  if ( record != sm_InstMolTypes.end() ) {
1280  m_StripInstMol.insert(record->second);
1281  }
1282  else {
1283  unknown_mols.push_back(key);
1284  }
1285  }
1286 
1287  if (!unknown_mols.empty()) {
1288  ostringstream oss;
1289  oss << "Unknown molecule classes: [";
1290  for (vector<string>::const_iterator i = unknown_mols.cbegin(); i != unknown_mols.end(); ++i ) {
1291  oss << *i;
1292  }
1293  oss << "]. Valid classes: [aa, dna, na, other, rna]";
1294 
1295  NCBI_THROW(CException, eUnknown, oss.str());
1296  }
1297  }
1298 
1299  m_ExtractDelta = args["extract-delta"];
1300  if (args["delta-level"]) {
1301  m_MaxDeltaLevel = args["delta-level"].AsInteger();
1302  }
1303 
1304  size_t count = 0;
1305  set<CSeq_id_Handle> ids;
1306  if (args["input-manifest"]) {
1307  CNcbiIstream& istr = args["input-manifest"].AsInputFile();
1308  string line;
1309  while (NcbiGetlineEOL(istr, line)) {
1311  if (line.empty() || line[0] == '#') {
1312  continue;
1313  }
1314 
1315  CNcbiIfstream is(line.c_str());
1316  if (ifmt == "ids") {
1317  x_Read_Ids(is, ids);
1318  }
1319  else if (ifmt == "fasta") {
1320  x_Process_Fasta(is, ostr);
1321  }
1322 #ifdef HAVE_NCBI_VDB
1323  else if (ifmt == "csra") {
1324  x_Process_SRA(is, ostr);
1325  }
1326 #endif
1327  else if (ifmt == "asn-seq-entry") {
1328  x_Process_SeqEntry(is, ostr, eSerial_AsnText, ids, count);
1329  }
1330  else if (ifmt == "asnb-seq-entry") {
1331  x_Process_SeqEntry(is, ostr, eSerial_AsnBinary, ids, count);
1332  }
1333  else {
1335  "unhandled input format");
1336  }
1337  }
1338  }
1339  else {
1340  CNcbiIstream& istr = args["i"].AsInputFile();
1341  if (ifmt == "ids") {
1342  x_Read_Ids(istr, ids);
1343  }
1344  else if (ifmt == "fasta") {
1345  x_Process_Fasta(istr, ostr);
1346  }
1347 #ifdef HAVE_NCBI_VDB
1348  else if (ifmt == "csra") {
1349  x_Process_SRA(istr, ostr);
1350  }
1351 #endif
1352  else if (ifmt == "asn-seq-entry") {
1353  x_Process_SeqEntry(istr, ostr, eSerial_AsnText, ids, count);
1354  }
1355  else if (ifmt == "asnb-seq-entry") {
1356  x_Process_SeqEntry(istr, ostr, eSerial_AsnBinary, ids, count);
1357  }
1358  else {
1360  "unhandled input format");
1361  }
1362  }
1363 
1364  if (!ids.empty()) {
1365  x_Process_Ids(ids, ostr, ifmt == "ids" ? 0 : 1, count);
1366  }
1367 
1370 
1371  return 0;
1372 }
1373 
1375  set<CSeq_id_Handle>& delta_ids)
1376 {
1377  /// Collect into delta_ids a handle for every Seq-id found inside the
1378  /// segments of bsh's delta extension; a sequence whose inst has no ext,
1379  /// or whose ext is not a delta, leaves delta_ids untouched.
1380  if (bsh.GetInst().IsSetExt() &&
1381  bsh.GetInst().GetExt().IsDelta()) {
1382  ITERATE (CBioseq::TInst::TExt::TDelta::Tdata, iter,
1383  bsh.GetInst().GetExt().GetDelta().Get()) {
1384  const CDelta_seq& seg = **iter;
1385  CTypeConstIterator<CSeq_id> id_iter(seg);  // visits each CSeq_id contained in this segment
1386  for ( ; id_iter; ++id_iter) {
1387  delta_ids.insert
1388  (CSeq_id_Handle::GetHandle(*id_iter));
1389  }
1390  }
1391  }
1392 }
1393 
1394 /////////////////////////////////////////////////////////////////////////////
1395 // Cleanup
1396 
1397 
1399 {
1400  SetDiagStream(0);  // cleanup on exit: set the diagnostic stream pointer to null
1401 }
1402 
1403 
1404 /////////////////////////////////////////////////////////////////////////////
1405 // MAIN
1406 
1407 
1408 int main(int argc, const char* argv[])
1409 {
1410  // Construct a temporary CPrimeCacheApplication and pass it the command line; the value AppMain() returns becomes the process exit code
1411  return CPrimeCacheApplication().AppMain(argc, argv);
1412 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
size_t IndexABioseq(const objects::CBioseq &bioseq, CAsnIndex &index, CAsnIndex::TTimestamp timestamp, CAsnIndex::TChunkId chunk_id, CAsnIndex::TOffset offset, CAsnIndex::TSize size)
Definition: asn_index.cpp:197
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
This is a simple BDB structure holding information about a given accession and its indexed location.
Definition: asn_index.hpp:53
Uint4 TChunkId
Definition: asn_index.hpp:59
Uint8 TOffset
Definition: asn_index.hpp:60
Uint4 TSize
Definition: asn_index.hpp:61
Uint4 TTimestamp
Definition: asn_index.hpp:58
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
void Pack(const CSeq_entry &entry)
Definition: Cache_blob.cpp:75
Int8 GetOffset()
Definition: chunk_file.hpp:72
void OpenForWrite(const std::string &root_path="")
Definition: chunk_file.cpp:54
void Write(const CCache_blob &cache_blob)
Definition: chunk_file.cpp:141
unsigned int GetChunkSerialNum() const
Definition: chunk_file.hpp:73
CDelta_seq –.
Definition: Delta_seq.hpp:66
Temporary object for holding extra message arguments.
Definition: ncbidiag.hpp:1828
CDir –.
Definition: ncbifile.hpp:1695
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
CFile –.
Definition: ncbifile.hpp:1604
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
class CInputStreamSource encapsulates details of how we supply applications with input data through s...
void InitManifest(const string &manifest)
Initialize from a manifest file.
void InitStream(CNcbiIstream &istr, const string &fname=kEmptyStr)
Initialize from a stream. No ownership is claimed by this class - lifetime management of the stream is...
static void AddArguments(CArgDescriptions &arg_desc)
Definition: local_taxon.cpp:52
CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)
CObjectEnum(const CObjectEnum &)
CObjectEnum & operator=(const CObjectEnum &)
void SkipObject(CObjectIStream &istr, const CObjectTypeInfo &info)
Consumer m_Consumer
CObjectEnum(Consumer consumer)
CObjectIStream –.
Definition: objistr.hpp:93
CObjectManager –.
CObjectTypeInfo –.
Definition: objectinfo.hpp:94
CPrimeCacheApplication * parent_
CCacheBioseq(CPrimeCacheApplication *p, CNcbiOstream *ostr)
void x_ExtractDelta(CBioseq_Handle bsh, set< CSeq_id_Handle > &delta_ids)
bool x_StripSeqEntry(CScope &scope, CSeq_entry &entry, set< CSeq_id_Handle > &trimmed_bioseqs)
void x_UpsertDescriptor(list< CRef< CSeqdesc > > &descs, CRef< CSeqdesc > new_desc)
void x_Process_Ids(const set< CSeq_id_Handle > &ids, CNcbiOstream &ostr_seqids, unsigned delta_level, size_t count)
void x_Read_Ids(CNcbiIstream &istr, set< CSeq_id_Handle > &ids)
map< string, TTaxId > m_SequenceTaxids
void x_CacheSeqEntry(CNcbiIstream &istr, CNcbiOstream &ostr_seqids, ESerialDataFormat serial_fmt, set< CSeq_id_Handle > &delta_ids, size_t &count)
virtual void Init(void)
Initialize the application.
void x_Process_SeqEntry(CNcbiIstream &istr, CNcbiOstream &ostr_seqids, ESerialDataFormat serial_fmt, set< CSeq_id_Handle > &delta_ids, size_t &count)
sequence::EGetIdType m_id_type
void x_ExtractAndIndex(const CSeq_entry &entry, CAsnIndex::TTimestamp timestamp, CAsnIndex::TChunkId chunk_id, CAsnIndex::TOffset offset, CAsnIndex::TSize size)
CSeq_inst::EMol m_InstMol
void x_SplitAndCacheSeqEntry(CNcbiIstream &istr, CNcbiOstream &ostr_seqids, ESerialDataFormat serial_fmt)
map< TTaxId, SOrgData > m_Orgs
CBioSource::EGenome m_Genome
set< CSeq_id_Handle > m_CachedIds
CSeqIdChunkFile m_SeqIdChunk
list< CRef< CSeqdesc > > m_other_descs
map< string, string > m_SequenceUniprotSources
CRef< CSeqdesc > m_MolInfo
set< string > m_PreviousExecutionRuns
set< CSeq_inst::EMol > m_StripInstMol
virtual int Run(void)
Run the application.
virtual void Exit(void)
Cleanup on application exit.
void x_Process_Fasta(CNcbiIstream &istr, CNcbiOstream &ostr_seqids)
set< CSeq_id_Handle > m_PreviousExecutionIds
CScope –.
Definition: scope.hpp:92
void OpenForWrite(const std::string &root_path="")
void Write(const objects::CBioseq::TId &seq_ids)
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
void Parentize(void)
Definition: Seq_entry.cpp:71
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
Skip hook for a standalone object.
Definition: objhook.hpp:205
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
CStopWatch –.
Definition: ncbitime.hpp:1938
CSubmit_block –.
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
static uch flags
#define T(s)
Definition: common.h:230
Operators to edit gaps in sequences.
#define false
Definition: bool.h:36
int offset
Definition: replacements.h:160
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
Definition: ncbiargs.cpp:4598
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
@ fAppend
Open file in append mode.
Definition: ncbiargs.hpp:265
void SetCacheSize(unsigned int cache_size)
Set Berkeley DB memory cache size for the file (default is 256K).
Definition: bdb_file.cpp:563
void Open(const string &filename, EOpenMode open_mode, bool support_dirty_read=false, unsigned rec_len=0)
Open file with specified access mode.
Definition: bdb_file.hpp:774
@ eReadWriteCreate
read-write, create if it doesn't exist
Definition: bdb_file.hpp:82
void PrintRequestStop(void)
Print request stop message (for request-driven applications)
Definition: ncbidiag.cpp:2778
CDiagContext & GetDiagContext(void)
Get diag context instance.
Definition: logging.cpp:818
void PrintRequestStart(const string &message)
Print request start message (for request-driven applications)
Definition: ncbidiag.cpp:2762
static CRequestContext & GetRequestContext(void)
Shortcut to CDiagContextThreadData::GetThreadData().GetRequestContext()
Definition: ncbidiag.cpp:1901
void SetRequestStatus(int status)
const CStopWatch & GetRequestTimer(void) const
Request execution timer.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool CreatePath(TCreateFlags flags=fCreate_Default) const
Create the directory path recursively possibly more than one at a time.
Definition: ncbifile.cpp:4106
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3910
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
Definition: fasta.cpp:312
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
Definition: fasta.hpp:141
void SetMaxIDLength(Uint4 max_len)
If this is set, an exception will be thrown if a Sequence ID exceeds the given length.
Definition: fasta.cpp:485
@ fRequireID
Reject deflines that lack IDs.
Definition: fasta.hpp:95
@ fAddMods
Parse defline mods and add to SeqEntry.
Definition: fasta.hpp:104
@ fNoUserObjs
Don't save raw deflines in User-objects.
Definition: fasta.hpp:106
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
Definition: fasta.hpp:114
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
void SetLocalSkipHook(CObjectIStream &stream, CSkipObjectHook *hook) const
Set local (for the specified stream) skip hook.
Definition: objectinfo.cpp:420
void ReadObject(const CObjectInfo &object)
Read child object.
Definition: objistr.cpp:1097
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
int EGetIdType
Definition: sequence.hpp:126
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
@ eGetId_Canonical
Definition: sequence.hpp:114
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TInst & GetInst(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
time_t GetTimeT(void) const
Get time in time_t format.
Definition: ncbitime.cpp:1395
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
static bool IsSignaled(TSignalMask signals=eSignal_Any)
Check that any of specified signals is received.
static void TrapSignals(TSignalMask signals)
Sets interrupt signal handling.
@ eSignal_QUIT
Quit.
Definition: ncbi_signal.hpp:72
@ eSignal_HUP
Hangup.
Definition: ncbi_signal.hpp:70
@ eSignal_TERM
Termination.
Definition: ncbi_signal.hpp:78
@ eSignal_INT
Interrupt.
Definition: ncbi_signal.hpp:71
void SetTimestamp(TTimestamp value)
Assign a value to Timestamp data member.
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
EGenome
biological context
Definition: BioSource_.hpp:97
@ e_General
for other databases
Definition: Seq_id_.hpp:105
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
void ResetAnnot(void)
Reset Annot data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void ResetDescr(void)
Reset Descr data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void ResetDescr(void)
Reset Descr data member.
Definition: Bioseq_.cpp:60
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
bool IsSetExt(void) const
extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
Definition: MolInfo_.hpp:453
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_genomic_mRNA
reported a mix of genomic and cdna sequence
Definition: MolInfo_.hpp:110
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Pir
PIR specific info.
Definition: Seqdesc_.hpp:120
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Prf
PRF specific information.
Definition: Seqdesc_.hpp:130
@ e_Sp
SWISSPROT specific info.
Definition: Seqdesc_.hpp:125
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Create_date
date entry first created/released
Definition: Seqdesc_.hpp:128
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_Name
a name for this sequence
Definition: Seqdesc_.hpp:114
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static CStopWatch sw
int i
static MDB_envinfo info
Definition: mdb_load.c:37
string GetBDBIndex()
Definition: file_names.hpp:44
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const char * tag
Setup interrupt signal handling.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
The Object manager core.
USING_SCOPE(objects)
SStaticPair< const char *, const CSeq_inst::EMol > TInstMolTypeKey
CStaticPairArrayMap< const char *, const CBioSource::EGenome, PCase > TGenomeTypeMap
CStaticPairArrayMap< const char *, const CSeq_inst::EMol, PCase > TInstMolTypeMap
SStaticPair< const char *, const CBioSource::EGenome > TGenomeTypeKey
DEFINE_STATIC_ARRAY_MAP(TBiomolTypeMap, sm_BiomolTypes, db_biomol_type_name_to_enum)
static const TBiomolTypeKey db_biomol_type_name_to_enum[]
Definition: prime_cache.cpp:89
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
Definition: prime_cache.cpp:84
CStaticPairArrayMap< const char *, const CMolInfo::EBiomol, PCase > TBiomolTypeMap
static const TInstMolTypeKey db_inst_mol_type_name_to_enum[]
SStaticPair< const char *, const CMolInfo::EBiomol > TBiomolTypeKey
Definition: prime_cache.cpp:87
static const TGenomeTypeKey db_genome_type_name_to_enum[]
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
Defines CRequestContext class for NCBI C++ diagnostic API.
CRef< objects::CObjectManager > om
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason for introducing this...
Definition: static_set.hpp:60
ZLib Compression API.
Modified on Wed Apr 24 14:20:09 2024 by modify_doxy.py rev. 669887