NCBI C++ ToolKit
sub_cache_create.cpp
1 /* $Id: sub_cache_create.cpp 101298 2023-11-28 17:15:47Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Cheinan Marks, Eyal Mozes
27  *
28  * File Description:
29  * Given a list of GIs, this program extracts the corresponding entries
30  * from an ASN.1 cache into another ASN.1 cache, the subcache.
31  *
32  */
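/*
 * Example invocation (editorial sketch, not part of the original source; the
 * cache paths and input file name are hypothetical, and the binary is assumed
 * to be built under this source file's name). The flags used here are the
 * ones registered in Init() below:
 *
 *   sub_cache_create -cache /path/to/main_cache -subcache /path/to/subcache \
 *       -i seq_ids.txt -extract-delta -extract-product -fetch-missing
 *
 * The input file lists one Seq-id per line; see s_ReadIdsFromFile() below for
 * the accepted format.
 */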
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbienv.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/request_ctx.hpp>
39 #include <corelib/ncbifile.hpp>
40 #include <corelib/ncbistr.hpp>
41 #include <corelib/ncbitime.hpp>
42 #include <corelib/ncbi_signal.hpp>
43 #include <corelib/ncbi_process.hpp>
44 
50 
51 #include <serial/iterator.hpp>
52 
53 #include <connect/ncbi_pipe.hpp>
54 
55 #include <dbapi/simple/sdbapi.hpp>
56 
57 #include <util/file_manifest.hpp>
58 #include <util/compress/zlib.hpp>
59 #include <util/compress/stream.hpp>
61 #include <objects/seq/Seq_inst.hpp>
62 #include <objects/seq/Seq_ext.hpp>
67 #include <objects/seq/Seqdesc.hpp>
68 #include <objects/seq/MolInfo.hpp>
71 
73 #include <objmgr/scope.hpp>
74 #include <objmgr/bioseq_handle.hpp>
76 #include <objmgr/feat_ci.hpp>
77 #include <objmgr/bioseq_ci.hpp>
78 #include <objmgr/util/sequence.hpp>
80 #ifdef HAVE_NCBI_VDB
82 #endif
83 
85 
86 #include <db/bdb/bdb_cursor.hpp>
87 
88 #include <string>
89 #include <vector>
90 #include <cstdlib>
91 #include <algorithm>
92 #include <functional>
93 
94 
97 
98 
99 ///////////////////////////////////////////////////////////////////////////////
100 ///
101 /// Indexing primitives
102 ///
103 
104 struct SSeqIdIndex {
108 
110  : m_Idh(idh)
111  {
113  }
114 
115  bool operator<(const SSeqIdIndex& k2) const
116  {
117  return m_SeqId < k2.m_SeqId ||
118  (m_SeqId == k2.m_SeqId && m_Version < k2.m_Version);
119  }
120 
121  bool operator<(const CAsnIndex& index) const
122  {
123  return m_SeqId < index.GetSeqId() ||
124  (m_SeqId == index.GetSeqId() && m_Version < index.GetVersion());
125  }
126 
127  bool operator==(const CAsnIndex& index) const
128  {
129  return m_SeqId == index.GetSeqId() && m_Version == index.GetVersion();
130  }
131 
132  bool operator>=(const CAsnIndex& index) const
133  {
134  return !(*this < index);
135  }
136 };
137 
139 {
140  SBlobLocator(CSeq_id_Handle idh, const CDir &root_cache)
141  : m_Idh(idh), m_CacheRoot(&root_cache), m_ChunkId(0), m_Offset(0), m_BlobSize(0)
142  {}
143 
144  SBlobLocator &operator=(const CAsnIndex &main_index)
145  {
146  m_ChunkId = main_index.GetChunkId();
147  m_Offset = main_index.GetOffset();
148  m_BlobSize = main_index.GetSize();
149  return *this;
150  }
151 
152  bool operator<(const SBlobLocator& k2) const
153  {
154  if (m_CacheRoot->GetPath() < k2.m_CacheRoot->GetPath()) { return true; }
155  if (k2.m_CacheRoot->GetPath() < m_CacheRoot->GetPath()) { return false; }
156  if (m_ChunkId < k2.m_ChunkId) { return true; }
157  if (k2.m_ChunkId < m_ChunkId) { return false; }
158 
159  return (m_Offset < k2.m_Offset);
160  }
161 
162  bool operator==(const SBlobLocator& k2) const
163  {
164  return m_CacheRoot->GetPath() == k2.m_CacheRoot->GetPath()
165  && m_ChunkId == k2.m_ChunkId && m_Offset == k2.m_Offset;
166  }
167 
169  const CDir * m_CacheRoot;
173 };
174 
176 {
178  : m_Gi(0)
179  , m_Timestamp(0)
180  , m_BlobSize(0)
181  {}
182 
184  {
185  m_Gi = main_index.GetGi();
186  m_Timestamp = main_index.GetTimestamp();
187  m_BlobSize = main_index.GetSize();
188  m_SeqLength = main_index.GetSeqLength();
189  m_TaxId = main_index.GetTaxId();
190  return *this;
191  }
192 
193  operator bool() const
194  { return m_BlobSize; }
195 
196  vector<CSeq_id_Handle> m_Ids;
199 
207 };
208 
209 typedef deque<SSubcacheIndexData> TBlobLocationList;
213 typedef list<TIndexRef> TIndexRefList;
214 typedef vector< pair<SBlobLocator, TBlobLocationEntry> > TIndexMapByBlob;
216 
217 
219  vector<CSeq_id_Handle>& extra_ids,
220  bool extract_delta,
221  bool extract_products)
222 {
223  ///
224  /// process any delta-seqs
225  ///
226  if (extract_delta &&
227  bsh.GetInst().IsSetExt() &&
228  bsh.GetInst().GetExt().IsDelta()) {
229  ITERATE (CBioseq::TInst::TExt::TDelta::Tdata, iter,
230  bsh.GetInst().GetExt().GetDelta().Get()) {
231  const CDelta_seq& seg = **iter;
232  CTypeConstIterator<CSeq_id> id_iter(seg);
233  for ( ; id_iter; ++id_iter) {
234  extra_ids.push_back
235  (CSeq_id_Handle::GetHandle(*id_iter));
236  }
237  }
238  }
239 
240  ///
241  /// extract products
242  ///
243  if (extract_products) {
244  SAnnotSelector sel;
246  .SetExactDepth(false)
247  .SetAdaptiveDepth(true)
248  .SetResolveAll()
249  .ExcludeNamedAnnots("SNP")
250  .ExcludeNamedAnnots("STS")
251  .ExcludeNamedAnnots("CDD");
252  for (CFeat_CI feat_iter(bsh, sel); feat_iter; ++feat_iter) {
253  if (feat_iter->IsSetProduct()) {
255  (feat_iter->GetProduct());
256  for ( ; id_iter; ++id_iter) {
257  extra_ids.push_back
258  (CSeq_id_Handle::GetHandle(*id_iter));
259  }
260  }
261  }
262  }
263 }
264 
265 static bool s_TrimLargeNucprots = false;
266 static bool s_RemoveAnnot = false;
268 
270 {
271  if (entry.IsSeq()) {
272  bool removed_annot = entry.GetSeq().IsSetAnnot();
273  entry.SetSeq().ResetAnnot();
274  return removed_annot;
275  } else {
276  bool removed_annots = entry.GetSet().IsSetAnnot();
277  entry.SetSet().ResetAnnot();
279  entry.SetSet().SetSeq_set())
280  {
281  removed_annots |= s_RemoveAnnotsFromEntry(**seq_it);
282  }
283  return removed_annots;
284  }
285 }
286 
287 /// If entry is a large nucprot set, optionally create a new trimmed Seq-entry
288 /// containing only the needed Bioseq. Also optionally remove all Seq-annots.
290 {
291  bool trim_large_nucprot = false;
292  bool trimmed = false;
293 
294  CSeq_entry *full_entry = const_cast<CSeq_entry *>(entry.GetPointer());
295  CConstRef<CBioseq> genomic_seq;
296  if (s_TrimLargeNucprots &&
297  entry->IsSet() &&
299  !entry->GetSet().GetSeq_set().empty() &&
300  entry->GetSet().GetSeq_set().front()->IsSeq())
301  {
302  /// This is a nuc-prot set. It is a large nucprot set if the first Bioseq
303  /// is genomic
304  genomic_seq.Reset(&entry->GetSet().GetSeq_set().front()->GetSeq());
305  ITERATE (CSeq_descr::Tdata, desc_it, genomic_seq->GetDescr().Get()) {
306  if ((*desc_it)->IsMolinfo()) {
307  trim_large_nucprot = (*desc_it)->GetMolinfo().GetBiomol() ==
309  break;
310  }
311  }
312  }
313 
314  if (trim_large_nucprot) {
315  /// This is a large nucprot set; create a trimmed Seq-entry
316  CRef<CSeq_entry> child_to_include(const_cast<CSeq_entry *>(
318  CSeq_entry *trimmed_entry = new CSeq_entry;
319  trimmed_entry->SetSet().SetClass(CBioseq_set::eClass_nuc_prot);
320  if (full_entry->GetSet().IsSetId()) {
321  trimmed_entry->SetSet().SetId(full_entry->SetSet().SetId());
322  }
323  if (full_entry->GetSet().IsSetColl()) {
324  trimmed_entry->SetSet().SetColl(full_entry->SetSet().SetColl());
325  }
326  if (full_entry->GetSet().IsSetLevel()) {
327  trimmed_entry->SetSet().SetLevel(full_entry->GetSet().GetLevel());
328  }
329  if (full_entry->GetSet().IsSetRelease()) {
330  trimmed_entry->SetSet().SetRelease(full_entry->GetSet().GetRelease());
331  }
332  if (full_entry->GetSet().IsSetDate()) {
333  trimmed_entry->SetSet().SetDate(full_entry->SetSet().SetDate());
334  }
335  if (full_entry->GetSet().IsSetDescr()) {
336  trimmed_entry->SetSet().SetDescr(full_entry->SetSet().SetDescr());
337  }
338  /// Include Bioseq-set-level annot only if the Bioseq requested
339  /// is the genomic one
340  if (full_entry->GetSet().IsSetAnnot() &&
341  genomic_seq.GetPointer() == &child_to_include->GetSeq())
342  {
343  trimmed_entry->SetSet().SetAnnot().insert(
344  trimmed_entry->SetSet().SetAnnot().end(),
345  full_entry->SetSet().SetAnnot().begin(),
346  full_entry->SetSet().SetAnnot().end());
347  }
348  trimmed_entry->SetSet().SetSeq_set().push_back(child_to_include);
349  entry.Reset(full_entry = trimmed_entry);
350  trimmed = true;
351  }
352  if (s_RemoveAnnot) {
353  trimmed |= s_RemoveAnnotsFromEntry(*full_entry);
354  }
355  return trimmed;
356 }
357 
359 {
361  s_MolType = bsh.GetBioseqMolType();
362  cout << bsh.GetSeq_id_Handle() << " mol type " << (unsigned) s_MolType << endl;
363  } else if (s_TrimLargeNucprots && s_MolType != bsh.GetBioseqMolType()) {
364  cout << bsh.GetSeq_id_Handle() << " contradictory mol type " << (unsigned) bsh.GetBioseqMolType() << endl;
366  "Mixed input mol types not allowed with trim-large-nucprots");
367  }
368 }
369 
371 {
372  const CTextseq_id *text_seqid = idh.GetSeqId()->GetTextseq_Id();
373  return text_seqid && text_seqid->IsSetName() && text_seqid->IsSetAccession();
374 }
375 
377 {
378  const CTextseq_id *text_seqid = idh.GetSeqId()->GetTextseq_Id();
379  string accver = text_seqid->GetAccession();
380  if (text_seqid->IsSetVersion()) {
381  accver += '.' + NStr::NumericToString(text_seqid->GetVersion());
382  }
383  return CSeq_id_Handle::GetHandle(accver);
384 }
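// Usage sketch (editorial; the identifier below is hypothetical), showing how
// the two helpers above cooperate: given a handle whose Textseq-id carries
// both an accession and a name,
//
//   CSeq_id_Handle idh =
//       CSeq_id_Handle::GetHandle("gb|AF123456.1|SOME_LOCUS");
//   if (HasNameAndAccession(idh)) {
//       // yields the handle for plain "AF123456.1" (accession.version only),
//       // used later so such blobs are also indexed by accession.version
//       CSeq_id_Handle accver = StrippedAccVer(idh);
//   }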
385 
387 {
388  SBlobCopier(const CDir& subcache_root,
389  bool extract_delta,
390  bool extract_product,
391  sequence::EGetIdType id_type)
393  m_Scope(*CObjectManager::GetInstance()),
394  m_SubcacheRoot( subcache_root ),
395  m_LastBlob(NULL),
396  m_ExtractDelta(extract_delta),
397  m_ExtractProducts(extract_product),
398  m_IdType(id_type)
399  {
400  m_Buffer.reserve(128 * 1024 * 1024);
401  }
402 
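 // Descriptive note (added editorially): this functor copies a single blob
 // from a main cache into the subcache. If the requested bioseq lives in the
 // same blob as the previous call, the already-loaded Seq-entry is reused
 // rather than re-read from the chunk; large nucprot sets may be re-trimmed
 // per bioseq via TrimEntry(), in which case a smaller repacked blob is
 // written instead of the raw bytes.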
403  void operator() (const SBlobLocator & main_cache_locator,
404  SSubcacheIndexData &sub_cache_locator,
405  CSeq_id_Handle &output_idh)
406  {
407 
408  if (CSignal::IsSignaled()) {
410  "trapped signal, exiting");
411  }
412 
413  CBioseq_Handle bsh;
414  if (m_LastBlob && *m_LastBlob == main_cache_locator) {
415  /// This is the same blob we copied on the last call (this can
416  /// happen if several bioseqs belong to the same seq-entry)
417  bsh = m_Scope.GetBioseqHandle(main_cache_locator.m_Idh);
418  VerifyMolType(bsh);
419  if (sub_cache_locator) {
420  // If sub_cache_locator is empty, that's because the blob has been
421  // invalidated (because it already exists in sub-cache), so no need to copy
423  /// Current blob, containing the previous id and this one, is a
424  /// large Nucprot; create a new trimmed entry with this new id
426  TrimEntry(trimmed_entry, bsh);
427 
429  sub_cache_locator.m_ChunkId = m_OutputChunk.GetChunkSerialNum();
430  m_LastBlobOffset = sub_cache_locator.m_Offset
432 
433  CCache_blob small_blob;
434  small_blob.SetTimestamp( m_LastBlobTimestamp );
435  small_blob.Pack(*trimmed_entry);
436  m_OutputChunk.Write(small_blob);
437  sub_cache_locator.m_BlobSize = m_OutputChunk.GetOffset()
438  - sub_cache_locator.m_Offset;
439 
440  } else {
441  sub_cache_locator.m_ChunkId = m_OutputChunk.GetChunkSerialNum();
442  sub_cache_locator.m_Offset = m_LastBlobOffset;
443  }
444  }
445  } else {
446  m_Buffer.clear();
447  m_Buffer.resize(main_cache_locator.m_BlobSize);
448 
449  m_InputChunk.OpenForRead( main_cache_locator.m_CacheRoot->GetPath(),
450  main_cache_locator.m_ChunkId );
451  m_InputChunk.RawRead( main_cache_locator.m_Offset,
452  &m_Buffer[0], m_Buffer.size());
453 
454  CRef<CSeq_entry> entry(new CSeq_entry);
455  CCache_blob blob;
456  string buffer(&m_Buffer[0], m_Buffer.size());
457  CNcbiIstrstream istr(buffer);
458  istr >> MSerial_AsnBinary >> blob;
459  blob.UnPack(*entry);
460 
461  CConstRef<CSeq_entry> trimmed_entry = entry;
464  bsh = m_Scope.GetBioseqHandle(main_cache_locator.m_Idh);
465  VerifyMolType(bsh);
466 
467  if (sub_cache_locator) {
470  sub_cache_locator.m_ChunkId = m_OutputChunk.GetChunkSerialNum();
471  m_LastBlobOffset = sub_cache_locator.m_Offset = m_OutputChunk.GetOffset();
472  m_LastBlob = &main_cache_locator;
473 
474  if (TrimEntry(trimmed_entry, bsh)) {
475  /// Seq-entry was trimmed, so we need to create and write
476  /// new smaller blob
477  m_CurrentNucprotSeqEntry = entry;
478  CCache_blob small_blob;
479  small_blob.SetTimestamp( m_LastBlobTimestamp );
480  small_blob.Pack(*trimmed_entry);
481  m_OutputChunk.Write(small_blob);
482  sub_cache_locator.m_BlobSize = m_OutputChunk.GetOffset()
483  - sub_cache_locator.m_Offset;
484  } else {
486  m_OutputChunk.RawWrite( &m_Buffer[0], m_Buffer.size());
487  }
488 
489  ++m_BlobCount;
490  }
491  }
492 
494  output_idh = sequence::GetId(bsh, m_IdType);
495  }
496  if (!output_idh) {
497  output_idh = main_cache_locator.m_Idh;
498  }
499 
500  if (sub_cache_locator) {
501  ++m_BioseqCount;
502  m_SeqIdCount += bsh.GetId().size();
503  sub_cache_locator.m_Ids = bsh.GetId();
504 
505  /// Write and index bioseq's seqids
507  sub_cache_locator.m_SeqIdOffset = m_SeqIdChunk.GetOffset();
508  m_SeqIdChunk.Write(bsh.GetId());
509  sub_cache_locator.m_SeqIdSize = m_SeqIdChunk.GetOffset()
510  - sub_cache_locator.m_SeqIdOffset;
511 
514  }
515  }
516 
517 public:
518  size_t m_BlobCount;
520  size_t m_SeqIdCount;
521  vector<CSeq_id_Handle> extra_ids;
522 
523 private:
530 
534  vector<char> m_Buffer;
538 };
539 
540 
541 
543 {
544  SBlobInserter(const CDir& subcache_root,
545  bool extract_delta,
546  bool extract_product)
548  m_SubcacheRoot( subcache_root ),
549  m_ExtractDelta(extract_delta),
550  m_ExtractProducts(extract_product)
551  {
553  }
554 
556  vector<SSubcacheIndexData> &sub_cache_locator)
557  {
558  if (CSignal::IsSignaled()) {
560  "trapped signal, exiting");
561  }
562 
563  CSeq_entry_Handle seh;
564  CConstRef<CSeq_entry> entry;
565  try {
566  seh = bsh.GetTopLevelEntry();
567  entry = seh.GetCompleteSeq_entry();
568  } catch (...) {
569  LOG_POST(Error << "Error fetching " << bsh.GetSeq_id_Handle().AsString());
570  throw;
571  }
572  TrimEntry(entry, bsh);
573  SSubcacheIndexData blob_locator;
574  {{
575  CCache_blob blob;
576  blob.SetTimestamp( m_Timestamp );
577  blob.Pack(*entry);
579  blob_locator.m_Timestamp = m_Timestamp;
580  blob_locator.m_ChunkId = m_OutputChunk.GetChunkSerialNum();
581  blob_locator.m_Offset = m_OutputChunk.GetOffset();
582  m_OutputChunk.Write(blob);
583  blob_locator.m_BlobSize = m_OutputChunk.GetOffset()
584  - blob_locator.m_Offset;
585  }}
586  ++m_BlobCount;
587 
588  vector<CBioseq_Handle> relevant_seqs;
589  if (entry->IsSeq() || entry->GetSet().GetSeq_set().size() == 1) {
590  relevant_seqs.push_back(bsh);
591  } else {
592  /// Blob written has multiple Bioseqs
593  for (CBioseq_CI seq_ci(seh); seq_ci; ++seq_ci) {
594  relevant_seqs.push_back(*seq_ci);
595  }
596  }
597  ITERATE (vector<CBioseq_Handle>, seq_it, relevant_seqs) {
598  CConstRef<CBioseq> bioseq = seq_it->GetCompleteBioseq();
599  BioseqIndexData(*bioseq, blob_locator.m_Gi,
600  blob_locator.m_SeqLength,
601  blob_locator.m_TaxId);
602 
603  ++m_BioseqCount;
604  m_SeqIdCount += seq_it->GetId().size();
605  blob_locator.m_Ids = seq_it->GetId();
606 
607  /// Write and index bioseq's seqids
609  blob_locator.m_SeqIdOffset = m_SeqIdChunk.GetOffset();
610  m_SeqIdChunk.Write(seq_it->GetId());
611  blob_locator.m_SeqIdSize = m_SeqIdChunk.GetOffset()
612  - blob_locator.m_SeqIdOffset;
613  sub_cache_locator.push_back(blob_locator);
614 
615  ExtractExtraIds(*seq_it, extra_ids,
617  }
618  }
619 
620 public:
621  size_t m_BlobCount;
623  size_t m_SeqIdCount;
624  vector<CSeq_id_Handle> extra_ids;
625 
626 private:
630  vector<char> m_Buffer;
633  time_t m_Timestamp;
634 };
635 
636 
637 
638 /////////////////////////////////////////////////////////////////////////////
639 // CAsnSubCacheCreateApplication::
640 
641 
643 {
644 public:
646  : m_TotalRecords(0),
653  m_IdType(sequence::eGetId_HandleDefault),
656  {
657  }
658 
659 private:
660  virtual void Init(void);
661  virtual int Run(void);
662  virtual void Exit(void);
663 
664  size_t WriteBlobsInSubCache( const vector<CDir>& main_cache_roots,
665  const CDir& sub_cache_root,
666  TIndexMapById& index_map,
667  TBlobLocationList& blob_locations,
668  time_t timestamp,
669  bool extract_delta,
670  bool extract_product,
671  bool fetch_missing,
672  bool update_existing,
673  int recursion_level);
674 
675  void x_FetchMissingBlobs( TIndexMapById& index_map,
676  const TIndexRefList& missing_ids,
677  TBlobLocationList& blob_locations,
678  TIndexMapById& extra_ids,
679  const CDir & subcache_root,
680  bool extract_delta,
681  bool extract_product );
682 
683  void x_LocateBlobsInCache(TIndexMapById& index_map,
684  TIndexMapByBlob& index_map_by_blob,
685  const vector<CDir>& main_cache_roots,
686  TBlobLocationList& blob_locations,
687  TIndexRefList& missing_ids,
688  time_t timestamp);
689 
691  const string & cache_index);
692 
693  void IndexNewBlobsInSubCache(const TIndexMapById& index_map,
694  const CDir & cache_root);
695 
697 
711 
713 
717 
718  struct SBlobVersion {
719  int sat;
720  int satkey;
723 
724  SBlobVersion(const string &line = "");
725 
727 
728  static void s_PopulateSatelliteMap();
729  };
730 
731 };
732 
734 
735 /////////////////////////////////////////////////////////////////////////////
736 // Init: set up the command-line argument descriptions
737 
738 
740 {
741  // Create command-line argument descriptions class
742  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
743 
744  // Specify USAGE context
745  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
746  "CArgDescriptions demo program");
747 
748  arg_desc->AddOptionalKey("cache", "Cache",
749  "Comma-separated paths of one or more main caches",
751 
752  arg_desc->AddOptionalKey("cache-manifest", "CacheManifest",
753  "manifest of paths of one or more main caches",
755  arg_desc->SetDependency("cache-manifest",
757  "cache");
758 
759  arg_desc->AddKey("subcache", "Subcache",
760  "Path to the ASN.1 subcache that will be created.",
762 
763  arg_desc->AddDefaultKey( "i", "SeqIds",
764  "The list of Seq-ids is read from here.",
766  "-");
767  arg_desc->AddAlias("-input", "i");
768 
769  arg_desc->AddOptionalKey( "input-manifest", "SeqIds",
770  "The list of Seq-ids is read from here.",
772  arg_desc->SetDependency("i",
774  "input-manifest");
775 
776  arg_desc->AddOptionalKey( "timestamp", "Timestamp",
777  "Only GIs stamped earlier than this timestamp (YYYY-MM-DD) are cached",
779 
780  arg_desc->AddFlag("extract-delta",
781  "Extract and index delta-seq far-pointers");
782 
783  arg_desc->AddOptionalKey("delta-level", "RecursionLevel",
784  "Number of levels to descend when retrieving "
785  "items in delta sequences",
787 
788  arg_desc->AddFlag("skip-retrieval-failures",
789  "Skip failed retrieval of sequences, "
790  "up to any limit imposed by -max-retrieval-failures");
791  arg_desc->AddOptionalKey("max-retrieval-failures", "MaximumAllowedFailures",
792  "Configures the option of -skip-failures: "
793  "Maximum number of sequences we're allowed to "
794  "fail to retrieve from ID and still consider "
795  "execution a success; does not include withdrawn "
796  "sequences, which are counted separately. The "
797  "default is unlimited.",
799 
800  arg_desc->AddFlag("skip-withdrawn",
801  "Skip retrieval of withdrawn sequences, "
802  "up to any limit imposed by -max-withdrawn");
803  arg_desc->AddOptionalKey("max-withdrawn", "MaximumWithdrawnSequences",
804  "Configures the option of -skip-withdrawn: "
805  "Maximum number of withdrawn sequences allowed in "
806  "the input Seq-ids. The default is unlimited.",
808 
809  arg_desc->AddFlag("extract-product",
810  "Extract and index product far-pointers");
811 
812  arg_desc->AddFlag("fetch-missing",
813  "Retrieve ASN.1 blobs from ID directly if a look-up in "
814  "the main cache fails");
815 
816  arg_desc->AddFlag("no-update-existing",
817  "Don't update sequences that are already in the subcache");
818 
819  arg_desc->AddFlag("trim-large-nucprots",
820  "Divide large nucprots into separate Seq-entry per "
821  "sequences, to avoid fetching huge blobs when only one "
822  "protein is needed");
823 
824  arg_desc->AddFlag("remove-annotation",
825  "Remove all annotation from caches entries");
826 
827  arg_desc->AddFlag("no-wgs-master-descs",
828  "When fetching missing WGS sequences from ID, don't add "
829  "the descriptiors from the master WGS record");
830 
831  arg_desc->AddFlag("overwrite-existing-cache",
832  "If the cache already exists, overwrite its current "
833  "contents; default action is to add to them");
834 
835  arg_desc->AddFlag("allow-approximate-ids",
836  "If this flag is specified, a bioseq retrieved by "
837  "provided ID is considered good even if the exact ID "
838  "does not appear in bioseq");
839 
840  arg_desc->SetDependency("skip-retrieval-failures",
841  CArgDescriptions::eRequires, "fetch-missing");
842  arg_desc->SetDependency("max-retrieval-failures",
843  CArgDescriptions::eRequires, "skip-retrieval-failures");
844 
845  arg_desc->SetDependency("skip-withdrawn",
846  CArgDescriptions::eRequires, "fetch-missing");
847  arg_desc->SetDependency("max-withdrawn",
848  CArgDescriptions::eRequires, "skip-withdrawn");
849 
850  arg_desc->SetDependency("no-wgs-master-descs",
851  CArgDescriptions::eRequires, "fetch-missing");
852  arg_desc->SetDependency("no-wgs-master-descs",
853  CArgDescriptions::eExcludes, "nogenbank");
854 
855  arg_desc->AddOptionalKey("oseqids","oseqids","Seqids that actually made it to the cache",
857  arg_desc->AddOptionalKey("seq-id-type", "TypeOfId",
858  "Kind of sequence identifier to use; by default "
859  "use same seq-id as provided in input",
861  arg_desc->SetConstraint("seq-id-type",
862  &(*new CArgAllow_Strings,
863  "canonical", "best"));
864 
865  arg_desc->AddOptionalKey("freeze-date", "FreezeDate",
866  "When fetching missing blobs, Get old blobs from "
867  "no later than specified date; format M/D/Y. "
868  "Supported for gis only",
870 
871  arg_desc->AddOptionalKey("idstat-executable", "IdstatExecutable",
872  "Path to idstat executable",
874  arg_desc->SetDependency("freeze-date",
875  CArgDescriptions::eRequires, "fetch-missing");
876  arg_desc->SetDependency("freeze-date",
877  CArgDescriptions::eRequires, "idstat-executable");
878 
879  arg_desc->AddFlag("accept-non-gi",
880  "Allow non-gi seq-ids, and get the latest version of the "
881  "sequence, ignoring freeze date. Default, if freeze "
882  "date is specified without this flag, is to fail if any "
883  "of the input seq-ids are not gis");
884  arg_desc->SetDependency("accept-non-gi",
885  CArgDescriptions::eRequires, "freeze-date");
886 
888 
889  // Setup arg.descriptions for this application
890  SetupArgDescriptions(arg_desc.release());
891 }
892 
893 
894 static void s_ReadIdsFromFile(CNcbiIstream& istr,
895  TIndexMapById& index_map,
896  TBlobLocationList& blob_locations,
897  TCachedSeqIds& cached_seq_ids
898  )
899 {
900  string line;
901  while ( NcbiGetlineEOL( istr, line ) ) {
902  if ( line.empty() || line[0] == '#') {
903  continue;
904  }
905 
906  try {
907  /// Put id in index map; blob location points to blank data
908  /// until we look up the data
911  &blob_locations.front()));
912  cached_seq_ids.insert(CSeq_id_Handle::GetHandle(line));
913  }
914  catch (CException&) {
915  LOG_POST(Error << "ignoring invalid line: " << line);
916  }
917  }
918 }
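// Example of an input file accepted by s_ReadIdsFromFile() (editorial example
// with hypothetical identifiers): one Seq-id per line; blank lines and lines
// starting with '#' are skipped, and unparsable lines are logged and ignored.
//
//   # sequences to copy into the subcache
//   164305
//   NM_000001.2
//   gnl|MYPROJECT|contig01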
919 
920 
921 
923 {
924  const CArgs& args = GetArgs();
925  {{
928  }}
929 
930  GetRWConfig().Set("OBJMGR", "BLOB_CACHE", "500");
931 
932  s_TrimLargeNucprots = args["trim-large-nucprots"];
933  s_RemoveAnnot = args["remove-annotation"];
934  if (args["seq-id-type"]) {
935  m_IdType = args["seq-id-type"].AsString() == "best"
937  }
938 
939  vector<CDir> main_cache_roots;
940  vector<string> main_cache_paths;
941  if (args["cache"]) {
942  NStr::Split(args["cache"].AsString(), ",", main_cache_paths);
943  } else if (args["cache-manifest"]) {
944  main_cache_paths = CFileManifest(args["cache-manifest"].AsString()).GetAllFilePaths();
945  }
946  ITERATE (vector<string>, it, main_cache_paths) {
947  CDir cache_root(CDirEntry::NormalizePath(*it, eFollowLinks));
948  if (! cache_root.Exists() ) {
949  LOG_POST( Error << "Cache " << cache_root.GetPath()
950  << " does not exist!" );
951  return 1;
952  } else if ( ! cache_root.IsDir() ) {
953  LOG_POST( Error << cache_root.GetPath() << " does not point to a "
954  << "valid cache path!" );
955  return 2;
956  }
957  main_cache_roots.push_back(cache_root);
958 
959  /// Handle subcache directories
960  CDir::TEntries cache_subdirectories =
961  cache_root.GetEntries("subcache_*", CDir::fCreateObjects);
962  ITERATE (CDir::TEntries, subdir_it, cache_subdirectories) {
963  main_cache_roots.push_back(dynamic_cast<const CDir &>(**subdir_it));
964  }
965  }
966 
967  CDir subcache_root( args["subcache"].AsString() );
968  if ( subcache_root.Exists() ) {
969  if (args["overwrite-existing-cache"]) {
970  if (!subcache_root.Remove()) {
972  "Cache already exists and can't be removed");
973  }
974  } else {
975  LOG_POST( Warning << "Subcache " << subcache_root.GetPath()
976  << " already exists!" );
977  }
978  }
979 
980  if ( ! subcache_root.CreatePath() ) {
981  LOG_POST( Error << "Unable to create a path to a subcache at "
982  << subcache_root.GetPath() );
983  }
984 
985  CTime timestamp( CTime::eCurrent );
986  if ( args["timestamp"].HasValue() ) {
987  string timestamp_string( args["timestamp"].AsString() );
988  timestamp = CTime( timestamp_string,
990  LOG_POST( Info << "Timestamp: " << timestamp.AsString() );
991  }
992 
993  bool extract_delta = args["extract-delta"];
994  bool extract_product = args["extract-product"];
995  bool fetch_missing = args["fetch-missing"];
996  bool update_existing = !args["no-update-existing"];
997  LOG_POST(Error << "update existing = "
998  << (update_existing ? "true" : "false"));
999 
1000  unsigned max_retrieval_failures =
1001  args["skip-retrieval-failures"] ? UINT_MAX : 0;
1002  if (args["max-retrieval-failures"]) {
1003  max_retrieval_failures = args["max-retrieval-failures"].AsInteger();
1004  }
1005 
1006  unsigned max_withdrawn =
1007  args["skip-withdrawn"] ? UINT_MAX : 0;
1008  if (args["max-withdrawn"]) {
1009  max_withdrawn = args["max-withdrawn"].AsInteger();
1010  }
1011 
1014 
1015  m_Scope.Reset(new CScope(*om));
1016  m_Scope->AddDefaults();
1017 
1018  if (args["freeze-date"]) {
1019  m_FreezeDate = CTime(args["freeze-date"].AsString(), "M/D/Y");
1020  m_IdstatExecutable = args["idstat-executable"].AsString();
1021  m_AcceptNonGi = args["accept-non-gi"];
1022  m_GbLoader = dynamic_cast<CGBDataLoader *>(om->FindDataLoader("GBLOADER"));
1023  if (!m_GbLoader) {
1024  NCBI_THROW(CException, eUnknown, "freeze-date requires genbank loader");
1025  }
1027  }
1028 
1029  if (args["no-wgs-master-descs"]) {
1030  if (!m_GbLoader) {
1031  m_GbLoader = dynamic_cast<CGBDataLoader *>(om->FindDataLoader("GBLOADER"));
1032  }
1034 #ifdef HAVE_NCBI_VDB
1035  CObjectManager::TRegisteredNames registered_names;
1036  om->GetRegisteredNames(registered_names);
1037  for (const string& loader_name: registered_names) {
1038  CDataLoader* loader = om->FindDataLoader(loader_name);
1039  _ASSERT(loader);
1040  CWGSDataLoader* wgs_loader = dynamic_cast<CWGSDataLoader*>(loader);
1041  if (wgs_loader) {
1042  wgs_loader->SetAddWGSMasterDescr(false);
1043  }
1044  }
1045 #endif
1046  }
1047 
1048  CStopWatch sw;
1049  sw.Start();
1050 
1051  ///
1052  /// read our list of IDs
1053  ///
1054  TBlobLocationList blob_locations;
1055  blob_locations.push_back(m_BlankIndexData);
1056  TIndexMapById index_map;
1057  {{
1058  if (args["input-manifest"]) {
1059  string fname;
1060  CNcbiIstream& mft_istr = args["input-manifest"].AsInputFile();
1061  while (NcbiGetlineEOL(mft_istr, fname)) {
1063  if ( fname.empty() || fname[0] == '#') {
1064  continue;
1065  }
1066 
1067  CNcbiIfstream istr(fname.c_str());
1068  s_ReadIdsFromFile(istr, index_map, blob_locations, m_cached_seq_ids);
1069  }
1070  }
1071  else {
1072  CNcbiIstream& istr = args["i"].AsInputFile();
1073  s_ReadIdsFromFile(istr, index_map, blob_locations, m_cached_seq_ids);
1074  }
1075  LOG_POST( Error << index_map.size() << " IDs read from the bag." );
1076  }}
1077 
1078 
1079  if (args["delta-level"].HasValue()) {
1080  m_MaxRecursionLevel = args["delta-level"].AsInteger();
1081  }
1082 
1083  size_t total_count =
1084  WriteBlobsInSubCache( main_cache_roots, subcache_root, index_map,
1085  blob_locations, timestamp.GetTimeT(),
1086  extract_delta, extract_product, fetch_missing,
1087  update_existing, 0 );
1088 
1089  IndexNewBlobsInSubCache(index_map, subcache_root);
1090 
1091  double e = sw.Elapsed();
1092  LOG_POST( Error << "done: copied "
1093  << total_count << " items into cache ("
1094  << e << " seconds, " << total_count/e << " items/sec)");
1095  LOG_POST(Error << "total records requested: " << m_TotalRecords);
1096  LOG_POST(Error << "total records not found: " << m_RecordsNotInMainCache);
1097  LOG_POST(Error << "total records already cached: " << m_RecordsInSubCache);
1098  CProcess::SMemoryUsage memory_usage;
1099  CCurrentProcess::GetMemoryUsage(memory_usage);
1100  LOG_POST(Error << "total memory consumed: " << memory_usage.total);
1101  if (fetch_missing) {
1102  LOG_POST(Error << "total record fetched from ID: " << m_RecordsFetchedFromID);
1103  LOG_POST(Error << "total record retrieval failures: " << m_RecordsNotFound);
1104  LOG_POST(Error << "total records withdrawn: " << m_RecordsWithdrawn);
1105  }
1106 
1107  if (m_RecordsNotFound > max_retrieval_failures) {
1108  return 3;
1109  } else if (m_RecordsWithdrawn > max_withdrawn) {
1110  return 4;
1111  }
1112 
1113  if(args["oseqids"]) {
1114  args["oseqids"].AsOutputFile() <<"#seq-id"<<endl;
1116  args["oseqids"].AsOutputFile() << *it << endl;
1117  }
1118  }
1119 
1122 
1123  return 0;
1124 }
1125 
1126 
1127 size_t CAsnSubCacheCreateApplication::WriteBlobsInSubCache(const vector<CDir>& main_cache_roots,
1128  const CDir& subcache_root,
1129  TIndexMapById& index_map,
1130  TBlobLocationList& blob_locations,
1131  time_t timestamp,
1132  bool extract_delta,
1133  bool extract_product,
1134  bool fetch_missing,
1135  bool update_existing,
1136  int recursion_level)
1137 {
1138  if (recursion_level >= m_MaxRecursionLevel) {
1139  return 0;
1140  }
1141 
1142  m_Stopwatch.Start();
1143 
1144  string subcache_main_index =
1145  NASNCacheFileName::GetBDBIndex(subcache_root.GetPath(),
1147 
1148  size_t input_ids = index_map.size();
1149 
1150  ///
1151  /// step 1: if update_existing is false, eliminate blobs that are already
1152  /// in the subcache
1153  ///
1154  if (!update_existing) {
1155  /// At this point all ids are pointing to blob_locations.begin(), which is the blank
1156  /// blob locator; give it a timestamp of 0, so any entry that's already in the
1157  /// subcache will be considered up-to-date
1158  blob_locations.begin()->m_Timestamp = 0;
1159  x_EliminateIdsAlreadyInCache(index_map, subcache_main_index);
1160  }
1161 
1162  m_TotalRecords += index_map.size();
1163  ///
1164  /// step 2: locate blobs in main cache
1165  ///
1166  TIndexRefList ids_missing;
1167  TIndexMapByBlob index_by_blob;
1168  x_LocateBlobsInCache(index_map, index_by_blob, main_cache_roots, blob_locations,
1169  ids_missing, timestamp);
1170  size_t found = m_TotalRecords - m_RecordsNotInMainCache;
1171 
1172  LOG_POST(Error << input_ids
1173  << " unique source records in " << m_Stopwatch.Elapsed()
1174  << " seconds." );
1175 
1176  ///
1177  /// step 3: if update-existing is true, eliminate blobs that are already
1178  /// up-to-date in cache
1179  ///
1180  if (update_existing) {
1181  /// At this point ids still pointing to blob_locations.begin(), the
1182  /// blank blob locator, are those for which no blob was found in the
1183  /// main cache. For all of them we have to assume that the entry in
1184  /// genbank is more up-to-date and needs to be fetched; give it the
1185  /// most up-to-date timestamp, so blobs not found in the main cache
1186  /// will not be eliminated
1187  blob_locations.begin()->m_Timestamp = timestamp;
1188  x_EliminateIdsAlreadyInCache(index_map, subcache_main_index);
1189  }
1190  LOG_POST(Error << index_map.size() << " new records in "
1191  << m_Stopwatch.Elapsed() << " seconds." );
1192 
1193 
1194 
1195  /// If we're at recursion level 0, then this is the point just before we
1196  /// start writing to the cache; don't allow interruptions from now on
1197  if (recursion_level == 0) {
1202  }
1203 
1204  ///
1205  /// determine if we need to fetch any of the missing sequences
1206  ///
1207  TIndexMapById extra_ids;
1208  if (fetch_missing) {
1209  x_FetchMissingBlobs(index_map, ids_missing, blob_locations, extra_ids,
1210  subcache_root,
1211  extract_delta, extract_product );
1212  } else {
1213  /// We're not going to fetch the missing ids, so erase them from map
1214  ITERATE (TIndexRefList, it, ids_missing) {
1215  index_map.erase(*it);
1216  }
1217  }
1218 
1219  ///
1220  /// now, write these blobs
1221  /// while writing, we also extract indexable information
1222  ///
1223  {{
1224  SBlobCopier blob_writer(subcache_root,
1225  extract_delta, extract_product, m_IdType);
1226 
1227  ITERATE (TIndexMapByBlob, it, index_by_blob) {
1228  try {
1229  CSeq_id_Handle output_idh;
1230  blob_writer(it->first, *it->second, output_idh);
1231  /// Add all the bioseq's other ids to the index map, pointing to the same blob
1232  ITERATE (vector<CSeq_id_Handle>, id_it, it->second->m_Ids) {
1233  if (*id_it != it->first.m_Idh) {
1234  index_map.insert(TIndexMapById::value_type(*id_it, it->second));
1235  }
1236  if (HasNameAndAccession(*id_it)) {
1237  /// Special case for seq-ids with both accession and name;
1238  /// also index by accession.version only
1239  index_map.insert(TIndexMapById::value_type(StrippedAccVer(*id_it), it->second));
1240  }
1241  }
1242  if (m_cached_seq_ids.count(it->first.m_Idh)) {
1243  m_output_seq_ids.insert(output_idh);
1244  }
1245  } catch (...) {
1246  LOG_POST(Error << "Error trying to copy " << it->first.m_Idh.AsString());
1247  throw;
1248  }
1249  }
1250 
1251  LOG_POST(Error << "copied "
1252  << blob_writer.m_BlobCount << " blobs / "
1253  << blob_writer.m_BioseqCount << " sequences / "
1254  << blob_writer.m_SeqIdCount << " identifiers");
1255 
1256  ITERATE (vector<CSeq_id_Handle>, it, blob_writer.extra_ids) {
1257  SSeqIdIndex new_id(*it);
1258  if (!index_map.count(new_id)) {
1259  extra_ids.insert(TIndexMapById::value_type(new_id, &blob_locations.front()));
1260  }
1261  }
1262 
1263  LOG_POST( Error << found
1264  << " records found in " << m_Stopwatch.Elapsed() << " seconds." );
1265  }}
1266 
1267  if (extra_ids.size() && recursion_level++ < m_MaxRecursionLevel) {
1268  found += WriteBlobsInSubCache(main_cache_roots, subcache_root,
1269  extra_ids, blob_locations, timestamp,
1270  extract_delta, extract_product, fetch_missing,
1271  update_existing, recursion_level);
1272  index_map.insert(extra_ids.begin(), extra_ids.end());
1273  }
1274 
1275  return found;
1276 }
1277 
1278 
1279 
1280 /////////////////////////////////////////////////////////////////////////////
1281 // Cleanup
1282 
1283 
1285 {
1286  SetDiagStream(0);
1287 }
1288 
1289 
1290 
1291 void
1293  const TIndexRefList& ids_missing,
1294  TBlobLocationList& blob_locations,
1295  TIndexMapById& extra_ids,
1296  const CDir & subcache_root,
1297  bool extract_delta,
1298  bool extract_product )
1299 {
1300  if (! ids_missing.empty()) {
1301  SBlobInserter inserter(subcache_root,
1302  extract_delta, extract_product);
1303  ITERATE (TIndexRefList, it, ids_missing) {
1304 
1305  if (CSignal::IsSignaled()) {
1307  "trapped signal, exiting");
1308  }
1309 
1310  if (*(*it)->second) {
1311  /// Already fetched a blob for this id
1312  continue;
1313  }
1314 
1315  CBioseq_Handle bsh;
1316  CSeq_id_Handle idh = (*it)->first.m_Idh;
1317  index_map.erase(*it);
1318  bool is_withdrawn = false;
1319  try {
1320  bsh = x_GetBioseqHandle(idh);
1321  if ( !bsh ) {
1322  is_withdrawn =
1325  is_withdrawn ? "bioseq withdrawn"
1326  : "empty bioseq handle");
1327  }
1328  if (!GetArgs()["allow-approximate-ids"] &&
1329  find(bsh.GetId().begin(), bsh.GetId().end(), idh) == bsh.GetId().end())
1330  {
1332  "Retrieved bioseq does not have this Seq-id");
1333  }
1334  VerifyMolType(bsh);
1335  if (m_cached_seq_ids.count(idh)) {
1336  CSeq_id_Handle output_idh;
1338  output_idh = sequence::GetId(bsh, m_IdType);
1339  }
1340  m_output_seq_ids.insert(output_idh ? output_idh : idh);
1341  }
1343  }
1344  catch (CException& e) {
1345  LOG_POST(Error << "failed to retrieve sequence: "
1346  << idh.AsString()
1347  << ": " << e);
1348  ++ (is_withdrawn ? m_RecordsWithdrawn : m_RecordsNotFound);
1349  continue;
1350  }
1351  vector<SSubcacheIndexData> index_data;
1352  inserter(bsh, index_data);
1353 
1354  ITERATE (vector<SSubcacheIndexData>, blob_it, index_data) {
1355  /// Insert index data in blob locations list and in index map
1356  blob_locations.push_back(*blob_it);
1357 
1358  /// Add all the bioseq's ids to the index map, pointing to the same blob
1359  ITERATE (vector<CSeq_id_Handle>, id_it, blob_it->m_Ids) {
1360  index_map[*id_it] = &blob_locations.back();
1361  if (HasNameAndAccession(*id_it)) {
1362  /// Special case for seq-ids with both accession and name;
1363  /// also index by accession.version only
1364  index_map[StrippedAccVer(*id_it)] = &blob_locations.back();
1365  }
1366  }
1367  }
1368  }
1369 
1370  ITERATE (vector<CSeq_id_Handle>, it, inserter.extra_ids) {
1371  SSeqIdIndex new_id(*it);
1372  if (!index_map.count(new_id)) {
1373  extra_ids.insert(TIndexMapById::value_type(new_id, &blob_locations.front()));
1374  }
1375  }
1376  }
1377 }
1378 
1379 
1381 {
1382  if (ids.size() <= 500) {
1383  return true;
1384  }
1385  else {
1386  string key1 = ids.begin()->first.m_SeqId;
1387  string key2 = (--ids.end())->first.m_SeqId;
1388  size_t half_length = max(key1.size(), key2.size())/2;
1389  return key1.substr(0, half_length) != key2.substr(0, half_length);
1390  }
1391 }
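// Rationale (editorial note on the heuristic above): with at most 500 ids,
// point lookups are cheap, so fetch one by one. For larger sets, compare the
// first and last normalized keys; if they already differ within their first
// half, the ids are spread across a wide key range and a single bulk cursor
// scan would have to walk much of the index, so one-by-one is still preferred.
// Only when many ids share a long common prefix is the bulk range scan used.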
1392 
1393 
1395  TIndexMapByBlob& index_map_by_blob,
1396  const vector<CDir>& main_cache_roots,
1397  TBlobLocationList& blob_locations,
1398  TIndexRefList& missing_ids,
1399  time_t timestamp)
1400 {
1401  LOG_POST(Error << "searching for " << index_map.size() << " ids");
1402  bool one_by_one = s_ShouldFetchOneByOne(index_map);
1403 
1404  NON_CONST_ITERATE (TIndexMapById, it, index_map) {
1405  missing_ids.push_back(it);
1406  }
1407  m_RecordsNotInMainCache += index_map.size();
1408 
1409  ITERATE (vector<CDir>, dir_it, main_cache_roots) {
1410  string cache_index =
1412  LOG_POST(Error << "locate blobs in " << cache_index << ", missing " << missing_ids.size());
1413  if (CFile(cache_index).Exists() && !missing_ids.empty()) {
1414  CAsnIndex asn_index(CAsnIndex::e_main);
1415  asn_index.SetCacheSize(1 * 1024 * 1024 * 1024);
1416  asn_index.Open(cache_index, CBDB_RawFile::eReadOnly);
1417 
1418  if(one_by_one) {
1419  LOG_POST(Error << " retrieval: one-by-one");
1420  TIndexRefList::iterator iter = missing_ids.begin();
1421  while (iter != missing_ids.end()) {
1422  CBDB_FileCursor cursor(asn_index);
1424  const SSeqIdIndex &key = (*iter)->first;
1425  cursor.From << key.m_SeqId << key.m_Version;
1426  cursor.To << key.m_SeqId << key.m_Version;
1427  SSubcacheIndexData sub_cache_index_data;
1428  SBlobLocator main_cache_locator(key.m_Idh, *dir_it);
1429  while (cursor.Fetch() == eBDB_Ok ) {
1430  if (asn_index.GetTimestamp() > sub_cache_index_data.m_Timestamp &&
1431  asn_index.GetTimestamp() <= timestamp)
1432  {
1433  /// current ASN index record should be saved
1434  sub_cache_index_data = asn_index;
1435  main_cache_locator = asn_index;
1436  }
1437  }
1438  if (sub_cache_index_data) {
1439  /// Found a blob in the main cache. Insert in blob_locations list, and enter
1440  /// into two maps
1441  blob_locations.push_back(sub_cache_index_data);
1442  (*iter)->second = &blob_locations.back();
1443  index_map_by_blob.push_back(
1444  TIndexMapByBlob::value_type(main_cache_locator, &blob_locations.back()));
1445  iter = missing_ids.erase(iter);
1447  } else {
1448  ++iter;
1449  }
1450  }
1451  } else {
1452  LOG_POST(Error << " retrieval: bulk");
1453  TIndexRefList::iterator iter = missing_ids.begin();
1454  CBDB_FileCursor cursor(asn_index);
1455  cursor.InitMultiFetch(256 * 1024);
1457  const SSeqIdIndex &start = (*iter)->first,
1458  &end = (*--missing_ids.end())->first;
1459  cursor.From << start.m_SeqId << start.m_Version;
1460  cursor.To << end.m_SeqId << end.m_Version;
1461 
1462  LOG_POST(Error << "scan range: "
1463  << start.m_SeqId << '.' << start.m_Version << " - "
1464  << end.m_SeqId << '.' << end.m_Version);
1465 
1466  bool valid_index = cursor.Fetch() == eBDB_Ok;
1467  while (valid_index && iter != missing_ids.end()) {
1468  SSubcacheIndexData sub_cache_index_data;
1469  SBlobLocator main_cache_locator((*iter)->first.m_Idh, *dir_it);
1470  for (; valid_index && (*iter)->first >= asn_index;
1471  valid_index = cursor.Fetch() == eBDB_Ok)
1472  {
1473  if ((*iter)->first == asn_index &&
1474  asn_index.GetTimestamp() > sub_cache_index_data.m_Timestamp &&
1475  asn_index.GetTimestamp() <= timestamp)
1476  {
1477  /// current ASN index record should be saved
1478  sub_cache_index_data = asn_index;
1479  main_cache_locator = asn_index;
1480  }
1481  }
1482  if (sub_cache_index_data) {
1483  /// Found a blob in the main cache. Insert in blob_locations list, and enter
1484  /// into two maps
1485  blob_locations.push_back(sub_cache_index_data);
1486  (*iter)->second = &blob_locations.back();
1487  index_map_by_blob.push_back(
1488  TIndexMapByBlob::value_type(main_cache_locator, &blob_locations.back()));
1489  iter = missing_ids.erase(iter);
1491  } else {
1492  ++iter;
1493  }
1494  }
1495  }
1496  }
1497  }
1498 
1499  sort(index_map_by_blob.begin(), index_map_by_blob.end());
1500 }
1501 
1502 
1503 void
1505  const string& cache_index)
1506 {
1507  if (CFile(cache_index).Exists()) {
1508  /// now, scan the index to see if we have what we expect
1509  CAsnIndex asn_index(CAsnIndex::e_main);
1510  asn_index.SetCacheSize(1 * 1024 * 1024 * 1024);
1511  asn_index.Open(cache_index, CBDB_RawFile::eReadWriteCreate);
1512 
1513  bool one_by_one = s_ShouldFetchOneByOne(index_map);
1514  vector<TIndexRef> ids_found;
1515 
1516  if(one_by_one) {
1517  LOG_POST(Error << " retrieval: one-by-one");
1518  NON_CONST_ITERATE (TIndexMapById, iter, index_map) {
1519  CBDB_FileCursor cursor(asn_index);
1521  const SSeqIdIndex &key = iter->first;
1522  cursor.From << key.m_SeqId << key.m_Version;
1523  cursor.To << key.m_SeqId << key.m_Version;
1524  bool found_match = false;
1525  while (!found_match && cursor.Fetch() == eBDB_Ok ) {
1526  /// If update_existing is true, this is called after locating blobs in
1527  /// main cache, and iter->second->m_Timestamp will contain the timestamp from the main
1528  /// cache; we consider an entry to already exist in the sub-cache only if its timestamp
1529  /// is up-to-date compared to the main cache. If update_existing is false, this is called
1530  /// before locating blobs in the main cache, so iter->second->m_Timestamp is 0
1531  if (asn_index.GetTimestamp() >= iter->second->m_Timestamp)
1532  {
1533  found_match = true;
1534  ids_found.push_back(iter);
1535  }
1536  }
1537  }
1538  } else {
1539  LOG_POST(Error << " retrieval: bulk");
1540  CBDB_FileCursor cursor(asn_index);
1541  cursor.InitMultiFetch(256 * 1024);
1543  const SSeqIdIndex &start = index_map.begin()->first,
1544  &end = (--index_map.end())->first;
1545  cursor.From << start.m_SeqId << start.m_Version;
1546  cursor.To << end.m_SeqId << end.m_Version;
1547 
1548  LOG_POST(Error << "scan range: "
1549  << start.m_SeqId << '.' << start.m_Version << " - "
1550  << end.m_SeqId << '.' << end.m_Version);
1551 
1552  bool valid_index = cursor.Fetch() == eBDB_Ok;
1553  NON_CONST_ITERATE (TIndexMapById, iter, index_map) {
1554  if (!valid_index) {
1555  break;
1556  }
1557  bool found_match = false;
1558  for (; valid_index && iter->first >= asn_index;
1559  valid_index = cursor.Fetch() == eBDB_Ok)
1560  {
1561  if (iter->first == asn_index &&
1562  asn_index.GetTimestamp() >= iter->second->m_Timestamp)
1563  {
1564  found_match = true;
1565  }
1566  }
1567  if (found_match) {
1568  ids_found.push_back(iter);
1569  }
1570  }
1571  }
1572 
1573  m_RecordsInSubCache += ids_found.size();
1576  m_Scope->AddDefaults();
1577  }
1578  ITERATE (vector<TIndexRef>, iter, ids_found) {
1579  /// We already have this blob in the sub-cache; invalidate index data, so
1580  /// it won't be copied; it still needs to be included in output
1581  *(*iter)->second = m_BlankIndexData;
1582  CSeq_id_Handle idh = (*iter)->first.m_Idh,
1583  output_idh = idh;
1584  if (m_cached_seq_ids.count(idh)) {
1586  output_idh = sequence::GetId(idh, *m_Scope, m_IdType);
1587  if (!output_idh) {
1588  output_idh = idh;
1589  }
1590  }
1591  m_output_seq_ids.insert(output_idh);
1592  }
1593  index_map.erase(*iter);
1594  }
1595  }
1596 }
1597 
1599 IndexNewBlobsInSubCache(const TIndexMapById& index_map,
1600  const CDir & cache_root)
1601 {
1602  string main_index_path =
1605  string seq_id_index_path =
1608 
1609  CAsnIndex main_index(CAsnIndex::e_main);
1610  main_index.SetCacheSize(1 * 1024 * 1024 * 1024);
1611  main_index.Open(main_index_path,
1613 
1614  CAsnIndex seq_id_index(CAsnIndex::e_seq_id);
1615  seq_id_index.SetCacheSize(1 * 1024 * 1024 * 1024);
1616  seq_id_index.Open(seq_id_index_path,
1618 
1619  ITERATE (TIndexMapById, it, index_map) {
1620  main_index.SetSeqId( it->first.m_SeqId );
1621  main_index.SetVersion( it->first.m_Version );
1622  main_index.SetGi( it->second->m_Gi );
1623  main_index.SetTimestamp( it->second->m_Timestamp );
1624  main_index.SetChunkId( it->second->m_ChunkId );
1625  main_index.SetOffset( it->second->m_Offset );
1626  main_index.SetSize( it->second->m_BlobSize );
1627  main_index.SetSeqLength( it->second->m_SeqLength );
1628  main_index.SetTaxId( it->second->m_TaxId );
1629  if ( eBDB_Ok != main_index.UpdateInsert() ) {
1630  LOG_POST( Error << "Main index failed to index SeqId "
1631  << it->first.m_SeqId );
1632  }
1633 
1634  seq_id_index.SetSeqId( it->first.m_SeqId );
1635  seq_id_index.SetVersion( it->first.m_Version );
1636  seq_id_index.SetGi( it->second->m_Gi );
1637  seq_id_index.SetTimestamp( it->second->m_Timestamp );
1638  seq_id_index.SetOffset( it->second->m_SeqIdOffset );
1639  seq_id_index.SetSize( it->second->m_SeqIdSize );
1640  if ( eBDB_Ok != seq_id_index.UpdateInsert() ) {
1641  LOG_POST( Error << "SeqId index failed to index SeqId "
1642  << it->first.m_SeqId );
1643  }
1644  }
1645 }
1646 
1649 {
1650  if (m_FreezeDate.IsEmpty()) {
1651  return m_Scope->GetBioseqHandle(idh);
1652  }
1653  if (!idh.IsGi()) {
1654  if (m_AcceptNonGi) {
1655  ERR_POST(Warning << "Can't get frozen version of " << idh
1656  << "; getting latest version");
1657  return m_Scope->GetBioseqHandle(idh);
1658  } else {
1659  NCBI_THROW(CException, eUnknown, "Can't get frozen version of "
1660  + idh.AsString());
1661  }
1662  }
1663 
1664  vector<string> args { "-i", "PUBSEQ_OS_GI64", "-g" };
1665  args.push_back(NStr::NumericToString(GI_TO(unsigned long, idh.GetGi())));
1666 
1667  ostringstream idstat_out, idstat_err;
1668  int idstat_exit;
1669  CNcbiIstrstream istr("");
1670 
1671  CPipe::ExecWait(GetArgs()["idstat-executable"].AsString(), args, istr,
1672  idstat_out, idstat_err, idstat_exit);
1673  cerr << idstat_err.str();
1674 
1675  if(0 != idstat_exit) {
1676  NCBI_THROW(CException, eUnknown, "idstat returned: " + NStr::NumericToString(idstat_exit));
1677  }
1678 
1679  vector<string> idstat_lines;
1680  NStr::Split(idstat_out.str(), "\n", idstat_lines);
1681  bool reached_data = false;
1682  for (const string &line : idstat_lines) {
1683  if (!reached_data) {
1684  if (!line.empty() && line[0] == '-') {
1685  /// data will start after this line
1686  reached_data = true;
1687  }
1688  continue;
1689  }
1690  SBlobVersion bv(line);
1691  if (bv.gi == idh.GetGi() && bv.date_loaded <= m_FreezeDate) {
1692  CScope::TBlobId blob_id =
1695  blob_id);
1696  return seh.GetBioseqHandle(idh);
1697  }
1698  }
1699  NCBI_THROW(CException, eUnknown, "Can't find version of " + idh.AsString()
1700  + " from before " + m_FreezeDate.AsString());
1701 }
1702 
1704 : sat(0), satkey(0), gi(ZERO_GI)
1705 {
1706  if (line.empty() || line[0] == ' ') {
1707  return;
1708  }
1709 
1710  vector<string> tokens;
1711  NStr::Split(line, " ", tokens, NStr::fSplit_Tokenize);
1712  gi = GI_FROM(Int8, NStr::StringToInt8(tokens[1]));
1713  sat = s_SatelliteMap[tokens[3]];
1714  satkey = NStr::StringToInt(tokens[2]);
1715  date_loaded = CTime(tokens[8], "M/D/Y");
1716 }
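// Editorial illustration of a line this constructor could parse. The real
// idstat column layout is not documented here; the line below is hypothetical
// and merely consistent with the token positions used above (token 1 = gi,
// token 2 = sat-key, token 3 = satellite name, token 8 = load date, M/D/Y):
//
//   LIVE 164305 987654 NCBI 0 0 1 0 01/15/2010 ...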
1717 
1719 {
1720  CDatabase db("dbapi://anyone:allowed@ENTREZ_MAIN/IdMain");
1721  db.Connect();
1722  CQuery query = db.NewQuery("select sat_id,satellite from Satellite");
1723  for (const auto &row : query) {
1724  s_SatelliteMap[row["satellite"].AsString()] = row["sat_id"].AsInt4();
1725  }
1726 }
1727 
1728 
1729 /////////////////////////////////////////////////////////////////////////////
1730 // MAIN
1731 
1732 
1733 int main(int argc, const char* argv[])
1734 {
1735  // Execute main application function
1736  return CAsnSubCacheCreateApplication().AppMain(argc, argv);
1737 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Contains the class definiton for CAsnCache, the main client class for accessing the ASN cache data.
void GetNormalizedSeqId(const objects::CSeq_id_Handle &id, string &id_str, Uint4 &version)
void BioseqIndexData(const objects::CBioseq &bioseq, CAsnIndex::TGi &gi, CAsnIndex::TSeqLength &seq_length, CAsnIndex::TTaxId &taxid)
Definition: asn_index.cpp:240
Berkeley BDB file cursor.
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
This is a simple BDB structure holding information about a given accession and its indexed location.
Definition: asn_index.hpp:53
TChunkId GetChunkId() const
Definition: asn_index.cpp:96
Uint4 TSeqLength
Definition: asn_index.hpp:62
TGi GetGi() const
Definition: asn_index.cpp:84
TOffset GetOffset() const
Definition: asn_index.cpp:102
void SetSeqId(TSeqId val)
Definition: asn_index.cpp:126
Uint4 TChunkId
Definition: asn_index.hpp:59
void SetVersion(TVersion val)
Definition: asn_index.cpp:132
Uint4 TVersion
Definition: asn_index.hpp:56
void SetTaxId(TTaxId val)
Definition: asn_index.cpp:176
Uint8 TOffset
Definition: asn_index.hpp:60
TSize GetSize() const
Definition: asn_index.cpp:108
void SetChunkId(TChunkId val)
Definition: asn_index.cpp:150
TTimestamp GetTimestamp() const
Definition: asn_index.cpp:90
Uint4 TSize
Definition: asn_index.hpp:61
void SetOffset(TOffset val)
Definition: asn_index.cpp:157
TVersion GetVersion() const
Definition: asn_index.cpp:78
void SetSeqLength(TSeqLength val)
Definition: asn_index.cpp:169
void SetSize(TSize val)
Definition: asn_index.cpp:163
Uint8 TGi
Definition: asn_index.hpp:57
TSeqLength GetSeqLength() const
Definition: asn_index.cpp:114
void SetTimestamp(TTimestamp val)
Definition: asn_index.cpp:144
Uint4 TTimestamp
Definition: asn_index.hpp:58
string TSeqId
Definition: asn_index.hpp:55
TTaxId GetTaxId() const
Definition: asn_index.cpp:120
Uint4 TTaxId
Definition: asn_index.hpp:63
void SetGi(TGi val)
Definition: asn_index.cpp:138
TSeqId GetSeqId() const
accessors
Definition: asn_index.cpp:72
void x_FetchMissingBlobs(TIndexMapById &index_map, const TIndexRefList &missing_ids, TBlobLocationList &blob_locations, TIndexMapById &extra_ids, const CDir &subcache_root, bool extract_delta, bool extract_product)
size_t WriteBlobsInSubCache(const vector< CDir > &main_cache_roots, const CDir &sub_cache_root, TIndexMapById &index_map, TBlobLocationList &blob_locations, time_t timestamp, bool extract_delta, bool extract_product, bool fetch_missing, bool update_existing, int recursion_level)
CBioseq_Handle x_GetBioseqHandle(const CSeq_id_Handle &idh)
virtual void Init(void)
Initialize the application.
void IndexNewBlobsInSubCache(const TIndexMapById &index_map, const CDir &cache_root)
virtual int Run(void)
Run the application.
void x_LocateBlobsInCache(TIndexMapById &index_map, TIndexMapByBlob &index_map_by_blob, const vector< CDir > &main_cache_roots, TBlobLocationList &blob_locations, TIndexRefList &missing_ids, time_t timestamp)
virtual void Exit(void)
Cleanup on application exit.
void x_EliminateIdsAlreadyInCache(TIndexMapById &index_map, const string &cache_index)
Berkeley DB file cursor class.
Definition: bdb_cursor.hpp:95
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
void UnPack(CSeq_entry &entry) const
Definition: Cache_blob.cpp:98
void Pack(const CSeq_entry &entry)
Definition: Cache_blob.cpp:75
Int8 GetOffset()
Definition: chunk_file.hpp:72
void OpenForWrite(const std::string &root_path="")
Definition: chunk_file.cpp:54
void RawWrite(const char *raw_blob, size_t raw_blob_size)
Definition: chunk_file.cpp:150
void Write(const CCache_blob &cache_blob)
Definition: chunk_file.cpp:141
unsigned int GetChunkSerialNum() const
Definition: chunk_file.hpp:73
void OpenForRead(const std::string &root_path="", unsigned int chunk=0)
Definition: chunk_file.cpp:99
void RawRead(std::streampos offset, char *raw_blob, size_t raw_blob_size)
Definition: chunk_file.cpp:171
static void SetupObjectManager(const CArgs &args, objects::CObjectManager &obj_mgr, TLoaders loaders=fDefault)
Set up the standard object manager data loaders according to the arguments provided above.
static void AddArgumentDescriptions(CArgDescriptions &arg_desc, TLoaders loaders=fDefault)
Add a standard set of arguments used to configure the object manager.
Database connection object.
Definition: sdbapi.hpp:1224
void Connect(void)
Explicitly (re)connect to the database server.
Definition: sdbapi.cpp:2023
CQuery NewQuery(void)
Get new CQuery object for this database.
Definition: sdbapi.cpp:2092
CDelta_seq –.
Definition: Delta_seq.hpp:66
Temporary object for holding extra message arguments.
Definition: ncbidiag.hpp:1828
CDir –.
Definition: ncbifile.hpp:1696
CFeat_CI –.
Definition: feat_ci.hpp:64
vector< string > GetAllFilePaths() const
Returns all the file paths referenced by the manifest.
CFile –.
Definition: ncbifile.hpp:1605
void SetAddWGSMasterDescr(bool flag)
Definition: gbloader.hpp:412
TBlobId GetBlobIdFromSatSatKey(int sat, int sat_key, int sub_sat=0) const
Definition: gbloader.cpp:678
CObjectManager –.
Object used to execute queries and stored procedures on the database server and retrieve result sets.
Definition: sdbapi.hpp:232
CScope –.
Definition: scope.hpp:92
void OpenForWrite(const std::string &root_path="")
void Write(const objects::CBioseq::TId &seq_ids)
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CStopWatch –.
Definition: ncbitime.hpp:1937
CTime –.
Definition: ncbitime.hpp:296
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
void SetAddWGSMasterDescr(bool flag)
Definition: wgsloader.cpp:313
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
container_type::iterator iterator
Definition: map.hpp:54
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
container_type::value_type value_type
Definition: map.hpp:52
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
The NCBI C++ standard methods for dealing with std::string.
#define true
Definition: bool.h:35
#define bool
Definition: bool.h:34
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
CNcbiRegistry & GetRWConfig(void)
Get the application's cached configuration parameters, accessible for read-write for an application's...
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
@ eFollowLinks
Follow symbolic links.
Definition: ncbimisc.hpp:145
@ fPreOpen
Open file right away; for eInputFile, eOutputFile, eIOFile.
Definition: ncbiargs.hpp:618
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
void SetCacheSize(unsigned int cache_size)
Set Berkeley DB memory cache size for the file (default is 256K).
Definition: bdb_file.cpp:563
void SetCondition(ECondition cond_from, ECondition cond_to=eNotSet)
Set search condition(type of interval)
Definition: bdb_cursor.cpp:263
void Open(const string &filename, EOpenMode open_mode, bool support_dirty_read=false, unsigned rec_len=0)
Open file with specified access mode.
Definition: bdb_file.hpp:774
EBDB_ErrCode UpdateInsert(EAfterWrite write_flag=eDiscardData)
Update record corresponding to the current key value.
Definition: bdb_file.cpp:1489
void InitMultiFetch(size_t buffer_size, EMultiFetchMode mfm=eFetchAll)
Init multi-row fetch.
Definition: bdb_cursor.cpp:213
CBDB_ConditionHandle To
Definition: bdb_cursor.hpp:253
EBDB_ErrCode Fetch(EFetchDirection fdir=eDefault)
Fetch record.
Definition: bdb_cursor.cpp:665
CBDB_ConditionHandle From
Definition: bdb_cursor.hpp:252
@ eReadWriteCreate
read-write, create if it doesn't exist
Definition: bdb_file.hpp:82
@ eBDB_Ok
Definition: bdb_file.hpp:58
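A sketch of the Berkeley DB range-scan pattern implied by the cursor entries above (SetCondition, From/To, Fetch); "index" stands for any open CBDB_File-derived table such as the CAsnIndex used by this program, and the key fields are placeholders.

#include <db/bdb/bdb_cursor.hpp>

USING_NCBI_SCOPE;

static void s_ScanOneSeqId(CBDB_File& index, const string& seq_id)
{
    CBDB_FileCursor cursor(index);
    cursor.SetCondition(CBDB_FileCursor::eGE, CBDB_FileCursor::eLE);
    cursor.From << seq_id;           // lower bound of the key range
    cursor.To   << seq_id;           // upper bound (same SeqId, any version)
    while (cursor.Fetch() == eBDB_Ok) {
        // ... read the current record's fields here ...
    }
}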
#define NULL
Definition: ncbistd.hpp:225
void PrintRequestStop(void)
Print request stop message (for request-driven applications)
Definition: ncbidiag.cpp:2778
CDiagContext & GetDiagContext(void)
Get diag context instance.
Definition: logging.cpp:818
void PrintRequestStart(const string &message)
Print request start message (for request-driven applications)
Definition: ncbidiag.cpp:2762
static CRequestContext & GetRequestContext(void)
Shortcut to CDiagContextThreadData::GetThreadData().GetRequestContext()
Definition: ncbidiag.cpp:1901
void SetRequestStatus(int status)
const CStopWatch & GetRequestTimer(void) const
Request execution timer.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it is strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8086
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
static string NormalizePath(const string &path, EFollowLinks follow_links=eIgnoreLinks)
Normalize a path.
Definition: ncbifile.cpp:820
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3846
bool CreatePath(TCreateFlags flags=fCreate_Default) const
Create the directory path recursively, possibly creating more than one subdirectory at a time.
Definition: ncbifile.cpp:4106
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4066
bool IsDir(EFollowLinks follow=eFollowLinks) const
Check whether a directory entry is a directory.
Definition: ncbifile.hpp:3947
list< TEntry > TEntries
Definition: ncbifile.hpp:1751
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Delete existing directory.
Definition: ncbifile.cpp:4342
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3911
@ fCreateObjects
Create appropriate subclasses of CDirEntry (CFile, CDir, ...), not just CDirEntry objects.
Definition: ncbifile.hpp:1759
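A short sketch of the CDir operations listed above: create a directory tree if needed and enumerate entries matching a mask. The path and mask are placeholders.

#include <corelib/ncbifile.hpp>

USING_NCBI_SCOPE;

static void s_ListEntries(const string& root)
{
    CDir dir(root);
    if (!dir.Exists()) {
        dir.CreatePath();                          // create all missing levels
    }
    CDir::TEntries entries = dir.GetEntries("*", CDir::fCreateObjects);
    ITERATE (CDir::TEntries, it, entries) {
        if ((*it)->IsDir()) {
            // ... recurse into or skip subdirectories ...
        }
    }
}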
@ eUnknown
Definition: app_popup.hpp:72
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
CConstRef< CSeq_id > GetSeqId(void) const
bool IsGi(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
TGi GetGi(void) const
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
int EGetIdType
Definition: sequence.hpp:126
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
@ eGetId_Canonical
Definition: sequence.hpp:114
@ eGetId_HandleDefault
returns the ID associated with a bioseq-handle
Definition: sequence.hpp:104
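A minimal sketch of sequence::GetId() with the EGetIdType flags listed above; "idh" and "scope" are assumed to exist already.

#include <objmgr/util/sequence.hpp>

USING_NCBI_SCOPE;
USING_SCOPE(objects);

static CSeq_id_Handle s_BestId(const CSeq_id_Handle& idh, CScope& scope)
{
    // eGetId_Best picks the "best" id (e.g. accession.version over GI);
    // eGetId_Canonical would return the canonical id instead.
    return sequence::GetId(idh, scope, sequence::eGetId_Best);
}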
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CSeq_entry_Handle GetSeq_entryHandle(CDataLoader *loader, const TBlobId &blob_id, EMissing action=eMissing_Default)
Get Seq-entry handle by its blob-id, with possible loading.
Definition: scope.cpp:113
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
vector< string > TRegisteredNames
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
Definition: scope.cpp:331
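For orientation, a minimal sketch (not taken from this file) of the CObjectManager/CScope usage that the entries above describe; the GI value and helper name are illustrative.

#include <objmgr/object_manager.hpp>
#include <objmgr/scope.hpp>
#include <objmgr/bioseq_handle.hpp>
#include <objtools/data_loaders/genbank/gbloader.hpp>
#include <objects/seqloc/Seq_id.hpp>

USING_NCBI_SCOPE;
USING_SCOPE(objects);

static void s_FetchOneGi()
{
    CRef<CObjectManager> om = CObjectManager::GetInstance();
    CGBDataLoader::RegisterInObjectManager(*om);   // make GenBank a default loader

    CScope scope(*om);
    scope.AddDefaults();                           // attach the default data loaders

    CSeq_id id;
    id.SetGi(GI_CONST(45678));                     // placeholder GI
    CBioseq_Handle bsh = scope.GetBioseqHandle(id);
    if (bsh) {
        // ... use the handle ...
    }
    scope.ResetDataAndHistory();                   // drop cached data between batches
}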
TBioseqStateFlags GetState(void) const
Get state of the bioseq.
const CSeq_id_Handle & GetSeq_id_Handle(void) const
Get handle of id used to obtain this bioseq handle.
CSeq_entry_Handle GetSeq_entry_Handle(void) const
Get parent Seq-entry handle.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from the TSE of this Seq-entry.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
TMol GetBioseqMolType(void) const
Get some values from core:
const TInst & GetInst(void) const
SAnnotSelector & SetExactDepth(bool value=true)
SetExactDepth() specifies that annotations will be searched on the segment level specified by SetReso...
SAnnotSelector & SetResolveAll(void)
SetResolveAll() is equivalent to SetResolveMethod(eResolve_All).
SAnnotSelector & SetAdaptiveDepth(bool value=true)
SetAdaptiveDepth() requests to restrict subsegment resolution depending on annotations found on lower...
SAnnotSelector & SetResolveDepth(int depth)
SetResolveDepth sets the limit of subsegment resolution in searching annotations.
SAnnotSelector & ExcludeNamedAnnots(const CAnnotName &name)
Add named annot to set of annots names to exclude.
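A hedged sketch of combining SAnnotSelector with CFeat_CI, as the selector entries above describe; "bsh" is assumed to be a valid CBioseq_Handle and "SNP" is just an example of a named annotation to exclude.

#include <objmgr/feat_ci.hpp>
#include <objmgr/annot_selector.hpp>

USING_NCBI_SCOPE;
USING_SCOPE(objects);

static size_t s_CountFeatures(const CBioseq_Handle& bsh)
{
    SAnnotSelector sel;
    sel.SetResolveAll()              // resolve through all segment levels
       .SetAdaptiveDepth(true)       // but stop once annotation is found
       .ExcludeNamedAnnots("SNP");   // skip a named annot (example)
    size_t count = 0;
    for (CFeat_CI feat_it(bsh, sel); feat_it; ++feat_it) {
        ++count;
    }
    return count;
}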
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
static EFinish ExecWait(const string &cmd, const vector< string > &args, CNcbiIstream &in, CNcbiOstream &out, CNcbiOstream &err, int &exit_code, const string &current_dir=kEmptyStr, const char *const envp[]=0, IProcessWatcher *watcher=0, const STimeout *kill_timeout=0, size_t pipe_size=0)
Execute a command with a vector of arguments, and wait for its completion.
Definition: ncbi_pipe.cpp:2139
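A sketch of calling CPipe::ExecWait() with the signature listed above; the command and argument are placeholders.

#include <connect/ncbi_pipe.hpp>
#include <corelib/ncbistre.hpp>

USING_NCBI_SCOPE;

static int s_RunChildProcess()
{
    vector<string> args;
    args.push_back("-version");                 // placeholder argument
    CNcbiIstrstream in("");                     // nothing to send to stdin
    CNcbiOstrstream out, err;                   // capture stdout/stderr
    int exit_code = -1;
    CPipe::ExecWait("some_tool", args, in, out, err, exit_code);
    return exit_code;
}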
#define kMax_Int
Definition: ncbi_limits.h:184
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
size_t total
Total memory usage.
static bool GetMemoryUsage(SMemoryUsage &usage)
Get current process memory usage.
bool Set(const string &section, const string &name, const string &value, TFlags flags=0, const string &comment=kEmptyStr)
Set the configuration parameter value.
Definition: ncbireg.cpp:826
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
Definition: ncbistr.cpp:793
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
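A sketch of the NStr helpers listed above, e.g. for parsing one line of a whitespace-delimited id list (the input format is hypothetical).

#include <corelib/ncbistr.hpp>

USING_NCBI_SCOPE;

static void s_ParseLine(string line)
{
    NStr::TruncateSpacesInPlace(line);
    list<string> tokens;
    NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize);
    if ( !tokens.empty() ) {
        // Convert the first token to a 64-bit integer (e.g. a GI).
        Int8 value = NStr::StringToInt8(tokens.front());
        (void)value;
    }
}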
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
bool IsEmpty(void) const
Is time object empty (date and time)?
Definition: ncbitime.hpp:2377
time_t GetTimeT(void) const
Get time in time_t format.
Definition: ncbitime.cpp:1396
static CTimeFormat GetPredefined(EPredefined fmt, TFlags flags=fDefault)
Get predefined format.
Definition: ncbitime.cpp:389
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2764
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
@ eISO8601_Date
Y-M-D (eg 1997-07-16)
Definition: ncbitime.hpp:194
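A sketch of the timing helpers listed above: a CStopWatch for elapsed seconds and a CTime formatted as an ISO 8601 date.

#include <corelib/ncbitime.hpp>

USING_NCBI_SCOPE;

static void s_TimeSomething()
{
    CStopWatch sw;
    sw.Start();
    // ... do work ...
    double seconds = sw.Elapsed();

    CTime now(CTime::eCurrent);
    string date = now.AsString(CTimeFormat::GetPredefined(CTimeFormat::eISO8601_Date));
    (void)seconds; (void)date;
}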
static bool IsSignaled(TSignalMask signals=eSignal_Any)
Check that any of specified signals is received.
static void TrapSignals(TSignalMask signals)
Sets interrupt signal handling.
@ eSignal_QUIT
Quit.
Definition: ncbi_signal.hpp:72
@ eSignal_HUP
Hangup.
Definition: ncbi_signal.hpp:70
@ eSignal_TERM
Termination.
Definition: ncbi_signal.hpp:78
@ eSignal_INT
Interrupt.
Definition: ncbi_signal.hpp:71
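A sketch of the CSignal usage implied above: trap the usual termination signals up front, then poll IsSignaled() inside long loops so the program can stop cleanly.

#include <corelib/ncbi_signal.hpp>

USING_NCBI_SCOPE;

static void s_ProcessWithSignalCheck()
{
    CSignal::TrapSignals(CSignal::eSignal_HUP | CSignal::eSignal_INT |
                         CSignal::eSignal_QUIT | CSignal::eSignal_TERM);
    for (;;) {
        if (CSignal::IsSignaled()) {
            break;   // a trapped signal arrived; stop gracefully
        }
        // ... process the next batch ...
        break;       // placeholder so the sketch terminates
    }
}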
TTimestamp GetTimestamp(void) const
Get the Timestamp member data.
void SetTimestamp(TTimestamp value)
Assign a value to Timestamp data member.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TVersion GetVersion(void) const
Get the Version member data.
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
void SetRelease(const TRelease &value)
Assign a value to Release data member.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
bool IsSetColl(void) const
To identify a collection. Check if a value has been assigned to Coll data member.
TClass GetClass(void) const
Get the Class member data.
bool IsSetDate(void) const
Check if a value has been assigned to Date data member.
bool IsSetRelease(void) const
Check if a value has been assigned to Release data member.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
void SetId(TId &value)
Assign a value to Id data member.
Definition: Bioseq_set_.cpp:93
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
void ResetAnnot(void)
Reset Annot data member.
void SetLevel(TLevel value)
Assign a value to Level data member.
const TRelease & GetRelease(void) const
Get the Release member data.
bool IsSetLevel(void) const
Nesting level. Check if a value has been assigned to Level data member.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
void SetDate(TDate &value)
Assign a value to Date data member.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void SetClass(TClass value)
Assign a value to Class data member.
void SetColl(TColl &value)
Assign a value to Coll data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
TLevel GetLevel(void) const
Get the Level member data.
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
void ResetAnnot(void)
Reset Annot data member.
Definition: Bioseq_.cpp:91
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ eMol_not_set
cdna = rna
Definition: Seq_inst_.hpp:109
static CStopWatch sw
string GetBDBIndex()
Definition: file_names.hpp:44
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::KEY key
Portable class to work with a spawned process via pipes.
Defines process management classes.
Set up interrupt signal handling.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
The Object manager core.
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Defines CRequestContext class for NCBI C++ diagnostic API.
CRef< objects::CObjectManager > om
#define row(bind, expected)
Definition: string_bind.c:73
Process memory usage information, in bytes.
SAnnotSelector –.
const CDir & m_SubcacheRoot
const SBlobLocator * m_LastBlob
sequence::EGetIdType m_IdType
CChunkFile m_OutputChunk
CSeqIdChunkFile m_SeqIdChunk
vector< CSeq_id_Handle > extra_ids
SBlobCopier(const CDir &subcache_root, bool extract_delta, bool extract_product, sequence::EGetIdType id_type)
vector< char > m_Buffer
CAsnIndex::TOffset m_LastBlobOffset
void operator()(const SBlobLocator &main_cache_locator, SSubcacheIndexData &sub_cache_locator, CSeq_id_Handle &output_idh)
CRef< CSeq_entry > m_CurrentNucprotSeqEntry
CChunkFile m_InputChunk
CChunkFile m_OutputChunk
const CDir & m_SubcacheRoot
SBlobInserter(const CDir &subcache_root, bool extract_delta, bool extract_product)
CSeqIdChunkFile m_SeqIdChunk
void operator()(CBioseq_Handle bsh, vector< SSubcacheIndexData > &sub_cache_locator)
vector< char > m_Buffer
vector< CSeq_id_Handle > extra_ids
SBlobLocator & operator=(const CAsnIndex &main_index)
CAsnIndex::TChunkId m_ChunkId
const CDir * m_CacheRoot
SBlobLocator(CSeq_id_Handle idh, const CDir &root_cache)
CAsnIndex::TSize m_BlobSize
bool operator<(const SBlobLocator &k2) const
CAsnIndex::TOffset m_Offset
bool operator==(const SBlobLocator &k2) const
CSeq_id_Handle m_Idh
Indexing primitives.
SSeqIdIndex(CSeq_id_Handle idh)
bool operator>=(const CAsnIndex &index) const
bool operator==(const CAsnIndex &index) const
CAsnIndex::TSeqId m_SeqId
bool operator<(const SSeqIdIndex &k2) const
bool operator<(const CAsnIndex &index) const
CSeq_id_Handle m_Idh
CAsnIndex::TVersion m_Version
CAsnIndex::TSeqLength m_SeqLength
vector< CSeq_id_Handle > m_Ids
CAsnIndex::TSize m_SeqIdSize
CAsnIndex::TTimestamp m_Timestamp
CAsnIndex::TOffset m_Offset
CAsnIndex::TSize m_BlobSize
SSubcacheIndexData & operator=(const CAsnIndex &main_index)
CAsnIndex::TTaxId m_TaxId
CAsnIndex::TChunkId m_ChunkId
CAsnIndex::TOffset m_SeqIdOffset
static string query
USING_SCOPE(objects)
bool TrimEntry(CConstRef< CSeq_entry > &entry, CBioseq_Handle bsh)
If entry is a large nucprot set, optionally create a new trimmed Seq-entry containing only the needed...
void VerifyMolType(CBioseq_Handle bsh)
bool HasNameAndAccession(const CSeq_id_Handle &idh)
static void s_ReadIdsFromFile(CNcbiIstream &istr, TIndexMapById &index_map, TBlobLocationList &blob_locations, TCachedSeqIds &cached_seq_ids)
TIndexMapById::iterator TIndexRef
static CSeq_inst::EMol s_MolType
deque< SSubcacheIndexData > TBlobLocationList
vector< pair< SBlobLocator, TBlobLocationEntry > > TIndexMapByBlob
list< TIndexRef > TIndexRefList
bool s_RemoveAnnotsFromEntry(CSeq_entry &entry)
static bool s_ShouldFetchOneByOne(TIndexMapById &ids)
set< CSeq_id_Handle, CSeq_id_Handle::PLessOrdered > TCachedSeqIds
static bool s_RemoveAnnot
map< SSeqIdIndex, TBlobLocationEntry > TIndexMapById
int main(int argc, const char *argv[])
static bool s_TrimLargeNucprots
USING_NCBI_SCOPE
SSubcacheIndexData * TBlobLocationEntry
void ExtractExtraIds(CBioseq_Handle bsh, vector< CSeq_id_Handle > &extra_ids, bool extract_delta, bool extract_products)
CSeq_id_Handle StrippedAccVer(const CSeq_id_Handle &idh)
#define _ASSERT
ZLib Compression API.