NCBI C++ ToolKit
dump_asn_index.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dump_asn_index.cpp 98384 2022-11-03 18:45:14Z whlavina $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Cheinan Marks
27  *
28  * File Description:
29  * Produce an ASN.1 cache index from CSeq_entry blobs passed from ID.
30  * This module is built into a library designed to be linked into an
31  * ID team machine with direct access to the databases.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <ctime>
36 #include <sstream>
37 #include <algorithm>
38 #include <string>
39 #include <limits>
40 
41 #include <corelib/version.hpp>
42 #include <corelib/ncbitime.hpp>
43 #include <corelib/rwstream.hpp>
44 #include <corelib/ncbifile.hpp>
45 
46 #include <util/compress/stream.hpp>
47 #include <util/compress/zlib.hpp>
48 
50 #include <objects/seq/Bioseq.hpp>
58 #include <objects/seq/Seq_inst.hpp>
59 
60 #include <serial/serial.hpp>
61 #include <serial/objostrasnb.hpp>
62 
63 #include "md5_writer.hpp"
64 
71 
74 
76  const CTime & timestamp,
77  SSatSatKey /*blob_id*/ )
78 {
// Resets m_CacheBlob, builds it from `entry` via x_CreateBlob(), then calls
// x_WriteBlob() and x_BuildIndexAndSeqIdInfo(). The whole call is bracketed
// by m_DumpSW.Start()/Stop(). The SSatSatKey argument is unnamed and unused.
79  m_DumpSW.Start();
80
// NOTE(review): this copies the CRef handle, not the CSeq_entry, so
// Parentize() presumably acts on the caller's object -- confirm intended.
81  CRef<CSeq_entry> parentized_entry = entry;
82  parentized_entry->Parentize();
// Clear the member blob before x_CreateBlob() fills it for this entry.
83  m_CacheBlob.Reset();
84  x_CreateBlob( *parentized_entry, timestamp);
86  x_WriteBlob();
// Indexing runs after the write and reads m_Offset / m_Size.
88  x_BuildIndexAndSeqIdInfo( *parentized_entry );
89
90  m_DumpSW.Stop();
91 }
92 
93 
95 {
96 }
97 
98 
// Stamps m_CacheBlob with `timestamp` (converted through CTime::GetTimeT())
// and packs `entry` into it. The work is bracketed by
// m_CreateBlobSW.Start()/Stop().
//
// @param entry      Seq-entry passed to m_CacheBlob.Pack().
// @param timestamp  Time recorded on the blob through SetTimestamp().
99 void CDumpASNIndex::x_CreateBlob ( const CSeq_entry & entry, const CTime & timestamp )
100 {
101  m_CreateBlobSW.Start();
102
103  // CAsnCache can't store dates after Jan 18 2038. There is a compile-time check
104  // below that would stop compilation in 2035, but it is left disabled on request
105  // because pre-C++11 compilers are still supported.
106  //static_assert ( !(__DATE__[9] >= '3' && __DATE__ [10] >= '5'), "AsnCache doesn't support dates after Jan 18 2038. Today is " __DATE__ );
107
108  m_CacheBlob.SetTimestamp( timestamp.GetTimeT() );
109  m_CacheBlob.Pack(entry);
110
111  m_CreateBlobSW.Stop();
112 }
113 
114 
116 {
118  CDir root_dir( m_RootDirPath );
119  if ( root_dir.Exists() ) {
120  // Do nothing.
121  } else {
122  if ( ! root_dir.CreatePath() ) {
123  int saved_errno = NCBI_ERRNO_CODE_WRAPPER();
124  std::string error_string = "Attempted path was \"" + m_RootDirPath;
125  error_string += "\". errno was " + NStr::IntToString( saved_errno );
126  error_string += ": " + std::string( NCBI_ERRNO_STR_WRAPPER( saved_errno ) );
127 
128  NCBI_THROW( CASNCacheException, eRootDirectoryCreationFailed, error_string );
129  }
130  }
131 }
132 
133 
135 {
136  m_WriteBlobSW.Start();
137 
141 
142  LOG_POST(Info << "Blob written @ chunk " << m_ChunkFile.GetChunkSerialNum()
143  << ", offset " << m_Offset << ", size " << m_Size );
144 
145  m_WriteBlobSW.Stop();
146 }
147 
148 
// Walks `entry` recursively and indexes every Bioseq it reaches.
//  - Set entry: recurse into each member of the Seq-set.
//  - Seq entry: register the Bioseq in m_MainIndex using the cache blob
//    timestamp, chunk_id, m_Offset and m_Size; then write its Seq-ids to
//    m_SeqIdChunkFile and register that record in m_SeqIdIndex with
//    chunk id 0 and the offset/size of that write.
// Throws CASNCacheException from the two size-related NCBI_THROW sites.
//
// NOTE(review): m_BuildIndexEntrySW.Start()/Stop() are also executed by every
// recursive call; confirm the stopwatch tolerates nested Start().
// NOTE(review): both throws use eRootDirectoryCreationFailed although the
// message is about object size (and reads "to big") -- looks like a
// borrowed error code; confirm before relying on it.
149 void CDumpASNIndex::x_BuildIndexAndSeqIdInfo( const objects::CSeq_entry & entry )
150 {
151  m_BuildIndexEntrySW.Start();
152
153  if ( entry.IsSet() ) {
154  ITERATE (CSeq_entry::TSet::TSeq_set, iter, entry.GetSet().GetSeq_set()) {
155  x_BuildIndexAndSeqIdInfo( **iter );
156  }
157  } else if ( entry.IsSeq() ) {
158  CAsnIndex::TTimestamp timestamp = m_CacheBlob.GetTimestamp();
160  const CBioseq & bio_seq = entry.GetSeq();
162  NCBI_THROW( CASNCacheException, eRootDirectoryCreationFailed,
163  "Sequence is to big (AsnCache supports only objects <= 4Gb)" );
164  }
165  IndexABioseq( bio_seq, m_MainIndex, timestamp, chunk_id, m_Offset, CAsnIndex::TSize(m_Size) );
// Remember where the Seq-id record starts so its length can be derived
// from the file offset after the write.
166  Int8 seq_id_offset = m_SeqIdChunkFile.GetOffset();
167  m_SeqIdChunkFile.Write( bio_seq.GetId() );
168  size_t size = m_SeqIdChunkFile.GetOffset() - seq_id_offset;
170  NCBI_THROW( CASNCacheException, eRootDirectoryCreationFailed,
171  "Sequence is to big (AsnCache supports only objects <= 4Gb)" );
172  }
173  IndexABioseq( bio_seq, m_SeqIdIndex, timestamp, 0,
174  seq_id_offset, CAsnIndex::TSize(size) );
175  } else {
176  // This should never happen.
177  _ASSERT( false );
178  }
179
180  m_BuildIndexEntrySW.Stop();
181 }
182 
183 
185 {
// Returns the id of the first "taxon" Dbtag found on an Org or Source
// descriptor of bio_seq, or 0 when no such tag is found. A tag whose id is
// 0 is treated the same as "not found" and the search continues.
186  Uint4 taxid = 0;
187  if (bio_seq.IsSetDescr() && bio_seq.GetDescr().IsSet()) {
188  ITERATE (CBioseq::TDescr::Tdata, it, bio_seq.GetDescr().Get()) {
189  const CSeqdesc& desc = **it;
// Only Org and Source descriptors supply an Org-ref here; for any other
// descriptor kind `org` stays NULL and the descriptor is skipped.
190  const COrg_ref* org = NULL;
191  switch (desc.Which()) {
192  case CSeqdesc::e_Org:
193  org = &desc.GetOrg();
194  break;
195
196  case CSeqdesc::e_Source:
197  org = &desc.GetSource().GetOrg();
198  break;
199
200  default:
201  break;
202  }
203  if (org && org->IsSetDb()) {
204  ITERATE (COrg_ref::TDb, dbiter, org->GetDb()) {
205  if ((*dbiter)->GetDb() == "taxon") {
// NOTE(review): presumably GetId() requires the tag to hold the integer
// variant -- confirm taxon tags are never stored as strings.
206  taxid = (*dbiter)->GetTag().GetId();
207  break;
208  }
209  }
210  }
211
// Stop scanning descriptors once a non-zero taxid has been picked up.
212  if (taxid) {
213  break;
214  }
215  }
216  }
217
218  return taxid;
219 }
220 
221 
223 {
225  NASNCacheFileName::GetHeader() ).c_str(), std::ios::binary );
226  header_stream.write( reinterpret_cast<const char *>( &kMajorVersion ), sizeof( kMajorVersion ) );
227  header_stream.write( reinterpret_cast<const char *>( &kMinorVersion ), sizeof( kMinorVersion ) );
229  std::string header(version.PrintJson());
230  header_stream << header << flush;
231 }
232 
233 
235 {
236  if ( m_CreateBlobSW.EIsInstantiated ) {
237  LOG_POST( Info << "CreateBlob stopwatch measured " << m_CreateBlobSW.Elapsed()
238  << " seconds." );
239  }
240  if ( m_WriteBlobSW.EIsInstantiated ) {
241  LOG_POST( Info << "WriteBlob stopwatch measured " << m_WriteBlobSW.Elapsed() << " seconds." );
242  }
243  if ( m_BuildIndexEntrySW.EIsInstantiated ) {
244  LOG_POST( Info << "BuildIndexEntry stopwatch measured " << m_BuildIndexEntrySW.Elapsed()
245  << " seconds." );
246  }
247  if ( m_DumpSW.EIsInstantiated ) {
248  LOG_POST( Info << "Dump stopwatch measured " << m_DumpSW.Elapsed() << " seconds." );
249  }
250 }
251 
253 
size_t IndexABioseq(const objects::CBioseq &bioseq, CAsnIndex &index, CAsnIndex::TTimestamp timestamp, CAsnIndex::TChunkId chunk_id, CAsnIndex::TOffset offset, CAsnIndex::TSize size)
Definition: asn_index.cpp:197
Uint4 TChunkId
Definition: asn_index.hpp:59
Uint4 TSize
Definition: asn_index.hpp:61
Uint4 TTimestamp
Definition: asn_index.hpp:58
Int8 GetOffset()
Definition: chunk_file.hpp:72
void OpenForWrite(const std::string &root_path="")
Definition: chunk_file.cpp:54
void Write(const CCache_blob &cache_blob)
Definition: chunk_file.cpp:141
unsigned int GetChunkSerialNum() const
Definition: chunk_file.hpp:73
CDir –.
Definition: ncbifile.hpp:1695
objects::CCache_blob m_CacheBlob
CTempStopWatch< EIsInstantiated > m_BuildIndexEntrySW
CTempStopWatch< EIsInstantiated > m_DumpSW
CChunkFile m_ChunkFile
void x_BuildIndexAndSeqIdInfo(const objects::CSeq_entry &entry)
std::string m_RootDirPath
CAsnIndex m_SeqIdIndex
std::streampos m_Offset
void DumpBlob(const CRef< objects::CSeq_entry > entry, const CTime &timestamp, SSatSatKey blob_id)
CTempStopWatch< EIsInstantiated > m_CreateBlobSW
Uint4 x_GetTaxId(const objects::CBioseq &bio_seq)
CAsnIndex m_MainIndex
CSeqIdChunkFile m_SeqIdChunkFile
CTempStopWatch< EIsInstantiated > m_WriteBlobSW
void x_CreateBlob(const objects::CSeq_entry &entry, const CTime &timestamp)
SSatSatKeyRange m_SatId
void OpenForWrite(const std::string &root_path="")
void Write(const objects::CBioseq::TId &seq_ids)
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
CTime –.
Definition: ncbitime.hpp:296
Define CVersionInfo, a version info storage class.
USING_SCOPE(objects)
const Uint2 kMinorVersion
const Uint2 kMajorVersion
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_ERRNO_CODE_WRAPPER
Definition: ncbiexpt.hpp:1529
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define NCBI_ERRNO_STR_WRAPPER
Definition: ncbiexpt.hpp:1530
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
bool CreatePath(TCreateFlags flags=fCreate_Default) const
Create the directory path recursively possibly more than one at a time.
Definition: ncbifile.cpp:4106
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
static string ConcatPath(const string &first, const string &second)
Concatenate two parts of the path for the current OS.
Definition: ncbifile.cpp:776
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
time_t GetTimeT(void) const
Get time in time_t format.
Definition: ncbitime.cpp:1395
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetDb(void) const
IDs in taxonomic or culture databases. Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
vector< CRef< CDbtag > > TDb
Definition: Org_ref_.hpp:101
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
list< CRef< CSeq_entry > > TSeq_set
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsSet(void) const
Check if a value has been assigned to data member.
Definition: Seq_descr_.hpp:154
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
static int version
Definition: mdb_load.c:29
string GetHeader()
Definition: file_names.hpp:62
const struct ncbi::grid::netcache::search::fields::SIZE size
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
Reader-writer based streams.
string AsString() const
#define _ASSERT
ZLib Compression API.
Modified on Thu Apr 25 08:16:21 2024 by modify_doxy.py rev. 669887