NCBI C++ ToolKit
lds2_handlers.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: lds2_handlers.cpp 91833 2020-12-14 18:27:22Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksey Grichenko
27  *
28  * File Description: LDS v.2 URL handlers.
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 #include <util/checksum.hpp>
35 #include <util/format_guess.hpp>
37 #include <util/compress/stream.hpp>
38 #include <util/compress/zlib.hpp>
42 
43 
44 #define NCBI_USE_ERRCODE_X Objtools_LDS2
45 
48 
49 
50 // Base handler implementation
51 
53 {
    // Populate the descriptor fields (format, CRC, size, timestamp) by
    // delegating to the handler's own Get* methods, so that derived
    // handlers which override them supply their own values.
54  info.format = GetFileFormat(info);
55  info.crc = GetFileCRC(info);
56  info.size = GetFileSize(info);
57  info.time = GetFileTime(info);
58 }
59 
60 
63 {
    // Open the URL at position 0 with no database (NULL), i.e. without
    // any chunk information.
64  shared_ptr<CNcbiIstream> in(OpenStream(file_info, 0, NULL));
    // OpenStream() yields an empty pointer when the stream cannot be opened.
65  if (!in.get()) {
67  }
    // Let CFormatGuess determine the format from the stream contents.
68  return CFormatGuess::Format(*in);
69 }
70 
71 
72 // Default (file) handler implementation
73 
75 {
    // Fixed identifier of the plain-file handler; the handler's constructor
    // passes the result of s_GetHandlerName() to the base class.
76  return "file";
77 }
78 
79 
81  : CLDS2_UrlHandler_Base(s_GetHandlerName()),
82  m_StreamCache(new CTls<TStreamCache>)
83 {
    // Nothing else to do: the base class receives the handler name and
    // m_StreamCache gets a fresh CTls slot. The TStreamCache itself is
    // created on first use in x_GetStreamCache().
84 }
85 
86 
// Open (or reuse) an input stream for a regular file and position it at
// stream_pos. The database argument is unused by this handler.
// Returns an empty pointer if the file cannot be opened.
87 shared_ptr<CNcbiIstream>
89  Int8 stream_pos,
90  CLDS2_Database* /*db*/)
91 {
92  shared_ptr<CNcbiIstream> in = OpenOrGetStream(file_info);
93  if (!in.get()) {
94  return nullptr;
95  }
96  // Chunks are not supported for regular files,
97  // offset is relative to the file start.
    // NOTE(review): `in` may be a stream reused from the stream cache.
    // std::istream::seekg() clears eofbit but does nothing when failbit is
    // set, so a cached stream left in a failed state by a previous reader
    // would not be repositioned -- confirm whether a clear() is needed here.
98  in->seekg(NcbiInt8ToStreampos(stream_pos));
99  return in;
100 }
101 
102 
104 {
    // Size of the file named by file_info.name, or -1 when the file
    // does not exist.
105  CFile f(file_info.name);
106  return f.Exists() ? f.GetLength() : -1;
107 }
108 
109 
111 {
    // Feed the whole file into the checksum object `crc` and return the
    // resulting value.
113  crc.AddFile(file_info.name);
114  return crc.GetChecksum();
115 }
116 
117 
119 {
    // Returns 0 when the file cannot be stat'ed.
    // NOTE(review): the value returned on success is stat.mtime_nsec, which
    // CFile::SStat documents as the nanoseconds field of the modification
    // time, not a full timestamp -- confirm this is the intended value.
120  CFile f(file_info.name);
121  CFile::SStat stat;
122  return f.Stat(&stat) ? stat.mtime_nsec : 0;
123 }
124 
125 
127 {
    // Fetch the cache stored in the CTls slot, creating it on first use.
128  TStreamCache* cache = m_StreamCache->GetValue();
129  if (!cache) {
130  cache = new TStreamCache;
    // Hand the new cache to the CTls slot together with the default
    // cleanup function for TStreamCache.
131  m_StreamCache->SetValue(cache, CTlsBase::DefaultCleanup<TStreamCache>);
132  }
133  return *cache;
134 }
135 
136 
// Parameter MAX_CACHED_STREAMS in section LDS2: upper bound on the number of
// open file streams kept in the stream cache by OpenOrGetStream().
// Declared with value 3 and eParam_NoThread (no per-thread values); a value
// of 0 disables stream caching altogether.
// NOTE(review): the trailing LDS2_MAX_CACHED_STREAMS argument is presumably
// the environment variable name -- verify against the NCBI_PARAM_DEF_EX docs.
137 NCBI_PARAM_DECL(size_t, LDS2, MAX_CACHED_STREAMS);
138 NCBI_PARAM_DEF_EX(size_t, LDS2, MAX_CACHED_STREAMS, 3, eParam_NoThread,
139  LDS2_MAX_CACHED_STREAMS);
140 typedef NCBI_PARAM_TYPE(LDS2, MAX_CACHED_STREAMS) TMaxCachedStreams;
141 
142 
// Return a binary input stream for file_info.name, reusing a previously
// opened stream from the stream cache when possible.
// Returns an empty pointer if the file cannot be opened.
// The stream position of a reused stream is whatever the previous user left
// it at; callers are expected to seek before reading.
143 shared_ptr<CNcbiIstream> CLDS2_UrlHandler_File::OpenOrGetStream(const SLDS2_File& file_info)
144 {
145  size_t max_streams = TMaxCachedStreams::GetDefault();
146  if (max_streams == 0) {
147  // Do not use cached streams at all.
148  unique_ptr<CNcbiIfstream> fin(new CNcbiIfstream(file_info.name.c_str(), ios::binary));
149  if (!fin->is_open()) {
150  return nullptr;
151  }
152  return shared_ptr<CNcbiIstream>(fin.release());
153  }
154 
    // Linear search of the cache by file name; the cache holds at most
    // max_streams entries.
155  TStreamCache& cache = x_GetStreamCache();
156  TStreamCache::iterator found = cache.end();
157  NON_CONST_ITERATE(TStreamCache, it, cache) {
158  if (it->first == file_info.name) {
159  found = it;
160  break;
161  }
162  }
164  if (found != cache.end()) {
    // Cache hit: copy the entry, then move it to the front of the cache so
    // that the most recently used stream is always first.
165  str = *found;
166  cache.erase(found);
167  cache.emplace_front(str);
168  }
169  else {
170  // Not yet cached
171  unique_ptr<CNcbiIfstream> fin(new CNcbiIfstream(file_info.name.c_str(), ios::binary));
172  if (!fin->is_open()) {
173  return nullptr;
174  }
175  str.first = file_info.name;
176  str.second.reset(fin.release());
    // Make room for the new entry by dropping entries from the back
    // (the ones used least recently) until the cache is below the limit.
177  while (!cache.empty() && cache.size() >= max_streams) {
178  cache.pop_back();
179  }
180  cache.emplace_front(str);
181  }
    // The returned shared_ptr shares ownership with the cache entry, so the
    // stream stays open for the caller even if the entry is evicted later.
182  return str.second;
183 }
184 
185 
186 // GZip file handler implementation
188 {
    // Fixed identifier of the gzip-file handler.
189  return "gzipfile";
190 }
191 
192 
194 {
196 }
197 
198 
200 {
    // Chunk-scan callback object: records every chunk reported through
    // OnChunk() in the database given at construction.
201 public:
    // Both arguments are kept by reference (see the constructor definition).
202  CGZipChunkHandler(const SLDS2_File& file_info,
203  CLDS2_Database& db);
204  virtual ~CGZipChunkHandler(void) {}
205 
    // Called with the position of the chunk in the raw file (raw_pos) and
    // the position passed as data_pos.
206  virtual EAction OnChunk(TPosition raw_pos, TPosition data_pos);
207 private:
210 };
211 
212 
214  CLDS2_Database& db)
215  : m_FileInfo(file_info),
216  m_Db(db)
217 {
    // m_FileInfo and m_Db are reference members: both the file descriptor
    // and the database must outlive this handler.
218 }
219 
220 
223 {
    // Store the reported chunk positions in the database for this file and
    // ask the scanner to continue with the next chunk.
224  SLDS2_Chunk chunk(raw_pos, data_pos);
225  m_Db.AddChunk(m_FileInfo, chunk);
226  return eAction_Continue;
227 }
228 
229 
231  CLDS2_Database& db)
232 {
233  // Collect information about chunks, store it in the database.
    // A private stream is opened here instead of using the stream cache.
234  unique_ptr<CNcbiIfstream> in(
235  new CNcbiIfstream(file_info.name.c_str(), ios::binary));
    // Silently do nothing if the file cannot be opened.
236  if ( !in->is_open() ) {
237  return;
238  }
    // The handler's OnChunk() adds each reported chunk to db.
239  CGZipChunkHandler chunk_handler(file_info, db);
240  g_GZip_ScanForChunks(*in, chunk_handler);
241 }
242 
243 
// Open a decompressing input stream for a gzip file, positioned at
// stream_pos. When a database is supplied, stored chunk information is used
// to start decompression at a chunk boundary instead of the file start.
// Returns an empty pointer if the file cannot be opened.
244 shared_ptr<CNcbiIstream>
246  Int8 stream_pos,
247  CLDS2_Database* db)
248 {
249  shared_ptr<CNcbiIstream> in = OpenOrGetStream(file_info);
250  if (!in.get()) {
251  return nullptr;
252  }
    // `rewind` stays true unless the raw stream is moved to a chunk start.
253  bool rewind = true;
254  if ( db ) {
255  // Try to use chunks information to optimize loading
256  SLDS2_Chunk chunk;
257  if ( db->FindChunk(file_info, chunk, stream_pos) ) {
258  if (chunk.raw_pos > 0) {
259  in->seekg(NcbiInt8ToStreampos(chunk.raw_pos));
260  rewind = false;
261  }
    // Make stream_pos relative to the start of the chunk that was found.
262  stream_pos -= chunk.stream_pos;
263  }
264  }
265  if ( rewind ) {
    // No usable chunk: start from the beginning of the raw file.
266  in->seekg(0);
267  }
    // NOTE(review): the decompression stream below is built over `*in`, a
    // reference to a stream owned by the stream cache, and the returned
    // pointer does not hold `in`. If the cache entry is evicted while the
    // returned stream is still in use, the raw stream may be destroyed
    // underneath it -- confirm the lifetime guarantees.
    // NOTE(review): same caveat as for regular files -- seekg() on a reused
    // stream is a no-op when failbit is set.
268  unique_ptr<CCompressionIStream> zin(
270  *in,
    // Skip the remaining stream_pos bytes of decompressed data.
273  zin->ignore(NcbiInt8ToStreampos(stream_pos));
274  return shared_ptr<CNcbiIstream>(zin.release());
275 }
276 
277 
Checksum and hash calculation classes.
CChecksum – Checksum calculator.
Definition: checksum.hpp:302
CFile –.
Definition: ncbifile.hpp:1604
EFormat
The formats are checked in the same order as declared here.
@ eUnknown
unknown format
static EFormat Format(const string &path, EOnError onerror=eDefault)
Guess file format.
const SLDS2_File & m_FileInfo
CGZipChunkHandler(const SLDS2_File &file_info, CLDS2_Database &db)
virtual ~CGZipChunkHandler(void)
CLDS2_Database & m_Db
virtual EAction OnChunk(TPosition raw_pos, TPosition data_pos)
Callback method, to be implemented by the end user.
void AddChunk(const SLDS2_File &file_info, const SLDS2_Chunk &chunk_info)
Store the chunk info in the database.
Definition: lds2_db.cpp:939
bool FindChunk(const SLDS2_File &file_info, SLDS2_Chunk &chunk_info, Int8 stream_pos)
Load chunk containing the required stream position.
Definition: lds2_db.cpp:953
Base class for URL handler.
virtual SLDS2_File::TFormat GetFileFormat(const SLDS2_File &file_info)
Methods for getting file information.
virtual shared_ptr< CNcbiIstream > OpenStream(const SLDS2_File &, Int8, CLDS2_Database *)=0
Open input stream for the URL at the specified position.
void SetHandlerName(const string &new_name)
Allow to change handler name by derived classes.
pair< string, TStream > TNamedStream
shared_ptr< CNcbiIstream > OpenStream(const SLDS2_File &file_info, Int8 stream_pos, CLDS2_Database *db) override
Open input stream for the URL at the specified position.
static const string s_GetHandlerName(void)
shared_ptr< CNcbiIstream > OpenOrGetStream(const SLDS2_File &file_info)
Int8 GetFileTime(const SLDS2_File &file_info) override
Get file timestamp - returns 0 by default.
Int8 GetFileSize(const SLDS2_File &file_info) override
Get file size - returns 0 by default.
deque< TNamedStream > TStreamCache
TStreamCache & x_GetStreamCache(void)
CRef< CTls< TStreamCache > > m_StreamCache
Uint4 GetFileCRC(const SLDS2_File &file_info) override
Get file CRC - returns 0 by default.
shared_ptr< CNcbiIstream > OpenStream(const SLDS2_File &file_info, Int8 stream_pos, CLDS2_Database *db) override
Open input stream for the URL at the specified position.
static const string s_GetHandlerName(void)
void SaveChunks(const SLDS2_File &file_info, CLDS2_Database &db) override
Save information about chunks for the URL in the database.
CLDS2_UrlHandler_GZipFile(void)
Create GZip file handler.
CTls –.
Definition: ncbithr.hpp:166
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
Interface class to scan data source for seekable data chunks.
Definition: compress.hpp:631
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
void AddFile(const string &file_path)
Update checksum with the file data.
Definition: checksum.cpp:373
Uint4 GetChecksum(void) const
Return calculated checksum.
Definition: checksum.hpp:341
@ fOwnReader
Delete the reader.
Definition: stream.hpp:129
void g_GZip_ScanForChunks(CNcbiIstream &is, IChunkHandler &handler)
Get list of positions of separate gzip files in the concatenated gzip file.
EAction
Action types.
Definition: compress.hpp:636
Uint8 TPosition
Type to store stream positions.
Definition: compress.hpp:633
@ eAction_Continue
Continue scanning to the next data chunk, if any.
Definition: compress.hpp:637
@ fGZip
Set of flags for gzip file support. See each flag description above.
Definition: zlib.hpp:120
long mtime_nsec
Nanoseconds for modification time.
Definition: ncbifile.hpp:825
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
NCBI_NS_STD::char_traits< char >::pos_type NcbiInt8ToStreampos(Int8 pos)
Convert plain numeric stream position (offset) into stream position usable with STL stream library.
Definition: ncbistre.hpp:782
void SetValue(TValue *value, FCleanup cleanup=0, void *cleanup_data=0, ENativeThreadCleanup native=eSkipCleanup)
Set value.
Definition: ncbithr.hpp:203
TValue * GetValue(void) const
Get the pointer previously stored by SetValue().
Definition: ncbithr.hpp:179
typedef NCBI_PARAM_TYPE(LDS2, MAX_CACHED_STREAMS) TMaxCachedStreams
NCBI_PARAM_DECL(size_t, LDS2, MAX_CACHED_STREAMS)
NCBI_PARAM_DEF_EX(size_t, LDS2, MAX_CACHED_STREAMS, 3, eParam_NoThread, LDS2_MAX_CACHED_STREAMS)
static MDB_envinfo info
Definition: mdb_load.c:37
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static const char * str(char *buf, int n)
Definition: stats.c:84
Alternate stat structure for use instead of the standard struct stat.
Definition: ncbifile.hpp:822
Chunk info.
Definition: lds2_db.hpp:107
Int8 stream_pos
Chunk position in the processed (e.g. unzipped) stream.
Definition: lds2_db.hpp:111
Int8 raw_pos
Chunk position in the raw file.
Definition: lds2_db.hpp:109
LDS2 database.
Definition: lds2_db.hpp:57
string name
Definition: lds2_db.hpp:61
ZLib Compression API.
Modified on Fri Dec 08 08:22:28 2023 by modify_doxy.py rev. 669887