NCBI C++ ToolKit
writedb_gimask.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_gimask.cpp 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ning Ma
27  *
28  */
29 
30 /// @file writedb_gimask.cpp
31 /// Implementation for the CWriteDB_GiMask and related classes.
32 #include <ncbi_pch.hpp>
34 #include "writedb_gimask.hpp"
35 
37 
38 /// Import C++ std namespace.
40 
41 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
42  (!defined(NCBI_COMPILER_MIPSPRO)) )
43 
44 // CWriteDB_GiMask
45 
46 CWriteDB_GiMask::CWriteDB_GiMask(const string & maskname,
47  const string & desc,
48  Uint8 max_file_size):
49  m_MaskName (maskname),
50  m_MaxFileSize (max_file_size),
51  m_DFile (new CWriteDB_GiMaskData(maskname, "gmd", 0, max_file_size)),
52  m_DFile_LE (new CWriteDB_GiMaskData(maskname, "gnd", 0, max_file_size, true)),
53  m_OFile (new CWriteDB_GiMaskOffset(maskname, "gmo", max_file_size)),
54  m_OFile_LE (new CWriteDB_GiMaskOffset(maskname, "gno", max_file_size, true)),
55  m_IFile (new CWriteDB_GiMaskIndex(maskname, "gmi", desc, max_file_size)),
56  m_IFile_LE (new CWriteDB_GiMaskIndex(maskname, "gni", desc, max_file_size, true))
57 { }
58 
59 void CWriteDB_GiMask::ListFiles(vector<string> & files) const
60 {
61  if (!m_GiOffset.size()) return;
62  files.push_back(m_IFile->GetFilename());
63  files.push_back(m_IFile_LE->GetFilename());
64  files.push_back(m_OFile->GetFilename());
65  files.push_back(m_OFile_LE->GetFilename());
66  files.push_back(m_DFile->GetFilename());
67  files.push_back(m_DFile_LE->GetFilename());
68 }
69 
/// Add a mask data for a sequence represented by a set of GIs.
///
/// @param GIs   GIs that all refer to the same mask data.
/// @param mask  Mask ranges to store for these GIs.
///
/// NOTE(review): listing lines 77 and 81-83 are missing from this extract.
/// The variable `offset` used in the loop below is defined in the missing
/// part; presumably the mask is also written to both data files there
/// (cf. WriteMask / GetOffset) -- confirm against the full file.
70 void CWriteDB_GiMask::AddGiMask(const vector<TGi> & GIs,
71  const TPairVector & mask)
72 {
    // The current data volume cannot take this batch: close both data files
    // and continue with the next volume index.
73  if (!m_DFile->CanFit(mask.size())) {
74  int index = m_DFile->GetIndex() + 1;
75  m_DFile->Close();
76  m_DFile_LE->Close();
    // (listing line 77 missing here; presumably the matching reset of
    //  m_DFile -- TODO confirm)
78  m_DFile_LE.Reset(new CWriteDB_GiMaskData(m_MaskName, "gnd", index, m_MaxFileSize, true));
79  }
80
84
    // Every GI of the set is recorded with the same offset value.
85  ITERATE(vector<TGi>, gi, GIs) {
86  m_GiOffset.push_back(pair<TGi, TOffset> (*gi, offset));
87  }
88 }
89 
// NOTE(review): the signature line (listing line 90) is missing from this
// extract; presumably this is CWriteDB_GiMask::Close(), described elsewhere
// on this page as "Flush data to disk and close all associated files".
// Listing lines 103-104, 115 and 118 are missing as well.
91 {
    // No GI was ever added: clear the mask name and leave without touching
    // any of the files.
92  if (!m_GiOffset.size()) {
93  // unused mask file
94  m_MaskName = "";
95  return;
96  }
97
    // Finish the current data volume in both byte orders.
98  m_DFile->Close();
99  m_DFile_LE->Close();
100
    // Data file indices start at 0, so last index + 1 is the volume count.
101  int num_vols = m_DFile->GetIndex() + 1;
102  if (num_vols == 1) {
    // (listing lines 103-104 missing; presumably the single-volume rename of
    //  the data files, cf. RenameSingle() -- TODO confirm)
105  }
106
    // Order the (GI, offset) pairs before they are written to the index files.
107  sort(m_GiOffset.begin(), m_GiOffset.end());
108
109  m_IFile->AddGIs(m_GiOffset, num_vols);
110  m_IFile->Close();
111
112  m_IFile_LE->AddGIs(m_GiOffset, num_vols);
113  m_IFile_LE->Close();
114
    // (listing line 115 missing; presumably m_OFile receives the sorted
    //  pairs here -- TODO confirm)
116  m_OFile->Close();
117
    // (listing line 118 missing; presumably the same for m_OFile_LE
    //  -- TODO confirm)
119  m_OFile_LE->Close();
120 }
121 
122 // CWriteDB_GiMaskOffset
123 
/// Constructor for gimask offset file.
///
/// @param maskname       Passed through to the CWriteDB_File base.
/// @param extn           File extension, passed through to the base.
/// @param max_file_size  Passed through to the base.
/// @param le             Stored in m_UseLE (use little endian?).
///
/// NOTE(review): listing line 124 (presumably the `CWriteDB_GiMaskOffset::`
/// qualifier) is missing from this extract.
125 CWriteDB_GiMaskOffset(const string & maskname,
126  const string & extn,
127  Uint8 max_file_size,
128  bool le)
    // The base is constructed with index -1 and a final `false` argument;
    // NOTE(review): their meaning is defined by CWriteDB_File, not visible here.
129  : CWriteDB_File (maskname, extn, -1, max_file_size, false),
130  m_UseLE (le)
131 { }
132 
// Add sequence GI to the offset file.
//
// NOTE(review): the signature (listing line 133) and listing lines 135-136
// are missing from this extract.  Presumably this is
// CWriteDB_GiMaskOffset::AddGIs(const TGiOffset & gi_offset) and the missing
// lines declare the blobs `gis` and `offsets` used below -- confirm against
// the full file.
134 {
137
    // Create and open the file on first use.
138  if ( ! m_Created) {
139  Create();
140  }
141
    // Number of records buffered since the last flush.
142  int i = 0;
143  ITERATE(TGiOffset, iter, gi_offset) {
    // Each record is one 4-byte GI (into `gis`) plus the two 4-byte
    // components of its offset pair (into `offsets`).  With NCBI_INT8_GI the
    // GI is converted to int via GI_TO before it is written.
144 #ifdef NCBI_INT8_GI
145  if (m_UseLE) {
146  gis.WriteInt4_LE(GI_TO(int, iter->first));
147  offsets.WriteInt4_LE(iter->second.first);
148  offsets.WriteInt4_LE(iter->second.second);
149  } else {
150  gis.WriteInt4(GI_TO(int, iter->first));
151  offsets.WriteInt4(iter->second.first);
152  offsets.WriteInt4(iter->second.second);
153  }
154 #else
155  if (m_UseLE) {
156  gis.WriteInt4_LE(iter->first);
157  offsets.WriteInt4_LE(iter->second.first);
158  offsets.WriteInt4_LE(iter->second.second);
159  } else {
160  gis.WriteInt4(iter->first);
161  offsets.WriteInt4(iter->second.first);
162  offsets.WriteInt4(iter->second.second);
163  }
164 #endif
165
166  ++i;
167
    // A full page is buffered: write all its GIs first, then all its
    // offsets, and start a new page.
168  if (i== kPageSize) {
169  Write(gis.Str());
170  Write(offsets.Str());
171  gis.Clear();
172  offsets.Clear();
173  i = 0;
174  }
175  }
176
177  // flush the residual records
178  if (i) {
179  Write(gis.Str());
180  Write(offsets.Str());
181  gis.Clear();
182  offsets.Clear();
183  }
184 }
185 
186 // CWriteDB_GiMaskIndex
187 
/// Constructor for gimask index file.
///
/// @param maskname       Passed through to the CWriteDB_GiMaskOffset base.
/// @param extn           File extension, passed through to the base.
/// @param desc           Description of this gimask; stored in m_Desc.
/// @param max_file_size  Passed through to the base.
/// @param le             Little-endian flag, passed through to the base.
///
/// NOTE(review): listing lines 188 (presumably the `CWriteDB_GiMaskIndex::`
/// qualifier) and 197 (the only statement of the body) are missing from this
/// extract.  m_Date is written by x_BuildHeaderFields but is not initialised
/// in the visible lines, so it is presumably set on line 197 -- TODO confirm.
189 CWriteDB_GiMaskIndex(const string & maskname,
190  const string & extn,
191  const string & desc,
192  Uint8 max_file_size,
193  bool le)
194  : CWriteDB_GiMaskOffset (maskname, extn, max_file_size, le),
195  m_Desc (desc)
196 {
198 }
199 
// Add sequence GI to the index file.
//
// Writes the header followed by a sparse index: only the first entry of each
// page of kPageSize entries, plus the very last entry, are written.
//
// NOTE(review): the first signature line (listing line 200) and listing lines
// 205-207 are missing from this extract.  Presumably this is
// CWriteDB_GiMaskIndex::AddGIs(const TGiOffset & gi_offset, int num_vols) and
// the missing lines declare the blobs `gis` and `offsets` -- confirm against
// the full file.
201  int num_vols)
202 {
203  m_NumGIs = gi_offset.size();
    // Provisional count; it is reset to 0 and recounted in the loop below.
    // NOTE(review): presumably used by the missing lines to size the blobs.
204  m_NumIndex = m_NumGIs / kPageSize + 2;
205
208
    // Create and open the file on first use.
209  if ( ! m_Created) {
210  Create();
211  }
212
    // Position of the current entry within gi_offset.
213  int i = 0;
214  m_NumIndex = 0;
215
216  ITERATE(TGiOffset, iter, gi_offset) {
    // Skip every entry that is neither the first of a page nor the last one.
217  if (i % kPageSize && i < m_NumGIs-1) {
218  ++i;
219  continue;
220  }
221
222  ++i;
223
    // One 4-byte GI plus the two 4-byte components of its offset pair.  With
    // NCBI_INT8_GI the GI is converted to int via GI_TO before it is written.
224 #ifdef NCBI_INT8_GI
225  if (m_UseLE ) {
226  gis.WriteInt4_LE(GI_TO(int, iter->first));
227  offsets.WriteInt4_LE(iter->second.first);
228  offsets.WriteInt4_LE(iter->second.second);
229  } else {
230  gis.WriteInt4(GI_TO(int, iter->first));
231  offsets.WriteInt4(iter->second.first);
232  offsets.WriteInt4(iter->second.second);
233  }
234 #else
235  if (m_UseLE ) {
236  gis.WriteInt4_LE(iter->first);
237  offsets.WriteInt4_LE(iter->second.first);
238  offsets.WriteInt4_LE(iter->second.second);
239  } else {
240  gis.WriteInt4(iter->first);
241  offsets.WriteInt4(iter->second.first);
242  offsets.WriteInt4(iter->second.second);
243  }
244 #endif
    // Count the entries that actually went into the index.
245  ++m_NumIndex;
246  }
247
    // The header needs the final m_NumIndex, so it is built after the loop;
    // file order is header, then all indexed GIs, then all their offsets.
248  x_BuildHeaderFields(num_vols);
249  Write(gis.Str());
250  Write(offsets.Str());
251 }
252 
// Build fixed length header fields.
//
// Assembles the index-file header in a blob and writes it to the file.
//
// NOTE(review): the signature (listing line 253) and listing line 269 are
// missing from this extract.  Presumably this is
// CWriteDB_GiMaskIndex::x_BuildHeaderFields(int num_vols), and line 269
// presumably pads the header (cf. WritePadBytes) -- confirm against the
// full file.
254 {
255  const int kFormatVersion = 1; // SeqDB has one of these.
256
257  CBlastDbBlob header;
258
    // Seven 4-byte fields, occupying bytes 0..27 of the header.
259  header.WriteInt4(kFormatVersion);
260  header.WriteInt4(num_vols);
261  header.WriteInt4(kGISize);
262  header.WriteInt4(kOffsetSize);
263  header.WriteInt4(kPageSize);
264  header.WriteInt4(m_NumIndex);
265  header.WriteInt4(m_NumGIs);
    // Placeholder for the eighth field, at byte offset 28.
266  header.WriteInt4(0); // index start will be calculated later
267  header.WriteString(m_Desc, kStringFmt);
268  header.WriteString(m_Date, kStringFmt);
270
    // The header is complete: its length is stored into the placeholder.
    // NOTE(review): assumes the two-argument WriteInt4 writes at the given
    // byte offset (28) -- confirm against CBlastDbBlob.
271  Int4 size = header.GetWriteOffset();
272  header.WriteInt4(size, 28);
273
274  Write(header.Str());
275 }
276 
277 // CWriteDB_GiMaskData
278 
// Constructor for a gimask data file.
//
// NOTE(review): the first signature line (listing line 279) is missing from
// this extract; presumably
// `CWriteDB_GiMaskData::CWriteDB_GiMaskData(const string & maskname,`.
//
// extn, index and max_file_size are passed through to the CWriteDB_File
// base; index is additionally kept in m_Index and le in m_UseLE.  The data
// length counter starts at 0.
280  const string & extn,
281  int index,
282  Uint8 max_file_size,
283  bool le)
284  : CWriteDB_File (maskname, extn, index, max_file_size, false),
285  m_DataLength (0),
286  m_UseLE (le),
287  m_Index (index)
288 { }
289 
// Write a new data blob.
//
// The blob consists of the number of ranges followed by the (first, second)
// values of every range, each written as a 4-byte integer.
//
// NOTE(review): the signature (listing line 290) and the loop headers
// (listing lines 301 and 307) are missing from this extract.  Presumably this
// is CWriteDB_GiMaskData::WriteMask(const TPairVector & mask) and each
// missing line iterates `range` over `mask` -- confirm against the full file.
291 {
292
    // An empty mask writes nothing and does not create the file.
293  if (! mask.size()) return;
294
    // Create and open the file on first use.
295  if (! m_Created) Create();
296
297  CBlastDbBlob data;
298
299  if (m_UseLE ) {
300  data.WriteInt4_LE(mask.size());
302  data.WriteInt4_LE(range->first);
303  data.WriteInt4_LE(range->second);
304  }
305  } else {
306  data.WriteInt4(mask.size());
308  data.WriteInt4(range->first);
309  data.WriteInt4(range->second);
310  }
311  }
312
313  Write(data.Str());
    // Bytes just written: one count plus two values per range, 4 bytes each.
314  m_DataLength += (1+2*mask.size()) * 4;
315 }
316 
317 #endif
318 
320 
ncbi::TMaskedQueryRegions mask
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
@ eString
Pad using NUL terminated string of '#' bytes.
Definition: seqdbblob.hpp:272
int GetWriteOffset() const
Get the current write pointer offset.
Definition: seqdbblob.cpp:552
void WritePadBytes(int align, EPadding fmt)
Align the offset by writing pad bytes.
Definition: seqdbblob.cpp:562
void Clear()
Clear all owned data and reference an empty string.
Definition: seqdbblob.cpp:58
void WriteInt4(Int4 x)
Write a 4 byte integer to the blob.
Definition: seqdbblob.cpp:323
CTempString Str() const
Get blob contents as a CTempString.
Definition: seqdbblob.cpp:526
void WriteInt4_LE(Int4 x)
Definition: seqdbblob.cpp:363
int WriteString(CTempString str, EStringFormat fmt)
Write string data to the blob.
Definition: seqdbblob.cpp:383
CTime –.
Definition: ncbitime.hpp:296
CWriteDB_IndexFile class.
bool m_Created
True if the file has already been opened.
const string & GetFilename() const
Get the current filename for this file.
void Create()
Create and open the file.
void Close()
Close the file, flushing any remaining data to disk.
unsigned int Write(const CTempString &data)
Write contents of a string to the file.
virtual void RenameSingle()
Rename this file, excluding the volume index.
CWriteDB_GiMaskData class.
void WriteMask(const TPairVector &mask)
Write a new data blob.
vector< pair< TSeqPos, TSeqPos > > TPairVector
TOffset GetOffset() const
Get current index/offset pair.
Uint8 m_DataLength
Length of data written so far.
int GetIndex() const
Get current index.
CWriteDB_GiMaskData(const string &maskname, const string &extn, int index, Uint8 max_file_size, bool le=false)
Constructor for a gimask data file.
bool m_UseLE
Use little endian?
bool CanFit(int num_masks) const
Tests whether there is room for another batch.
CWriteDB_GiMaskIndex class.
string m_Desc
Description of this gimask.
void AddGIs(const TGiOffset &gi_offset, int num_vols)
Add sequence GI to the offset file.
CWriteDB_GiMaskIndex(const string &maskname, const string &extn, const string &desc, Uint8 max_file_size, bool le=false)
Constructor for gimask index file.
static const CBlastDbBlob::EStringFormat kStringFmt
String format used by gimask files.
Int4 m_NumIndex
Number of GIs indexed.
string m_Date
Creation timestamp for this gimask.
Int4 m_NumGIs
Number of GIs.
void x_BuildHeaderFields(int num_vols)
Build fixed length header fields.
CWriteDB_GiMaskOffset class.
CWriteDB_GiMaskOffset(const string &maskname, const string &extn, Uint8 max_file_size, bool le=false)
Constructor for gimask offset file.
static const int kGISize
Size of a GI.
void AddGIs(const TGiOffset &gi_offset)
Add sequence GI to the offset file.
vector< pair< TGi, TOffset > > TGiOffset
static const int kOffsetSize
Size of offset entry.
bool m_UseLE
Use little endian?
static const int kPageSize
Page size.
void ListFiles(vector< string > &files) const
List Filenames.
CRef< CWriteDB_GiMaskIndex > m_IFile_LE
void AddGiMask(const vector< TGi > &GIs, const TPairVector &masks)
Add a mask data for a sequence represented by a set of GIs.
TGiOffset m_GiOffset
Sorted list of (GI, offset) pairs.
CRef< CWriteDB_GiMaskData > m_DFile
Data file.
CRef< CWriteDB_GiMaskData > m_DFile_LE
void Close()
Flush data to disk and close all associated files.
CRef< CWriteDB_GiMaskOffset > m_OFile
Offset file.
pair< int, int > TOffset
CRef< CWriteDB_GiMaskOffset > m_OFile_LE
vector< pair< TSeqPos, TSeqPos > > TPairVector
CRef< CWriteDB_GiMaskIndex > m_IFile
Index file.
CWriteDB_GiMask(const string &maskname, const string &desc, Uint8 max_file_size)
Construct WriteDB style database gimask.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1511
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
int i
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
bool le(T x_, T y_, T round_)
Definition: njn_approx.hpp:84
int offset
Definition: replacements.h:160
Defines BLAST database construction classes.
USING_SCOPE(std)
Import C++ std namespace.
Code for gi-based database mask file construction.
Modified on Tue Nov 28 02:28:57 2023 by modify_doxy.py rev. 669887