NCBI C++ ToolKit
split_info_utils.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: split_info_utils.cpp 95765 2021-12-20 19:48:24Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description: processor for data from OSG
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <corelib/rwstream.hpp>
35 #include <serial/objistr.hpp>
36 #include <serial/serial.hpp>
38 #include <util/compress/stream.hpp>
39 #include <util/compress/zlib.hpp>
40 
44 
45 #include <strstream>
46 
47 #include "split_info_utils.hpp"
48 
51 
52 
54  : m_Gzip(false)
55 {
56 }
57 
58 
60 {
61 }
62 
63 
64 void CDataChunkStream::AddDataChunk(const unsigned char * data,
65  unsigned int size,
66  int chunk_no)
67 {
68  m_Chunks[chunk_no].assign(data, data+size);
69 }
70 
71 
73 
74 
75 class CDataChunkReader : public IReader
76 {
77 public:
79 
// Set up a reader that starts at the first entry of "chunks".
// Only iterators into "chunks" are stored (begin and end), so the container
// must outlive this reader.
// NOTE(review): the initializer list below ends with a comma and the listing
// skips a numbered line, so at least one more member initializer is not
// visible here -- confirm against the real source file.
80  explicit
81  CDataChunkReader(const TChunks& chunks)
82  : m_EndOfChunks(chunks.end()),
83  m_CurrentChunk(chunks.begin()),
85  {
86  }
87 
88  virtual ERW_Result Read(void* buf,
89  size_t count,
90  size_t* bytes_read = 0) override
91  {
92  if ( x_Advance() ) {
93  count = min(count, x_AvaliableCount());
94  memcpy(buf, x_GetCurrentDataPtr(), count);
95  m_CurrentChunkPos += count;
96  *bytes_read = count;
97  return eRW_Success;
98  }
99  else {
100  *bytes_read = 0;
101  return eRW_Eof;
102  }
103  }
104 
105  virtual ERW_Result PendingCount(size_t* count) override
106  {
107  if ( x_Advance() ) {
108  *count = x_AvaliableCount();
109  return eRW_Success;
110  }
111  else {
112  *count = 0;
113  return eRW_Eof;
114  }
115  }
116 
117 private:
118  const char* x_GetCurrentDataPtr() const
119  {
120  return m_CurrentChunk->second.data() + m_CurrentChunkPos;
121  }
122  size_t x_AvaliableCount() const
123  {
124  return m_CurrentChunk->second.size() - m_CurrentChunkPos;
125  }
126  bool x_Advance()
127  {
128  for ( ;; ) {
129  if ( m_CurrentChunk == m_EndOfChunks ) {
130  return false;
131  }
132  else if ( m_CurrentChunkPos >= m_CurrentChunk->second.size() ) {
133  ++m_CurrentChunk;
134  m_CurrentChunkPos = 0;
135  }
136  else {
137  return true;
138  }
139  }
140  }
141 
145 };
146 
148 
149 
151 {
152  m_Stream = make_unique<CRStream>(new CDataChunkReader(m_Chunks),
153  0, nullptr, CRWStreambuf::fOwnReader);
154  return *m_Stream;
155 }
156 
157 
// Find the ids of the chunks that contain the requested Bioseq, starting
// from a serialized split info.
// The bytes [data, data+size) are registered with "buf" under "chunk_no",
// the gzip setting is taken from the eGzip flag of "blob", and the split
// info deserialized from "buf" is passed on to the
// GetBioseqChunks(seq_id, split_info) overload, whose result is returned.
// NOTE(review): the declaration of "buf" is not visible in this listing (a
// numbered line is skipped) -- presumably a CDataChunkStream; confirm
// against the real source file.
158 vector<int> GetBioseqChunks(const CSeq_id& seq_id,
159  const CBlobRecord& blob,
160  const unsigned char * data,
161  unsigned int size,
162  int chunk_no)
163 {
165  buf.AddDataChunk(data, size, chunk_no);
166  buf.SetGzip(blob.GetFlag(EBlobFlags::eGzip));
167  return GetBioseqChunks(seq_id, *buf.DeserializeSplitInfo());
168 }
169 
170 
171 static
172 bool s_Matches(const CSeq_id& seq_id,
173  const CID2S_Bioseq_Ids::C_E& id)
174 {
175  if ( id.IsSeq_id() ) {
176  if ( auto req_textid = seq_id.GetTextseq_Id() ) {
177  if ( auto seq_textid = id.GetSeq_id().GetTextseq_Id() ) {
178  if ( req_textid->IsSetAccession() && seq_textid->IsSetAccession() &&
179  NStr::EqualNocase(req_textid->GetAccession(), seq_textid->GetAccession()) ) {
180  if ( !req_textid->IsSetVersion() ||
181  (seq_textid->IsSetVersion() &&
182  req_textid->GetVersion() == seq_textid->GetVersion()) ) {
183  return true;
184  }
185  }
186  if ( req_textid->IsSetName() && seq_textid->IsSetName() &&
187  NStr::EqualNocase(req_textid->GetName(), seq_textid->GetName()) ) {
188  return true;
189  }
190  }
191  }
192  return seq_id.Equals(id.GetSeq_id());
193  }
194  else if ( !seq_id.IsGi() ) {
195  return false;
196  }
197  if ( id.IsGi() ) {
198  return seq_id.GetGi() == id.GetGi();
199  }
200  else {
201  auto& gi_range = id.GetGi_range();
202  return
203  seq_id.GetGi() >= gi_range.GetStart() &&
204  seq_id.GetGi() < GI_FROM(TIntId, GI_TO(TIntId, gi_range.GetStart())+gi_range.GetCount());
205  }
206 }
207 
208 
209 static
210 bool s_ContainsBioseq(const CSeq_id& seq_id,
211  const CID2S_Chunk_Info& chunk)
212 {
213  for ( auto& content : chunk.GetContent() ) {
214  if ( content->IsBioseq_place() ) {
215  for ( auto& place : content->GetBioseq_place() ) {
216  for ( auto& id : place->GetSeq_ids().Get() ) {
217  if ( s_Matches(seq_id, *id) ) {
218  return true;
219  }
220  }
221  }
222  }
223  }
224  return false;
225 }
226 
227 
228 vector<int> GetBioseqChunks(const CSeq_id& seq_id,
229  const CID2S_Split_Info& split_info)
230 {
231  vector<int> ret;
232  for ( auto& chunk : split_info.GetChunks() ) {
233  if ( s_ContainsBioseq(seq_id, *chunk) ) {
234  ret.push_back(chunk->GetId());
235  }
236  }
237  return ret;
238 }
239 
240 
242 {
244  CNcbiIstream* in = &GetStream();
245  unique_ptr<CNcbiIstream> z_stream;
246  if ( IsGzip() ) {
247  z_stream.reset(new CCompressionIStream(*in,
250  in = z_stream.get();
251  }
252  unique_ptr<CObjectIStream> obj_stream(CObjectIStream::Open(eSerial_AsnBinary, *in));
253  *obj_stream >> *info;
254  return info;
255 }
256 
257 
bool GetFlag(EBlobFlags flag_value) const
virtual ERW_Result Read(void *buf, size_t count, size_t *bytes_read=0) override
Read as many as "count" bytes into a buffer pointed to by the "buf" argument.
size_t x_AvaliableCount() const
TChunks::const_iterator m_CurrentChunk
map< int, vector< char > > TChunks
virtual ERW_Result PendingCount(size_t *count) override
Via parameter "count" (which is guaranteed to be supplied non-NULL) return the number of bytes that a...
CDataChunkReader(const TChunks &chunks)
const char * x_GetCurrentDataPtr() const
TChunks::const_iterator m_EndOfChunks
map< int, vector< char > > m_Chunks
CRef< CID2S_Split_Info > DeserializeSplitInfo()
void AddDataChunk(const unsigned char *data, unsigned int size, int chunk_no)
CNcbiIstream & GetStream()
unique_ptr< CNcbiIstream > m_Stream
CID2S_Chunk_Info –.
CID2S_Split_Info –.
@ fOwnReader
Own the underlying reader.
Definition: rwstreambuf.hpp:66
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
A very basic data-read interface.
container_type::const_iterator const_iterator
Definition: map.hpp:53
Definition: map.hpp:338
#define false
Definition: bool.h:36
char data[12]
Definition: iconv.c:80
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
Int8 TIntId
Definition: ncbimisc.hpp:999
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
@ fGZip
Set of flags for gzip file support. See each flag description above.
Definition: zlib.hpp:120
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
ERW_Result
Result codes for I/O operations.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
@ eRW_Eof
End of data, should be considered permanent.
@ eRW_Success
Everything is okay, I/O completed.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
const TChunks & GetChunks(void) const
Get the Chunks member data.
const TContent & GetContent(void) const
Get the Content member data.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
char * buf
static MDB_envinfo info
Definition: mdb_load.c:37
#define z_stream
Definition: miniz.h:442
const struct ncbi::grid::netcache::search::fields::SIZE size
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
Reader-writer based streams.
static bool s_Matches(const CSeq_id &seq_id, const CID2S_Bioseq_Ids::C_E &id)
BEGIN_LOCAL_NAMESPACE
vector< int > GetBioseqChunks(const CSeq_id &seq_id, const CBlobRecord &blob, const unsigned char *data, unsigned int size, int chunk_no)
END_LOCAL_NAMESPACE
END_NCBI_NAMESPACE
BEGIN_NCBI_NAMESPACE
END_NAMESPACE(psg)
BEGIN_NAMESPACE(psg)
static bool s_ContainsBioseq(const CSeq_id &seq_id, const CID2S_Chunk_Info &chunk)
ZLib Compression API.
Modified on Wed May 15 15:06:19 2024 by modify_doxy.py rev. 669887