NCBI C++ ToolKit
split_loader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: split_loader.cpp 92131 2020-12-22 16:49:57Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 * Sample split data loader
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
38 #include <serial/objistr.hpp>
39 #include <serial/serial.hpp>
40 #include <objmgr/impl/tse_info.hpp>
44 
45 #include "split_loader.hpp"
46 
49 
50 
53  const string& data_file,
54  CObjectManager::EIsDefault is_default,
55  CObjectManager::TPriority priority)
56 {
    // NOTE(review): the first lines of this signature (listing lines 51-52)
    // are missing from this extract; `om` used below is presumably the
    // object-manager parameter declared there -- confirm against the file.
    //
    // Wraps data_file in a loader maker, registers it through the
    // CDataLoader helper with the requested default flag and priority, and
    // returns the registration info reported by the maker.
57  TSplitLoaderMaker maker(data_file);
58  CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
59  return maker.GetRegisterInfo();
60 }
61 
62 
// Builds the loader name for a given data file: the fixed prefix
// "SPLIT_LOADER_DEMO:" followed by data_file exactly as passed in.
63 string CSplitDataLoader::GetLoaderNameFromArgs(const string& data_file)
64 {
65  return "SPLIT_LOADER_DEMO:" + data_file;
66 }
67 
68 
// Constructs the loader.
//   loader_name - forwarded to the CDataLoader base class.
//   data_file   - stored in m_DataFile; the file is not opened here.
// Both id counters (m_NextSeqsetId, m_NextChunkId) start at 0.
69 CSplitDataLoader::CSplitDataLoader(const string& loader_name,
70  const string& data_file)
71  : CDataLoader(loader_name),
72  m_DataFile(data_file),
73  m_NextSeqsetId(0),
74  m_NextChunkId(0)
75 {
76 }
77 
78 
80 {
    // Intentionally empty body.
    // NOTE(review): the signature (listing line 79) is missing from this
    // extract; by position this is presumably ~CSplitDataLoader -- confirm.
81 }
82 
83 
85 {
    // NOTE(review): the signature (listing line 84) is missing from this
    // extract; presumably CSplitDataLoader::DropTSE -- confirm.
86  // Release loaded data, reset chunk mappings
    // Drop this loader's reference to the loaded seq-entry.
87  m_TSE.Reset();
    // NOTE(review): listing lines 88-90 are missing here; the comment above
    // mentions resetting chunk mappings, which is not visible in this extract.
    // Restart seq-set and chunk id numbering from 0.
91  m_NextSeqsetId = 0;
92  m_NextChunkId = 0;
93 }
94 
95 
98  EChoice choice)
99 {
    // NOTE(review): the first signature lines (listing 96-97) are missing
    // from this extract; `idh` is presumably the seq-id handle parameter
    // declared there. `choice` is not used anywhere in the visible body.
    //
    // Returns a lock set holding at most one TSE lock: the one obtained for
    // the blob id that GetBlobId() reports for idh. The set stays empty when
    // no blob id or no lock is available.
100  TTSE_LockSet locks;
101  TBlobId blob_id = GetBlobId(idh);
102  if ( blob_id ) {
103  TTSE_Lock lock = GetBlobById(blob_id);
104  if ( lock ) {
105  locks.insert(lock);
106  }
107  }
108  return locks;
109 }
110 
111 
113 {
    // NOTE(review): the signature (listing line 112) is missing from this
    // extract; presumably CSplitDataLoader::GetChunk(TChunk chunk) -- confirm.
    //
    // Fills a chunk with whatever this loader stored for the chunk's id
    // (annotations, a descriptor, sequence data), then marks it loaded.
    // A chunk that is already loaded is left untouched.
114  // Check if already loaded
115  if ( chunk->IsLoaded() ) {
116  return;
117  }
118  // Find annotations related to the chunk
119  TAnnotChunks::iterator annot_chunks =
120  m_AnnotChunks.find(chunk->GetChunkId());
121  if (annot_chunks != m_AnnotChunks.end()) {
122  // Attach all related annotations
123  ITERATE(TAnnots, annot, annot_chunks->second) {
124  chunk->x_LoadAnnot(annot->m_Place, *annot->m_Annot);
125  }
126  }
127  // Find descriptors related to the chunk
    // NOTE(review): listing line 128, which declares `descr`, is missing
    // here; presumably a lookup in m_DescrChunks by chunk id -- confirm.
129  if (descr != m_DescrChunks.end()) {
130  // Attach related descriptor
131  chunk->x_LoadDescr(descr->second.m_Place, *descr->second.m_Descr);
132  }
133  // Load sequence data
    // NOTE(review): listing line 134, which declares `seq`, is missing
    // here; presumably a lookup in m_SeqChunks by chunk id -- confirm.
135  if (seq != m_SeqChunks.end()) {
136  // Attach seq-data
    // NOTE(review): listing line 137, which declares `sequence`, is missing
    // here. The stored literal is appended to it and the container is passed
    // on together with the stored place and start position.
138  sequence.push_back(seq->second.m_Literal);
139  chunk->x_LoadSequence(seq->second.m_Place, seq->second.m_Pos,
140  sequence);
141  }
142  // Mark chunk as loaded
143  chunk->SetLoaded();
144 }
145 
146 
149 {
    // NOTE(review): the signature (listing lines 147-148) is missing from
    // this extract; presumably CSplitDataLoader::GetBlobId -- confirm.
    //
    // Always returns a blob id built from the data file name; the visible
    // body does not look at any argument.
150  TBlobId blob_id;
151  blob_id = new CBlobIdString(m_DataFile);
152  return blob_id;
153 }
154 
155 
157 {
    // NOTE(review): the signature (listing line 156) is missing from this
    // extract; presumably CSplitDataLoader::CanGetBlobById -- confirm.
    // Unconditionally reports true.
158  return true;
159 }
160 
161 
164 {
    // NOTE(review): the signature (listing lines 162-163) is missing from
    // this extract; presumably CSplitDataLoader::GetBlobById -- confirm.
    //
    // Returns the TSE lock for blob_id, loading the data file on first use.
    // The only id accepted is the one built from m_DataFile; any other id
    // gets an empty lock.
165  TBlobId my_blob_id = new CBlobIdString(m_DataFile);
166  if ( blob_id != my_blob_id ) {
167  return TTSE_Lock();
168  }
169  // Load data, get the lock
170  CTSE_LoadLock lock = GetDataSource()->GetTSE_LoadLock(blob_id);
    // Parse and split the file only if the load lock is not yet marked loaded.
171  if ( !lock.IsLoaded() ) {
172  x_LoadData(lock);
173  }
174  return lock;
175 }
176 
177 
179 {
    // NOTE(review): the signature (listing line 178) is missing from this
    // extract; presumably x_LoadData(CTSE_LoadLock& load_lock) -- confirm.
    //
    // Reads the seq-entry from m_DataFile, splits it into chunks, installs
    // the stripped entry and the chunks into the TSE info behind load_lock,
    // and marks the lock as loaded.
    // Debug-build check that no seq-entry is currently held.
180  _ASSERT(!m_TSE);
181  m_TSE.Reset(new CSeq_entry);
    // Id numbering restarts for every load.
182  m_NextSeqsetId = 0;
183  m_NextChunkId = 0;
    // The extra scope destroys the input stream as soon as reading is done.
184  {{
185  // Load the seq-entry
    // The file is read as ASN.1 text (eSerial_AsnText).
186  unique_ptr<CObjectIStream> is(CObjectIStream::Open(m_DataFile,
187  eSerial_AsnText));
188  *is >> *m_TSE;
189  }}
190  TChunks chunks;
191  // Split data
    // Dispatch on the top-level variant: a single bioseq or a bioseq-set.
192  if ( m_TSE->IsSeq() ) {
193  x_SplitSeq(chunks, m_TSE->SetSeq());
194  }
195  else {
196  x_SplitSet(chunks, m_TSE->SetSet());
197  }
198  // Fill TSE info
    // The entry is handed over only after the split calls above have run.
199  CTSE_Info& info = *load_lock;
200  info.SetSeq_entry(*m_TSE);
201  // Attach all chunks to the TSE info
202  NON_CONST_ITERATE(TChunks, it, chunks) {
203  info.GetSplitInfo().AddChunk(**it);
204  }
205  // Mark TSE info as loaded
206  load_lock.SetLoaded();
207 }
208 
209 
211 {
    // NOTE(review): the signature (listing line 210) is missing from this
    // extract; presumably x_SplitSeq(TChunks& chunks, CBioseq& bioseq).
    //
    // Moves a bioseq's annotations, descriptors and sequence data out into
    // chunks, leaving the bioseq itself stripped of them.
212  // Split and remove annots
    // NOTE(review): listing line 213, which defines `idh`, is missing here;
    // presumably a seq-id handle for this bioseq -- confirm.
214  if ( bioseq.IsSetAnnot() ) {
    // The place is keyed by the bioseq id; its second component is 0.
215  x_SplitAnnot(chunks, TPlace(idh, 0), bioseq.SetAnnot());
216  bioseq.SetAnnot().clear();
217  }
218  // Split and remove descrs
219  if ( bioseq.IsSetDescr() ) {
220  x_SplitDescr(chunks, TPlace(idh, 0), bioseq.SetDescr());
221  bioseq.ResetDescr();
222  }
223  // Split and remove data
224  x_SplitSeqData(chunks, idh, bioseq);
225 }
226 
227 
229 {
    // NOTE(review): the signature (listing line 228) and the loop header
    // (listing line 230) are missing from this extract. The signature is
    // presumably x_SplitSet(TChunks& chunks, CBioseq_set& seqset); the loop
    // presumably walks the set's child entries with iterator `it` -- confirm.
    //
    // Recurses into each child entry first, then splits this set's own
    // annotations and descriptors into chunks.
231  if ( (*it)->IsSeq() ) {
232  x_SplitSeq(chunks, (*it)->SetSeq());
233  }
234  else {
235  x_SplitSet(chunks, (*it)->SetSet());
236  }
237  }
    // place.first is left default-constructed; the set is identified by the
    // integer id in place.second.
238  TPlace place;
    // Assign the next sequential integer id to this set. The counter is
    // pre-incremented from 0, so the first id handed out is 1.
239  seqset.SetId().SetId(++m_NextSeqsetId);
240  place.second = seqset.GetId().GetId();
241  // Split and remove annots
242  if ( seqset.IsSetAnnot() ) {
243  x_SplitAnnot(chunks, place, seqset.SetAnnot());
244  seqset.SetAnnot().clear();
245  }
246  // Split and remove descrs
247  if ( seqset.IsSetDescr() ) {
248  x_SplitDescr(chunks, place, seqset.SetDescr());
249  seqset.ResetDescr();
250  }
251 }
252 
253 
255  CSeq_id_Handle idh,
256  CBioseq& bioseq)
257 {
    // NOTE(review): the first signature line (listing line 254) is missing
    // from this extract; `chunks` is presumably the TChunks& parameter
    // declared there.
    //
    // Moves the bioseq's seq-data into a literal kept by this loader and
    // prepares a chunk that can supply it later. Does nothing when the
    // bioseq has no seq-data.
258  if ( !bioseq.GetInst().IsSetSeq_data() ) {
259  return;
260  }
261  // Prepare internal data
262  SSeqData data;
263  data.m_Pos = 0;
264  data.m_Place.first = idh;
265  data.m_Literal.Reset(new CSeq_literal);
    // The literal length falls back to 0 when the bioseq has no length set.
266  data.m_Literal->SetLength(bioseq.GetInst().IsSetLength() ?
267  bioseq.GetInst().GetLength() : 0);
    // Hand the seq-data to the literal, then remove it from the bioseq.
268  data.m_Literal->SetSeq_data(bioseq.SetInst().SetSeq_data());
269  bioseq.SetInst().ResetSeq_data();
270 
271  // Create location for the chunk
    // NOTE(review): listing lines 272-274, which declare `loc_set` and `rg`,
    // are missing from this extract.
275  CTSE_Chunk_Info::TLocation loc(idh, rg);
276  loc_set.push_back(loc);
277 
278  // Create new chunk for the data
    // NOTE(review): listing line 279, which creates `chunk`, is missing.
280  // Add seq-data
281  chunk->x_AddSeq_data(loc_set);
282  // Store data locally, remember place and chunk id
    // NOTE(review): listing line 283, the statement that stores `data`, is
    // missing from this extract.
284  chunks.push_back(chunk);
285 }
286 
287 
289  TPlace place,
290  CSeq_descr& descr)
291 {
    // NOTE(review): the first signature line (listing line 288) is missing
    // from this extract; `chunks` is presumably the TChunks& parameter
    // declared there.
    //
    // Registers a chunk describing where `descr` belongs and keeps a
    // reference to the descriptor set together with its place.
292  // Create new chunk for each descr
    // NOTE(review): listing line 293, which creates `chunk`, is missing.
294  // Add descr info using bioseq id or bioseq-set id
295  // Descr type mask includes everything for simplicity
    // A non-empty place.first selects the bioseq id; otherwise the
    // bioseq-set id in place.second is used.
296  if ( place.first ) {
297  chunk->x_AddDescInfo(0xffff, place.first);
298  }
299  else {
300  chunk->x_AddDescInfo(0xffff, place.second);
301  }
302  // Store data locally, remember place and chunk id
    // NOTE(review): listing line 303, which declares `data`, is missing.
304  data.m_Place = place;
    // Only a reference to the caller's descr object is kept, not a copy.
305  data.m_Descr.Reset(&descr);
    // NOTE(review): listing line 306, the statement that stores `data`, is
    // missing from this extract.
307  chunks.push_back(chunk);
308 }
309 
310 
312  TPlace place,
313  CBioseq::TAnnot& annots)
314 {
    // NOTE(review): the first signature line (listing line 311) is missing
    // from this extract; `chunks` is presumably the TChunks& parameter
    // declared there.
    //
    // Puts all annotations of one bioseq or bioseq-set into a single chunk,
    // recording each annotation locally and registering its content with
    // the chunk.
315  // Create new chunk for each set of annots
    // NOTE(review): listing line 316, which creates `chunk`, is missing, so
    // it is not visible how m_NextChunkId relates to the chunk's id.
317  // Register attachment place for annots
318  chunk->x_AddAnnotPlace(place);
    // NOTE(review): listing line 319, which declares `data`, is missing.
320  data.m_Place = place;
321  TAnnots& annot_chunks = m_AnnotChunks[m_NextChunkId];
322  // Process each annot
323  NON_CONST_ITERATE(CBioseq::TAnnot, ait, annots) {
324  CSeq_annot& annot = **ait;
325  data.m_Annot = &annot;
    // The annotation is recorded before the type switch, so unsupported
    // types are stored as well.
326  annot_chunks.push_back(data);
327  switch ( annot.GetData().Which() ) {
    // NOTE(review): the case labels (listing lines 328, 331, 334) are missing
    // from this extract; judging by the helpers called, they presumably
    // select the feature-table, alignment and graph variants.
329  x_SplitFeats(*chunk, annot);
330  break;
332  x_SplitAligns(*chunk, annot);
333  break;
335  x_SplitGraphs(*chunk, annot);
336  break;
337  default:
338  // ignore other annotations
339  continue;
340  }
341  }
342  chunks.push_back(chunk);
343 }
344 
345 
347  const CSeq_annot& annot)
348 {
    // NOTE(review): the first signature line (listing line 346) is missing
    // from this extract; `chunk` is presumably the CTSE_Chunk_Info&
    // parameter declared there.
    //
    // Registers with the chunk, for every feature, each seq-id referenced
    // by the feature's location or product, paired with the feature's data
    // type. Annotations are registered under a default-constructed
    // CAnnotName.
349  _ASSERT(annot.GetData().IsFtable());
    // NOTE(review): listing line 350, the loop header that declares `it`, is
    // missing; presumably it iterates the annot's feature table.
351  const CSeq_feat& feat = **it;
352  // Get type and all referenced seq-ids for each feature
353  SAnnotTypeSelector sel(feat.GetData().Which());
    // A set ensures each seq-id is registered once per feature.
354  set<CSeq_id_Handle> handles;
355  for (CSeq_loc_CI loc_it(feat.GetLocation()); loc_it; ++loc_it) {
356  handles.insert(loc_it.GetSeq_id_Handle());
357  }
358  if ( feat.IsSetProduct() ) {
359  for (CSeq_loc_CI loc_it(feat.GetProduct()); loc_it; ++loc_it) {
360  handles.insert(loc_it.GetSeq_id_Handle());
361  }
362  }
363  // Register each referenced seq-id and feature type
364  ITERATE(set<CSeq_id_Handle>, idh, handles) {
365  chunk.x_AddAnnotType(CAnnotName(), sel, *idh);
366  }
367  }
368 }
369 
370 
372  const CSeq_annot& annot)
373 {
    // NOTE(review): the first signature line (listing line 371) is missing
    // from this extract; `chunk` is presumably the CTSE_Chunk_Info&
    // parameter declared there.
    //
    // Registers with the chunk the seq-ids collected from each alignment.
374  _ASSERT(annot.GetData().IsAlign());
    // NOTE(review): listing lines 375-376 are missing; they presumably
    // declare the type selector `sel` and the loop over alignments that
    // declares `it`.
377  const CSeq_align& align = **it;
378  // Collect all referenced seq-ids
379  set<CSeq_id_Handle> handles;
380  CSeq_align::TDim dim = align.CheckNumRows();
381  for (CSeq_align::TDim row = 0; row < dim; row++) {
    // NOTE(review): listing line 382, the loop body, is missing; presumably
    // it inserts the handle of the seq-id for `row` into `handles`.
383  }
384  // Register each referenced seq-id and annotation type
385  ITERATE(set<CSeq_id_Handle>, idh, handles) {
386  chunk.x_AddAnnotType(CAnnotName(), sel, *idh);
387  }
388  }
389 }
390 
391 
393  const CSeq_annot& annot)
394 {
    // NOTE(review): the first signature line (listing line 392) is missing
    // from this extract; `chunk` is presumably the CTSE_Chunk_Info&
    // parameter declared there.
    //
    // Registers with the chunk every seq-id referenced by each graph's
    // location.
395  _ASSERT(annot.GetData().IsGraph());
    // NOTE(review): listing lines 396-397 are missing; they presumably
    // declare the type selector `sel` and the loop over graphs that
    // declares `it`.
398  const CSeq_graph& graph = **it;
399  // Collect all referenced seq-ids
    // A set ensures each seq-id is registered once per graph.
400  set<CSeq_id_Handle> handles;
401  for (CSeq_loc_CI loc_it(graph.GetLoc()); loc_it; ++loc_it) {
402  handles.insert(loc_it.GetSeq_id_Handle());
403  }
404  // Register each referenced seq-id and annotation type
405  ITERATE(set<CSeq_id_Handle>, idh, handles) {
406  chunk.x_AddAnnotType(CAnnotName(), sel, *idh);
407  }
408  }
409 }
410 
411 
User-defined methods of the data storage class.
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
CTSE_LoadLock GetTSE_LoadLock(const TBlobId &blob_id)
CObjectManager –.
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
void x_SplitSet(TChunks &chunks, CBioseq_set &seqset)
void x_SplitAnnot(TChunks &chunks, TPlace place, CBioseq::TAnnot &annots)
TDescrChunks m_DescrChunks
vector< CRef< CTSE_Chunk_Info > > TChunks
virtual void GetChunk(TChunk chunk)
virtual ~CSplitDataLoader(void)
void x_LoadData(CTSE_LoadLock &load_lock)
TAnnotChunks m_AnnotChunks
void x_SplitSeqData(TChunks &chunks, CSeq_id_Handle idh, CBioseq &bioseq)
void x_SplitFeats(CTSE_Chunk_Info &chunk, const CSeq_annot &annot)
void x_SplitSeq(TChunks &chunks, CBioseq &bioseq)
TSequenceChunks m_SeqChunks
virtual bool CanGetBlobById(void) const
void x_SplitAligns(CTSE_Chunk_Info &chunk, const CSeq_annot &annot)
virtual TBlobId GetBlobId(const CSeq_id_Handle &idh)
CSplitDataLoader(const string &loader_name, const string &data_file)
virtual void DropTSE(CRef< CTSE_Info > tse_info)
vector< SAnnotData > TAnnots
CTSE_Chunk_Info::TPlace TPlace
CRef< CSeq_entry > m_TSE
void x_SplitGraphs(CTSE_Chunk_Info &chunk, const CSeq_annot &annot)
virtual TTSE_Lock GetBlobById(const TBlobId &blob_id)
void x_SplitDescr(TChunks &chunks, TPlace place, CSeq_descr &descr)
virtual TTSE_LockSet GetRecords(const CSeq_id_Handle &idh, EChoice choice)
Request from a datasource using handles and ranges instead of seq-loc The TSEs loaded in this call wi...
static string GetLoaderNameFromArgs(const string &data_file)
void x_AddAnnotPlace(const TBioseqId &id)
void x_LoadAnnot(const TPlace &place, const CSeq_annot &annot)
void x_AddDescInfo(TDescTypeMask type_mask, const TBioseqId &id)
void x_LoadSequence(const TPlace &place, TSeqPos pos, const TSequence &seq)
void SetLoaded(CObject *obj=0)
vector< TLocation > TLocationSet
void x_LoadDescr(const TPlace &place, const CSeq_descr &descr)
TChunkId GetChunkId(void) const
void x_AddAnnotType(const CAnnotName &annot_name, const SAnnotTypeSelector &annot_type, const TLocationId &location_id)
list< CRef< CSeq_literal > > TSequence
void x_AddSeq_data(const TLocationSet &location)
pair< TLocationId, TLocationRange > TLocation
bool IsLoaded(void) const
bool IsLoaded(void) const
void SetLoaded(void)
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
CDataSource * GetDataSource(void) const
Definition: data_loader.cpp:92
EChoice
The main blob is the blob with the sequence; all other blobs are external and contain external annotations.
TRegisterInfo GetRegisterInfo(void)
CBlobIdFor< string > CBlobIdString
Definition: blob_id.hpp:153
static void RegisterInObjectManager(CObjectManager &om, CLoaderMaker_Base &loader_maker, CObjectManager::EIsDefault is_default, CObjectManager::TPriority priority)
Register the loader only if the name is not yet registered in the object manager.
Definition: data_loader.cpp:53
CTSE_Lock TTSE_Lock
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
E_Choice Which(void) const
Which variant is currently selected.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Seq_graph_.hpp:869
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
void SetId(TId &value)
Assign a value to Id data member.
Definition: Bioseq_set_.cpp:93
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
void ResetDescr(void)
Reset Descr data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
list< CRef< CSeq_entry > > TSeq_set
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
const TId & GetId(void) const
Get the Id member data.
bool IsAlign(void) const
Check if variant Align is selected.
Definition: Seq_annot_.hpp:635
void ResetDescr(void)
Reset Descr data member.
Definition: Bioseq_.cpp:60
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TGraph & GetGraph(void) const
Get the variant data.
Definition: Seq_annot_.hpp:661
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
list< CRef< CSeq_graph > > TGraph
Definition: Seq_annot_.hpp:195
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TAlign & GetAlign(void) const
Get the variant data.
Definition: Seq_annot_.hpp:641
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
bool IsSetLength(void) const
Length of sequence in residues. Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
bool IsGraph(void) const
Check if variant Graph is selected.
Definition: Seq_annot_.hpp:655
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
bool IsFtable(void) const
Check if variant Ftable is selected.
Definition: Seq_annot_.hpp:615
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_annot_.hpp:586
static MDB_envinfo info
Definition: mdb_load.c:37
CRef< objects::CObjectManager > om
#define row(bind, expected)
Definition: string_bind.c:73
SRegisterLoaderInfo –.
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:49 2024 by modify_doxy.py rev. 669887