NCBI C++ ToolKit
blob_splitter_impl.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blob_splitter_impl.cpp 86774 2019-06-18 18:07:31Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Eugene Vasilchenko
27 *
28 * File Description:
29 * Application for splitting blobs within ID1 cache
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
36 
37 #include <serial/objostr.hpp>
38 #include <serial/serial.hpp>
39 
46 #include <objmgr/error_codes.hpp>
47 #include <objmgr/scope.hpp>
49 #include <objects/seq/Seqdesc.hpp>
51 
52 
53 #define NCBI_USE_ERRCODE_X ObjMgr_BlobSplit
54 
56 
58 
60 
/// Strip constness from a reference.
///
/// Helper used by the splitter to modify objects reached through
/// const references.  Safe only when the referenced object was not
/// defined const at its point of definition; writing through the
/// result on a genuinely const object is undefined behavior.
template<class C>
inline
C& NonConst(const C& c)
{
    return const_cast<C&>(c);
}
67 
68 
69 /////////////////////////////////////////////////////////////////////////////
70 // CBlobSplitter interface method to avoid recompilation of two files
71 /////////////////////////////////////////////////////////////////////////////
72 
/// Split the given Seq-entry into a skeleton plus chunks.
/// On success m_SplitBlob holds the split result; otherwise it is reset to
/// the whole unsplit entry.  Returns whether the blob was actually split.
bool CBlobSplitter::Split(const CSeq_entry& entry)
{
    // NOTE(review): the declaration of 'impl' (a CBlobSplitterImpl,
    // presumably constructed from m_Params) is elided in this extract --
    // confirm against the full source.
    if ( impl.Split(entry) ) {
        // Implementation produced a split blob: adopt it.
        m_SplitBlob = impl.GetBlob();
    }
    else {
        // Splitting failed or was not worthwhile: keep the entry whole.
        m_SplitBlob.Reset(entry);
    }
    return m_SplitBlob.IsSplit();
}
84 
85 
86 /////////////////////////////////////////////////////////////////////////////
87 // CBlobSplitterImpl
88 /////////////////////////////////////////////////////////////////////////////
89 
90 
// CBlobSplitterImpl::Split(const CSeq_entry& entry) -- NOTE(review): the
// signature line is elided in this extract; inferred from the delegating
// call in CBlobSplitter::Split.
// Drives the whole split pipeline: copy skeleton, collect pieces,
// distribute them into chunks, and build the ID2S split description.
{
    // Start from a clean state (drops results of any previous Split()).
    Reset();

    // NOTE(review): creation/initialization of m_Skeleton (source lines
    // elided in this extract) happens here in the full file.

    // copying skeleton while stripping annotations
    CopySkeleton(*m_Skeleton, entry);

    // collect annot pieces separating annotations with different priorities
    CollectPieces();

    if ( m_Pieces.size() <= eAnnotPriority_skeleton+1 ) {
        // only skeleton -> no-split
        return false;
    }

    if ( m_Pieces.size() <= eAnnotPriority_zoomed+1 ) {
        // check if all non-zoomed annotations fit in one chunk
        size_t total_size = 0;
        ITERATE ( TPieces, pi, m_Pieces ) {
            if ( !(*pi) ) {
                continue;
            }
            ITERATE ( CAnnotPieces, i, **pi ) {
                const SIdAnnotPieces& id_pieces = i->second;
                // Fit test uses uncompressed ASN.1 size.
                total_size += id_pieces.m_Size.GetAsnSize();
            }
        }
        if (total_size <= m_Params.m_MaxChunkSize) {
            // Everything fits into a single chunk -> not worth splitting.
            return false;
        }
    }

    // split pieces in chunks
    SplitPieces();

    if ( m_Chunks.size() < m_Params.m_MinChunkCount ) { // too few chunks
        return false;
    }

    // Build the ID2S objects that describe the split result.
    MakeID2SObjects();

    return m_SplitBlob.IsSplit();
}
137 
138 
// CBlobSplitterImpl::CollectPieces(void) -- NOTE(review): the signature
// line is elided in this extract.
// Rebuilds m_Pieces from all places in m_Entries and, in verbose mode,
// prints per-id piece statistics.
{
    // Collect annotation pieces and strip skeleton annotations
    // to main chunk.
    m_Pieces.clear();

    // Gather pieces from every place (entry) in the blob.
    ITERATE ( TEntries, it, m_Entries ) {
        CollectPieces(it->second);
    }

    if ( m_Params.m_Verbose ) {
        // display pieces statistics
        CSize single_ref;
        ITERATE ( TPieces, pit, m_Pieces ) {
            if ( !*pit ) {
                continue;
            }
            ITERATE ( CAnnotPieces, it, **pit ) {
                if ( it->second.size() <= 1 ) {
                    // Ids referenced by a single object are summarized
                    // in one line below instead of being listed.
                    single_ref += it->second.m_Size;
                }
                else {
                    NcbiCout << "@" << it->first.AsString() << ": " <<
                        it->second.m_Size << '\n';
                }
            }
        }
        if ( single_ref ) {
            NcbiCout << "with 1 obj: " << single_ref << '\n';
        }
        NcbiCout << NcbiEndl;
    }
}
172 
173 
// CBlobSplitterImpl::CollectPieces(const CPlace_SplitInfo& info) --
// NOTE(review): the signature line is elided in this extract.
// Collects every splittable part attached to one place: descriptors,
// annots, sequence data, history, and nested bioseqs.
{
    const CPlaceId& place_id = info.m_PlaceId;
    // Descriptors attached to this place.
    if ( info.m_Descr ) {
        CollectPieces(place_id, *info.m_Descr);
    }
    // Each Seq-annot attached to this place.
    ITERATE ( CPlace_SplitInfo::TSeq_annots, it, info.m_Annots ) {
        CollectPieces(place_id, it->second);
    }
    if ( info.m_Inst ) {
        const CSeq_inst_SplitInfo& inst_info = *info.m_Inst;
        // NOTE(review): the loop header iterating over inst_info's
        // Seq-data pieces is elided in this extract; only the loop body
        // is visible below.
            Add(SAnnotPiece(place_id, *it));
        }
    }
    // Sequence history (assembly alignments).
    if ( info.m_Hist ) {
        CollectPieces(place_id, *info.m_Hist);
    }
    // Whole nested Bioseqs registered for this place.
    ITERATE ( CPlace_SplitInfo::TBioseqs, it, info.m_Bioseqs ) {
        Add(SAnnotPiece(place_id, *it));
    }
}
196 
197 
// CBlobSplitterImpl::CollectPieces(const CPlaceId& place_id, ...) --
// NOTE(review): the first line of this signature is elided in this extract.
// Registers a Seq-annot either as one whole piece (when small, to avoid
// the per-annot header overhead) or as one piece per contained object.
                                   const CSeq_annot_SplitInfo& info)
{
    // Named annots get a higher whole-annot threshold than unnamed ones.
    size_t max_size = info.m_Name.IsNamed()? 5000: 500;
    size_t size = info.m_Size.GetAsnSize();
    bool add_as_whole = size <= max_size;
    if ( add_as_whole ) {
        // add whole Seq-annot as one piece because header overhead is too big
        Add(SAnnotPiece(place_id, info));
    }
    else {
        // add each annotation as separate piece
        ITERATE ( CSeq_annot_SplitInfo::TObjects, i, info.m_Objects ) {
            if ( !*i ) {
                continue;
            }
            ITERATE ( CLocObjects_SplitInfo, j, **i ) {
                Add(SAnnotPiece(place_id, info, *j));
            }
        }
    }
}
220 
221 
// TAnnotPriority GetSeqdescPriority(const CSeqdesc& desc) -- NOTE(review):
// the signature line is elided in this extract (it appears in the file's
// declaration list as returning TAnnotPriority).
// Maps a descriptor type to the split priority of the piece carrying it.
{
    switch ( desc.Which() ) {
    case CSeqdesc::e_Source:
    case CSeqdesc::e_Molinfo:
    case CSeqdesc::e_Title:
    case CSeqdesc::e_User: // for the feature fetch policy
        // NOTE(review): the return statement for the four cases above
        // (presumably 'return eAnnotPriority_skeleton;') is elided in
        // this extract -- confirm against the full source.
    case CSeqdesc::e_Pub:
    case CSeqdesc::e_Comment:
        return eAnnotPriority_low;
    default:
        break;
    }
    // All other descriptor types use the regular priority.
    return eAnnotPriority_regular;
}
238 
239 
// CBlobSplitterImpl::CollectPieces(const CPlaceId& place_id, ...) --
// NOTE(review): the first line of this signature is elided in this extract.
// Registers a Seq-descr either whole, or split into one sub-piece per
// descriptor when it is too big or mixes different priorities.
                                   const CSeq_descr_SplitInfo& info)
{
    size_t max_size = m_Params.m_MaxChunkSize;
    // Whole-descr test uses the compressed (zip) size.
    size_t size = info.m_Size.GetZipSize();
    bool add_as_whole = size <= max_size;
    bool have_skeleton_priority = false;
    bool have_other_priority = false;
    ITERATE ( CSeq_descr::Tdata, i, info.m_Descr->Get() ) {
        // NOTE(review): the condition line (presumably testing
        // GetSeqdescPriority(**i) against eAnnotPriority_skeleton) is
        // elided in this extract; only the two branches are visible.
            have_skeleton_priority = true;
        }
        else {
            have_other_priority = true;
        }
    }
    // Mixed priorities force per-descriptor splitting so skeleton-priority
    // descriptors can stay with the skeleton.
    if ( have_skeleton_priority && have_other_priority ) {
        add_as_whole = false;
    }
    if ( add_as_whole ) {
        // add whole Seq-descr as one piece because header overhead is too big
        Add(SAnnotPiece(place_id, info));
    }
    else {
        // split descriptors
        _ASSERT(info.m_Location.size() == 1);
        TSeqPos seq_length = info.m_Location.begin()->second.
            GetTotalRange().GetLength();
        ITERATE ( CSeq_descr::Tdata, i, info.m_Descr->Get() ) {
            // Wrap each descriptor in its own single-element Seq-descr.
            CRef<CSeqdesc> desc(&NonConst(**i));
            CRef<CSeq_descr> descr_piece(new CSeq_descr);
            descr_piece->Set().push_back(desc);
            CRef<CSeq_descr_SplitInfo> piece_info(new CSeq_descr_SplitInfo(place_id, seq_length,
                                                                           *descr_piece, m_Params));
            // NOTE(review): pushing into info.m_SubPieces through a const
            // reference implies m_SubPieces is mutable -- confirm.
            info.m_SubPieces.push_back(piece_info);
            piece_info->m_Priority = GetSeqdescPriority(*desc);
            Add(SAnnotPiece(place_id, *piece_info));
        }
    }
}
280 
281 
// CBlobSplitterImpl::CollectPieces(const CPlaceId& place_id, ...) --
// NOTE(review): the first line of this signature is elided in this extract.
// Registers the sequence-history assembly of a place as a single piece.
                                   const CSeq_hist_SplitInfo& info)
{
    // add whole history assembly as one piece
    Add(SAnnotPiece(place_id, info));
}
288 
289 
// CBlobSplitterImpl::Add(const SAnnotPiece& piece) -- NOTE(review): the
// signature line is elided in this extract.
// Files a piece into m_Pieces under its priority; the vector index equals
// the priority value, and slots are created lazily.
{
    TAnnotPriority priority = piece.m_Priority;
    // Grow the per-priority vector on demand.
    m_Pieces.resize(max(m_Pieces.size(), priority + size_t(1)));
    if ( !m_Pieces[priority] ) {
        m_Pieces[priority] = new CAnnotPieces;
    }
    m_Pieces[priority]->Add(piece);
}
299 
300 
// SChunkInfo* CBlobSplitterImpl::NextChunk(void) -- NOTE(review): the
// signature line is elided in this extract.
// Allocates a fresh chunk and returns a pointer to it.  Chunk id 0 is
// reserved for the main (skeleton) chunk, so data chunk ids start at 1;
// NOTE(review): the id computation assumes chunk ids are dense -- confirm.
{
    int chunk_id = int(m_Chunks.size());
    if ( m_Chunks.find(0) == m_Chunks.end() )
        ++chunk_id;
    // map::operator[] creates the new, empty SChunkInfo.
    return &m_Chunks[chunk_id];
}
308 
309 
// SChunkInfo* CBlobSplitterImpl::NextChunk(SChunkInfo* chunk, const CSize&)
// -- NOTE(review): the signature line is elided in this extract.
// Returns the current chunk if the given piece still fits in it,
// otherwise starts a new chunk.
{
    if ( chunk ) {
        CSize::TDataSize cur_size = chunk->m_Size.GetZipSize();
        CSize::TDataSize new_size = cur_size + size.GetZipSize();
        // Keep filling the current chunk while it is at or below the target
        // size and adding this piece would not exceed the hard maximum.
        if ( /* cur_size < m_Params.m_MinChunkSize || */
             cur_size <= m_Params.m_ChunkSize &&
             new_size <= m_Params.m_MaxChunkSize ) {
            return chunk;
        }
    }
    return NextChunk();
}
323 
324 
// void CBlobSplitterImpl::SplitPieces(void) -- NOTE(review): the signature
// line is elided in this extract.
// Distributes all collected pieces into chunks priority by priority,
// then optionally merges undersized chunks together.
{
    // Pass 1: handle each priority level; the index in m_Pieces equals the
    // priority value (see Add()).
    NON_CONST_ITERATE ( TPieces, prit, m_Pieces ) {
        if ( !*prit ) {
            continue;
        }
        TAnnotPriority priority = EAnnotPriority(prit-m_Pieces.begin());
        if ( priority == eAnnotPriority_skeleton ) {
            // Skeleton-priority pieces always go into main chunk 0.
            AddToSkeleton(**prit);
        }
        else {
            SplitPieces(**prit);
        }
        // All pieces of this priority must have been consumed.
        _ASSERT((*prit)->empty());
        prit->Reset();
    }

    m_Pieces.clear();

    if ( m_Params.m_Verbose ) {
        // display collected chunks stats
        ITERATE ( TChunks, it, m_Chunks ) {
            NcbiCout << "Chunk: " << it->first << ": " << it->second.m_Size <<
                NcbiEndl;
        }
    }

    if ( m_Params.m_JoinSmallChunks ) {
        if ( m_Params.m_Verbose ) {
            LOG_POST_X(6, "Joining small chunks");
        }

        // Index all undersized chunks (never chunk 0) by compressed size,
        // smallest first, as merge candidates.
        typedef multimap<size_t, int> TSizes;
        TSizes sizes;
        ITERATE ( TChunks, it, m_Chunks ) {
            size_t zip_size = it->second.m_Size.GetZipSize();
            if ( it->first != 0 && zip_size < m_Params.m_MinChunkSize) {
                sizes.insert(TSizes::value_type(zip_size, it->first));
            }
        }

        // Create main chunk if not created yet
        m_Chunks[0];
        // merge too small chunks to higher priority chunks
        NON_CONST_ITERATE( TChunks, chunk_it, m_Chunks ) {
            if ( sizes.empty() ) {
                break;
            }
            SChunkInfo& dst_chunk = chunk_it->second;
            // Grow this destination until it reaches the minimum size or no
            // candidate fits anymore.
            while (dst_chunk.m_Size.GetZipSize() < m_Params.m_MinChunkSize) {
                TSizes::iterator small = sizes.begin();
                while ( small->second <= chunk_it->first ) {
                    // Do not try to merge already processed chunks or
                    // a chunk to itself
                    sizes.erase(small);
                    if (sizes.empty()) {
                        break;
                    }
                    small = sizes.begin();
                }
                if (sizes.empty()) {
                    break;
                }
                size_t new_size =
                    dst_chunk.m_Size.GetZipSize() + small->first;
                if ( new_size > m_Params.m_MaxChunkSize ) {
                    // No more merging to the current chunk
                    break;
                }
                if ( m_Params.m_Verbose ) {
                    LOG_POST_X(7, " merging chunk " << small->second
                               << " into " << chunk_it->first
                               << " new size: " << new_size);
                }
                // Merging erases a chunk whose id is greater than
                // chunk_it->first; std::map::erase invalidates only
                // iterators to the erased element, so the outer
                // iteration stays valid.
                dst_chunk.Add(m_Chunks[small->second]);
                m_Chunks.erase(small->second);
                sizes.erase(small);
                if ( sizes.empty() ) {
                    break;
                }
            }
        }
        if ( m_Params.m_Verbose && !sizes.empty() ) {
            // Report small chunks that could not be merged anywhere.
            ITERATE( TSizes, i, sizes ) {
                LOG_POST_X(8, "Small chunk not merged: "
                           << i->second << ", size: " << i->first);
            }
        }
    }
}
415 
416 
// void CBlobSplitterImpl::AddToSkeleton(CAnnotPieces& pieces) --
// NOTE(review): the signature line is elided in this extract.
// Moves every piece of every id into the main (skeleton) chunk 0,
// emptying 'pieces' in the process.
{
    SChunkInfo& main_chunk = m_Chunks[0];

    // combine ids with small amount of pieces
    while ( !pieces.empty() ) {
        CAnnotPieces::iterator max_iter = pieces.begin();
        SIdAnnotPieces& objs = max_iter->second;
        if ( !objs.empty() ) {
            // Drain this id's pieces one by one; Remove() also drops the
            // piece from the per-id index, so begin() advances each pass.
            while ( !objs.empty() ) {
                SAnnotPiece piece = *objs.begin();
                main_chunk.Add(piece);
                pieces.Remove(piece);
                _ASSERT(objs.empty() || *objs.begin() != piece);
            }
        }
        _ASSERT(max_iter->second.empty());
        pieces.erase(max_iter);
    }
    _ASSERT(pieces.empty());
}
438 
439 
// void CBlobSplitterImpl::SplitPieces(CAnnotPieces& pieces) --
// NOTE(review): the signature line is elided in this extract.
// Distributes one priority level's pieces into data chunks: ids carrying
// a large volume of pieces are split across several chunks (extracting
// overlong pieces first when they are not sequential), then the remaining
// ids are packed whole into chunks.
{
    SChunkInfo* chunk = 0;
    SChunkInfo* long_chunk = 0;

    // split ids with large amount of pieces
    while ( !pieces.empty() ) {
        // find id with most size of pieces on it
        CSize max_size;
        CAnnotPieces::iterator max_iter;
        NON_CONST_ITERATE ( CAnnotPieces, it, pieces ) {
            if ( it->second.m_Size > max_size ) {
                max_iter = it;
                max_size = it->second.m_Size;
            }
        }
        // NOTE(review): max_iter is assigned only when some m_Size compares
        // greater than a default-constructed CSize; presumably always true
        // for non-empty 'pieces' -- confirm CSize ordering semantics.
        // Stop when the biggest id is small enough for the combining loop
        // below, or consists of a single piece and cannot be split anyway.
        if ( max_size.GetZipSize() < m_Params.m_MaxChunkSize ||
             max_size.GetCount() <= 1 ) {
            break;
        }

        // split this id
        if ( m_Params.m_Verbose ) {
            LOG_POST_X(9, "Splitting @"<<max_iter->first.AsString()<<
                       ": "<<max_size);
        }

        SIdAnnotPieces& objs = max_iter->second;
        // Determine whether pieces are already position-ordered without
        // partial overlaps; if not, overlong pieces are extracted first.
        bool sequential = true;
        TRange prevRange = TRange::GetEmpty();
        ITERATE ( SIdAnnotPieces, it, objs ) {
            const SAnnotPiece& piece = *it;
            TRange range = piece.m_IdRange;
            if ( range.Empty() ) {
                continue;
            }
            if ( !prevRange.Empty() ) {
                if ( range.GetFrom() < prevRange.GetFrom() ||
                     (range.IntersectingWith(prevRange) &&
                      range != prevRange) ) {
                    sequential = false;
                    break;
                }
            }
            prevRange = range;
        }
        if ( !sequential ) {
            // extract long annotations first

            // calculate maximum piece length
            // how many chunks to make from these annotations
            size_t chunk_count =
                size_t(double(objs.m_Size.GetZipSize()) /
                       double(m_Params.m_ChunkSize)
                       +.5);
            // length of sequence covered by annotations
            size_t whole_length = objs.m_IdRange.GetLength();
            // estimated length of sequence covered by one chunk
            // NOTE(review): chunk_count could be 0 only if the zip size is
            // well below m_ChunkSize, which this branch's guard (zip size
            // >= m_MaxChunkSize) should exclude -- confirm that
            // m_ChunkSize <= m_MaxChunkSize to rule out division by zero.
            size_t chunk_length = whole_length / chunk_count;
            // maximum length of one piece over the sequence
            size_t max_piece_length = chunk_length / 2;

            // extract long pieces into main or next chunk
            vector<SAnnotPiece> pcs;
            CSize size;
            ITERATE ( SIdAnnotPieces, it, objs ) {
                const SAnnotPiece& piece = *it;
                if ( piece.m_IdRange.GetLength() > max_piece_length ) {
                    pcs.push_back(piece);
                    size += piece.m_Size;
                    if ( m_Params.m_Verbose ) {
                        LOG_POST_X(10, " long piece: "<<piece.m_IdRange.GetLength());
                    }
                }
            }
            if ( !pcs.empty() ) {
                if ( m_Params.m_Verbose ) {
                    LOG_POST_X(11, " "<<pcs.size()<<" long pieces: "<<size);
                    LOG_POST_X(12, " "
                               " CC:"<<chunk_count<<
                               " WL:"<<whole_length<<
                               " CL:"<<chunk_length<<
                               " ML:"<<max_piece_length);
                }
                // Long pieces are packed into their own sequence of chunks.
                ITERATE ( vector<SAnnotPiece>, it, pcs ) {
                    const SAnnotPiece& piece = *it;
                    long_chunk = NextChunk(long_chunk, piece.m_Size);
                    long_chunk->Add(piece);
                    pieces.Remove(piece);
                }
            }
        }

        // extract all other pieces
        // Copy first: Remove() below mutates 'objs' while we consume it.
        vector<SAnnotPiece> pcs;
        ITERATE ( SIdAnnotPieces, it, objs ) {
            pcs.push_back(*it);
        }
        ITERATE ( vector<SAnnotPiece>, it, pcs ) {
            const SAnnotPiece piece = *it;
            chunk = NextChunk(chunk, piece.m_Size);
            chunk->Add(piece);
            pieces.Remove(piece);
        }

        _ASSERT(max_iter->second.empty());
        pieces.erase(max_iter);
    }

    // combine ids with small amount of pieces
    while ( !pieces.empty() ) {
        CAnnotPieces::iterator max_iter = pieces.begin();
        SIdAnnotPieces& objs = max_iter->second;
        if ( !objs.empty() ) {
            // Keep all remaining pieces of one id together in one chunk.
            chunk = NextChunk(chunk, objs.m_Size);
            while ( !objs.empty() ) {
                SAnnotPiece piece = *objs.begin();
                chunk->Add(piece);
                pieces.Remove(piece);
                _ASSERT(objs.empty() || *objs.begin() != piece);
            }
        }
        _ASSERT(max_iter->second.empty());
        pieces.erase(max_iter);
    }
    _ASSERT(pieces.empty());
}
567 
568 
TAnnotPriority GetSeqdescPriority(const CSeqdesc &desc)
C & NonConst(const C &c)
NCBI_DEFINE_ERR_SUBCODE_X(12)
void erase(iterator it)
void Remove(const SAnnotPiece &piece)
bool empty(void) const
const_iterator begin(void) const
TPiecesById::iterator iterator
SSplitterParams m_Params
SChunkInfo * NextChunk(void)
void AddToSkeleton(CAnnotPieces &pieces)
bool Split(const CSeq_entry &entry)
CRef< CSeq_entry > m_Skeleton
vector< CRef< CAnnotPieces > > TPieces
void CopySkeleton(CSeq_entry &dst, const CSeq_entry &src)
void Add(const SAnnotPiece &piece)
CSplitBlob m_SplitBlob
bool Split(const CSeq_entry &entry)
SSplitterParams m_Params
vector< CBioseq_SplitInfo > TBioseqs
CScope –.
Definition: scope.hpp:92
vector< CRef< CLocObjects_SplitInfo > > TObjects
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
vector< CSeq_data_SplitInfo > TSeq_data
Definition: size.hpp:46
size_t GetCount(void) const
Definition: size.hpp:86
TDataSize GetZipSize(void) const
Definition: size.hpp:94
size_t TDataSize
Definition: size.hpp:48
TDataSize GetAsnSize(void) const
Definition: size.hpp:90
bool IsSplit(void) const
Definition: split_blob.hpp:67
void Reset(void)
Definition: split_blob.cpp:76
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
#define C(s)
Definition: common.h:231
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define LOG_POST_X(err_subcode, message)
Definition: ncbidiag.hpp:553
const float pi
Definition: math.hpp:54
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders Add object to the score with p...
Definition: scope.cpp:522
position_type GetLength(void) const
Definition: range.hpp:158
static TThisType GetEmpty(void)
Definition: range.hpp:306
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NcbiEndl
Definition: ncbistre.hpp:548
#define NcbiCout
Definition: ncbistre.hpp:543
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Definition of all error codes used in objmgr libraries (xobjmgr.lib, xobjutil.lib and others).
int i
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
T max(T x_, T y_)
The Object manager core.
EAnnotPriority
@ eAnnotPriority_skeleton
@ eAnnotPriority_low
@ eAnnotPriority_regular
@ eAnnotPriority_zoomed
unsigned TAnnotPriority
TRange m_IdRange
TAnnotPriority m_Priority
void Add(const SChunkInfo &info)
Definition: chunk_info.cpp:45
CSize m_Size
Definition: chunk_info.hpp:94
const_iterator begin(void) const
bool empty(void) const
#define _ASSERT
Modified on Sat Dec 02 09:23:11 2023 by modify_doxy.py rev. 669887