NCBI C++ ToolKit
sparse_graph.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrei Shkeda
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 #include <gui/utils/url_utils.hpp>
36 #include <util/bitset/bmserial.h>
38 #include <util/checksum.hpp>
39 #include <corelib/rwstream.hpp>
41 #include <future>
42 #include <memory>
43 
46 
47 
48 ///////////////////////////////////////////////////////////////////////////////
49 /// CSparseGraph
50 
51 
52 CCompoundID CSparseGraph::CreateCompoundID(CScope& scope, const CSeq_id& seq_id, const string& remote_path)
53 {
54  CCompoundIDPool id_pool;
55  auto id = id_pool.NewID(eCIC_GenericID);
56  auto sih_in = CSeq_id_Handle::GetHandle(seq_id);
57  auto sih = scope.GetAccVer(sih_in);
58  string seq_id_str = (sih) ? sih.GetSeqId()->GetSeqIdString(true) : seq_id.GetSeqIdString(true);
59 
60  id.AppendSeqID(seq_id_str);
61  size_t seq_len = scope.GetSequenceLength(sih_in);
62  _ASSERT(seq_len);
63  id.AppendInteger(seq_len);
64  id.AppendHost(remote_path);
65  return id;
66 }
67 
69 {
70  m_SeqId = id.GetFirst(eCIT_SeqID).GetSeqID();
71  if (m_SeqId.empty()) NCBI_THROW(CException, eUnknown, "Empty seq_id");
72 
73  m_RemotePath = id.GetFirst(eCIT_Host).GetHost();
74  if (m_RemotePath.empty()) NCBI_THROW(CException, eUnknown, "Empty remote path");
75 
76  m_DataSize = id.GetFirst(eCIT_Integer).GetInteger();
77  if (m_DataSize == 0) NCBI_THROW(CException, eUnknown, "Empty seq_len");
78 
79  // m_LastModified = id.GetFirst(eCIT_String).GetString();
80  // if (m_LastModified.empty()) NCBI_THROW(CException, eUnknown, "Empty last modified");
81 
82 }
83 
84 static
85 void s_GenerateSessionTag(string& session_tag)
86 {
87  CCompoundIDPool id_pool;
88  auto id = id_pool.NewID(eCIC_GenericID);
89  // GUID to establish blob ownership
90  // 'our' blobs don't need to be synchronized
91  // thus a trip to NetCache can be avoided
92  // GUID: pid + current_time + rand()
93 
95  id.AppendCurrentTime();
96  id.AppendRandom();
97  session_tag = id.ToString();
98 }
99 
101 {
104 }
105 
106 
108 {
109  m_SessionTag = data.m_SessionTag;
110  m_NetCacheKey = data.m_NetCacheKey;
111  _ASSERT(!m_NetCacheKey.empty());
112  if (m_NetCacheKey.empty())
113  throw runtime_error("Empty data key!");
114  m_SeqId = data.m_SeqId;
115  m_DataSize = data.m_DataSize;
116  m_DataMap = data.m_DataMap;
117  // copy data
118  for (auto& old_data : data.m_Data) {
119  m_Data.emplace(old_data.first, unique_ptr<TSparseVector>(new TSparseVector(*old_data.second)));
120  }
121  m_RemotePath = data.m_RemotePath;
122  m_LastModified = data.m_LastModified;
123  m_LastChecked = data.m_LastChecked;
125  m_IsCacheGood = true;
126 }
127 
128 void CSparseGraph::GetMissingRegions(const TSeqRange& range, vector<TSeqRange>& missing_ranges)
129 {
130  missing_ranges.clear();
131  unique_lock<mutex> guard(m_DataMutex);
132  size_t num_bits = m_DataMap.count_range(range.GetFrom(), range.GetTo());
133  if (num_bits >= range.GetLength()) {
134  //ERR_POST(Info << "All cached!");
135  // All cached
136  return;
137  }
138  // the region is not cached yet, need to calc a new range
139  if (num_bits == 0) {
140  // Nothing cached
141  //ERR_POST(Info << "Nothing cached!");
142  missing_ranges.emplace_back(range.GetFrom(), range.GetTo());
143  return;
144  }
145  //ERR_POST(Info << "Partially cached!");
147  mask.set_range(range.GetFrom(), range.GetTo());
148  mask &= m_DataMap;
149  guard.unlock();
150 
151  mask.invert();
152  if (range.GetFrom() > 0)
153  mask.set_range(0, range.GetFrom() - 1, false);
154  if (range.GetToOpen() < m_DataSize - 1)
155  mask.set_range(range.GetToOpen(), static_cast<bm::bvector_size_type>(m_DataSize - 1), false);
156 
157  auto first = mask.get_first();
158  auto stop = range.GetToOpen();
159  vector<TSeqRange> mr;
160  while (first < stop) {
161  mask.invert();
162  // searching the end of the missing range
163  auto last = mask.get_next(first);
164  if (last == 0 || last >= stop - 1) {
165  mr.emplace_back(first, stop - 1);
166  break;
167  }
168  mr.emplace_back(first, last - 1);
169  mask.invert();
170  // searching the start of the next missing range
171  first = mask.get_next(last);
172  if (first == 0)
173  break;
174  }
175  if (!mr.empty()) {
176  missing_ranges.push_back(mr.front());
177  // merge neighboring ranges
178  for (size_t i = 1; i < mr.size(); ++i) {
179  if (mr[i].GetFrom() - missing_ranges.back().GetFrom() < 20000) {
180  missing_ranges.back().SetTo(mr[i].GetTo());
181  } else {
182  missing_ranges.emplace_back(mr[i].GetFrom(), mr[i].GetTo());
183  }
184  }
185  }
186 }
187 static
188 void s_GetMD5Tag(bm::bvector<>& bv, string& tag, bm::word_t* tb = 0)
189 {
190  tag.clear();
191  unique_ptr<bm::bvector<>::statistics> st_op(new bm::bvector<>::statistics);
192  bv.optimize(tb, bm::bvector<>::opt_compress, &*st_op);
193  struct bm::bvector<>::statistics st;
194  bv.calc_stat(&st);
195  unique_ptr<unsigned char> smem(new unsigned char[st.max_serialize_mem]);
196  auto slen = bm::serialize(bv, smem.get(), tb);
197 
198  CChecksum cs(CChecksum::eMD5);
199  cs.AddChars((const char*)smem.get(), slen);
200  cs.GetMD5Digest(tag);
201 }
202 
203 
204 void CSparseGraph::x_Sync(TGraphCachePool& icache_pool)
205 {
206 
207  CObjPoolGuard<TGraphCachePool> icache1(icache_pool);
208 
209  // is there any data stored?
210  bool has_blob = false;
211  try {
212  has_blob = icache1->GetSize(m_NetCacheKey, 0, NcbiEmptyString) >0;
213  } catch(const exception& ) {
214  }
215  if (!has_blob) {
216  x_CheckRemoteData();
217  return;
218  }
219 
220  // check stored data for consistency
221  // read and merge
223  string my_MD5;
224 
225  bool synced = false;
226  unsigned short attempt = 1;
227  while (attempt <= 3) {
228  try {
229  string tag;
230  string remote_path;
231  string last_modified;
232  time_t last_checked = 0;
233  bm::bvector<> data_map;
234  string md5;
235  x_ReadMap(*icache1, tag, remote_path, last_modified, last_checked, md5, data_map, tb);
236  bool must_sync = tag != m_SessionTag;
237 
238  if (must_sync) {
239  // if session tags are different
240  // check if current md5 is different from cached md5
241  // if they are the same no need for syncing
242  if (my_MD5.empty()) { // it can be not empty if the first attempt to sync failed
243  lock_guard<mutex> guard(m_DataMutex);
244  s_GetMD5Tag(m_DataMap, my_MD5, tb);
245  }
246  must_sync = (my_MD5 != md5);
247  }
248 
249  if (must_sync == false) {
250  // no syncing required
251  // our memory blob is current
252  lock_guard<mutex> guard(m_DataMutex);
253  m_LastSynced.SetCurrent();
254  if (m_LastModified.empty()) {
255  m_LastModified = last_modified;
256  m_LastChecked = max(last_checked, m_LastChecked);
257  }
258  break;
259  }
261  bool all_is_good = false;
262  if (m_Data.size() > 1) {
263  vector<future<bool>> res;
264  for (auto& data : m_Data) {
265  temp_vectors.emplace(data.first, unique_ptr<TSparseVector>(new TSparseVector));
266  temp_vectors[data.first]->resize(data.second->size());
267  //CRef<CRequestContext> context{ CDiagContext::GetRequestContext().Clone() };
268 
269  res.emplace_back(async(launch::async, [&]()->bool {
270  // if (context)
271  // CDiagContext::SetRequestContext(context);
272  string data_key = m_NetCacheKey + "_" + data.first;
273  CObjPoolGuard<TGraphCachePool> icache(icache_pool);
274  return x_ReadData(*icache, data_key, tag, *temp_vectors[data.first]); }));
275  }
276 
277  all_is_good = all_of(res.begin(), res.end(), [](future<bool>& f) { return f.get(); });
278  } else {
279  const auto& data = *m_Data.begin();
280  temp_vectors.emplace(data.first, unique_ptr<TSparseVector>(new TSparseVector));
281  temp_vectors[data.first]->resize(data.second->size());
282  string data_key = m_NetCacheKey + "_" + data.first;
283  CObjPoolGuard<TGraphCachePool> icache(icache_pool);
284  all_is_good = x_ReadData(*icache, data_key, tag, *temp_vectors[data.first]);;
285  }
286  if (all_is_good) {
287  lock_guard<mutex> guard(m_DataMutex);
288  if (!last_modified.empty()) {
289  m_LastModified = last_modified;
290  m_LastChecked = max(last_checked, m_LastChecked);
291  }
292  m_DataMap |= data_map;
293  m_LastSynced.SetCurrent();
294  for (auto&& data : m_Data) {
295  data.second->join(*temp_vectors[data.first]);
296  }
297  synced = true;
298  break;
299  }
300  } catch (exception& e) {
301  ERR_POST(Error << e.what());
302  }
303  chrono::milliseconds timespan(attempt * 50 + rand() % 100);
304  this_thread::sleep_for(timespan);
305  ++attempt;
306  }
307  if (attempt > 1) {
308  auto diag = GetDiagContext().Extra();
309  if (synced) {
310  diag.Print("Sync_passed_attempts", attempt - 1);
311  } else {
312  diag.Print("Sync_failed_attempts", attempt - 1);
313  }
314  }
315 
316  x_CheckRemoteData();
317  if (!m_IsCacheGood) {
318  CObjPoolGuard<TGraphCachePool> icache(icache_pool);
319  icache->Remove(m_NetCacheKey, 0, NcbiEmptyString);
320  for (auto&& data : m_Data) {
321  string data_key = m_NetCacheKey + "_" + data.first;
322  icache->Remove(data_key, 0, NcbiEmptyString);
323  }
324  }
325 }
326 
328 {
329 
330  double diff_in_millisec = CTime(CTime::eCurrent).DiffNanoSecond(m_LastSynced) * 1e-6;
331  if (diff_in_millisec > 250)
332  x_Sync(icache_pool);
333 }
334 
335 void CSparseGraph::Update(const TSeqRange& range, const TUpdateMap& update)
336 {
337  lock_guard<mutex> guard(m_DataMutex);
338  _ASSERT(update.size() == m_Data.size());
339  if (update.size() != m_Data.size())
340  NCBI_THROW(CException, eInvalid, "AlignmentGraph update failed. Incorrect data.");
341  for (auto& data : update) {
342  if (m_Data.count(data.first) == 0)
343  NCBI_THROW(CException, eInvalid, "AlignmentGraph update failed. Incorrect data tag.");
344  m_Data[data.first]->import(&(*data.second)[0], range.GetLength(), range.GetFrom());
345  }
346  m_DataMap.set_range(range.GetFrom(), range.GetTo());
347 }
348 
350 {
351 
353  x_Sync(icache_pool);
354 
355  CObjPoolGuard<TGraphCachePool> icache(icache_pool);
356  for (auto&& data : m_Data) {
357  string data_key = m_NetCacheKey + "_" + data.first;
358  x_WriteData(*icache, data_key, *data.second);
359  }
361  x_WriteMap(*icache, tb);
362 
363  auto elapsed = sw.Elapsed();
364  if (elapsed > 2000) {
365  auto diag = GetDiagContext().Extra();
366  diag.Print("icache_save_delay", elapsed);
367  }
368 }
369 
370 
372 {
373  static const int kRemoteDataCheckInterval = 600; // (Seconds) How often to check remote data
375  if (!is_http)
376  return;
377  lock_guard<mutex> guard(m_DataMutex);
378  m_IsCacheGood = true;
380  if (m_LastChecked == 0) {
382  if (!m_LastModified.empty())
383  m_LastChecked = t.GetTimeT();
384 
385  } else if ((t.GetTimeT() - m_LastChecked) > kRemoteDataCheckInterval) {
387  m_IsCacheGood = false;
388  m_DataMap.clear();
389  for (auto&& data : m_Data) {
390  data.second->set(0, data.second->size());
391  }
392  auto diag = GetDiagContext().Extra();
393  diag.Print("Remote_data_updated", m_RemotePath);
394  }
395  m_LastChecked = t.GetTimeT();
396  }
397 }
398 
400 {
401  unique_ptr<bm::bvector<>::statistics> st_op(new bm::bvector<>::statistics);
403  struct bm::bvector<>::statistics st;
404  m_DataMap.calc_stat(&st);
405  unique_ptr<unsigned char> smem(new unsigned char[st.max_serialize_mem]);
406  auto slen = bm::serialize(m_DataMap, smem.get(), tb);
407  CWStream w(icache.GetWriteStream(m_NetCacheKey, 0, NcbiEmptyString), 0, 0, CRWStreambuf::fOwnWriter);
408  int val = static_cast<int>(m_SessionTag.size());
409  w.write((const char*)&val, sizeof(int));
410  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write Session tag size: " + m_NetCacheKey);
411  w.write(m_SessionTag.c_str(), m_SessionTag.size());
412  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write session tag: " + m_NetCacheKey);
413  val = static_cast<int>(m_RemotePath.size());
414  w.write((const char*)&val, sizeof(int));
415  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write remote path length: " + m_NetCacheKey);
416  w.write((const char*)m_RemotePath.c_str(), m_RemotePath.size());
417  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write remote path length: " + m_NetCacheKey);
418  val = static_cast<int>(m_LastModified.size());
419  w.write((const char*)&val, sizeof(int));
420  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write last_modified: " + m_NetCacheKey);
421  w.write((const char*)m_LastModified.c_str(), val);
422  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write last_modified: " + m_NetCacheKey);
423  w.write((const char*)&m_LastChecked, sizeof(time_t));
424  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write last_checked: " + m_NetCacheKey);
425  w.write((const char*)smem.get(), slen);
426  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write vector: " + m_NetCacheKey);
427  w.flush();
428 
429 }
430 
431 void CSparseGraph::x_WriteData(ICache& icache, const string& data_key, TSparseVector& sv)
432 {
433  sv.optimize();
435  bm::sparse_vector_serialize(sv, sv_lay);
436  CWStream w(icache.GetWriteStream(data_key, 0, NcbiEmptyString), 0, 0, CRWStreambuf::fOwnWriter);
437  int val = static_cast<int>(m_SessionTag.size());
438  w.write((const char*)&val, sizeof(int));
439  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write session tag size: " + data_key);
440  w.write(m_SessionTag.c_str(), m_SessionTag.size());
441  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write session tag: " + data_key);
442  w.write((const char*)sv_lay.buf(), sv_lay.size());
443  if (!w) NCBI_THROW(CException, eUnknown, "Failed to write data: " + data_key);
444  w.flush();
445 }
446 
447 void CSparseGraph::x_ReadMap(ICache& icache, string& tag, string& remote_path, string& last_modified, time_t& last_checked, string& md5, bm::bvector<>& bv, bm::word_t* tb)
448 {
449  tag.clear();
450 
451  size_t size = icache.GetSize(m_NetCacheKey, 0, NcbiEmptyString);
452  size_t vector_offset = 0;
454  int val = 0;
455  {
456  is.read((char*)&val, sizeof(int));
457  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read tag lengh: " + m_NetCacheKey);
458  vector_offset += sizeof(int);
459  vector<char> buffer(val, 0);
460  is.read(&buffer[0], buffer.size());
461  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read tag: " + m_NetCacheKey);
462  tag.append(&buffer[0], buffer.size());
463  vector_offset += buffer.size();
464  }
465  is.read((char*)&val, sizeof(int));
466  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read remote path length: " + m_NetCacheKey);
467  vector_offset += sizeof(int);
468  remote_path.clear();
469  if (val > 0) {
470  vector_offset += val;
471  vector<char> buffer(val, 0);
472  is.read(&buffer[0], buffer.size());
473  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read remote path: " + m_NetCacheKey);
474  remote_path.append(&buffer[0], val);
475  val = 0;
476  }
477  is.read((char*)&val, sizeof(int));
478  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read last_modified length: " + m_NetCacheKey);
479 
480  vector_offset += sizeof(int);
481  last_modified.clear();
482  if (val > 0) {
483  vector_offset += val;
484  vector<char> buffer(val, 0);
485  is.read(&buffer[0], buffer.size());
486  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read last modified: " + m_NetCacheKey);
487  last_modified.append(&buffer[0], val);
488  val = 0;
489  }
490 
491  is.read((char*)&last_checked, sizeof(time_t));
492  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read last_checked: " + m_NetCacheKey);
493  vector_offset += sizeof(time_t);
494 
495  vector<char> buffer(size - vector_offset, 0);
496  is.read(&buffer[0], buffer.size());
497  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read vector: " + m_NetCacheKey);
498  bv.clear();
499  {
501  cs.AddChars((const char*)&buffer[0], buffer.size());
502  cs.GetMD5Digest(md5);
503  }
504  bm::deserialize(bv, (const unsigned char*)&buffer[0], tb);
505 }
506 
507 
508 bool CSparseGraph::x_ReadData(ICache& icache, const string& data_key, const string& tag, TSparseVector& sv, bm::word_t* tb)
509 {
510  size_t size = 0;
511  try {
512  size = icache.GetSize(data_key, 0, NcbiEmptyString);
513  } catch (exception& e) {
514  ERR_POST(Error << e.what());
515  return false;
516  }
517  CRStream is(icache.GetReadStream(data_key, 0, NcbiEmptyString), 0, 0, CRWStreambuf::fOwnReader);
518  int val = 0;
519  int vector_offset = 0;
520  is.read((char*)&val, sizeof(int));
521  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read tag lengh: " + data_key);
522  vector_offset += sizeof(int);
523  vector<char> buffer(val, 0);
524  is.read(&buffer[0], buffer.size());
525  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read tag: " + data_key);
526  string my_tag(&buffer[0], buffer.size());
527  if (my_tag != tag)
528  return false;
529  vector_offset += buffer.size();
530  buffer.resize(size - vector_offset);
531  is.read(&buffer[0], buffer.size());
532  if (!is) NCBI_THROW(CException, eUnknown, "Failed to read vector: " + data_key);
533  auto res = sparse_vector_deserialize(sv, (const unsigned char*)&buffer[0], tb);
534  if (res != 0) NCBI_THROW(CException, eUnknown, "Deserialization problem");
535  return true;
536 
537 }
538 
539 ///////////////////////////////////////////////////////////////////////////////
540 /// CPileUpGraph
541 
542 
543 string CPileUpGraph::CreateCacheKey(CScope& scope, const CSeq_id& seq_id, const string& remote_path)
544 {
545  auto id = CSparseGraph::CreateCompoundID(scope, seq_id, remote_path);
546  return id.ToString();
547 }
548 
549 CPileUpGraph::CPileUpGraph(const string& cache_key)
550 {
551  _ASSERT(!cache_key.empty());
552  if (cache_key.empty())
553  NCBI_THROW(CException, eUnknown, "Empty cache key");
554  CCompoundIDPool id_pool;
555  auto cid = id_pool.FromString(cache_key);
556  InitFromCompoundID(cid);
557 
558  m_NetCacheKey = "GR_";
560  m_NetCacheKey += "_";
561 
563  if (is_http) {
565  cs.AddLine(m_RemotePath);
566  m_NetCacheKey += cs.GetHexSum();
567  } else {
569  }
571 
572  _ASSERT(m_DataSize > 0);
573  m_Data.emplace("m", unique_ptr<TSparseVector>(new TSparseVector));
574  m_Data["m"]->resize(static_cast<TSparseVector::size_type>(m_DataSize));
575  m_Data.emplace("mm", unique_ptr<TSparseVector>(new TSparseVector));
576  m_Data["mm"]->resize(static_cast<TSparseVector::size_type>(m_DataSize));
577  m_Data.emplace("g", unique_ptr<TSparseVector>(new TSparseVector));
578  m_Data["g"]->resize(static_cast<TSparseVector::size_type>(m_DataSize));
579  m_Data.emplace("n", unique_ptr<TSparseVector>(new TSparseVector));
580  m_Data["n"]->resize(static_cast<TSparseVector::size_type>(m_DataSize));
581 }
582 
583 
585 {
586  if (window < 1.0) window = 1.0;
587  size_t num_pixels = (size_t)ceil(range.GetLength() / window);
588  stats.resize(num_pixels);
589 
590  auto start = range.GetFrom();
591  auto stop = range.GetToOpen();
592 
593  int chunk_size = min<int>(range.GetLength(), 1048576); // 1MB chunks
594  vector<TValType> matches(chunk_size, 0);
595  vector<TValType> mismatches(chunk_size, 0);
596  vector<TValType> gaps(chunk_size, 0);
597  vector<TValType> introns(chunk_size, 0);
598  {
599  lock_guard<mutex> guard(m_DataMutex);
600  m_Data["m"]->decode(&matches[0], start, chunk_size);
601  m_Data["mm"]->decode(&mismatches[0], start, chunk_size);
602  m_Data["g"]->decode(&gaps[0], start, chunk_size);
603  m_Data["n"]->decode(&introns[0], start, chunk_size);
604  }
605 
606  auto offset = start;
607  auto pos = start;
608  int stat_idx = 0;
609  double curr_pos = start;
610  while (pos < stop && stat_idx < num_pixels) {
611  curr_pos += window;
612  size_t end_idx = (size_t)floor(curr_pos + 0.5);
613  if (end_idx > stop)
614  end_idx = stop;
615  auto&& data = stats[stat_idx].m_Data;
616  int max_mismatch = -1;
617  int max_gap = -1;
618  int max_intron = -1;
619  int total = -1;
620  while (pos < end_idx) {
621  int index = pos - offset;
622  if (index >= chunk_size) {
623  {
624  chunk_size = min<int>(chunk_size, range.GetToOpen() - pos);
625  lock_guard<mutex> guard(m_DataMutex);
626  m_Data["m"]->decode(&matches[0], pos, chunk_size);
627  m_Data["mm"]->decode(&mismatches[0], pos, chunk_size);
628  m_Data["g"]->decode(&gaps[0], pos, chunk_size);
629  m_Data["n"]->decode(&introns[0], pos, chunk_size);
630  }
631  offset = pos;
632  index = 0;
633  }
634  int mismatch = static_cast<int>(mismatches[index]);
635  int match = static_cast<int>(matches[index]);
636  int gap = static_cast<int>(gaps[index]);
637  int intron = static_cast<int>(introns[index]);
638  total = max<int>(total, match + mismatch + gap + intron);
639  max_mismatch = max<int>(max_mismatch, mismatch);
640  max_gap = max<int>(max_gap, gap);
641  max_intron = max<int>(max_intron, intron);
642  ++pos;
643  }
644  // total is total number of rows in the given range
645  // we take the number of matches as total - max error
646  // where error is mismatch, gap or intron
647  int max_error = max(max_gap, max(max_mismatch, max_intron));
648  data[CAlnStatConfig::eStat_Match] = total - max_error;
649  data[CAlnStatConfig::eStat_Mismatch] = max_mismatch;
650  data[CAlnStatConfig::eStat_Gap] = max_gap;
651  data[CAlnStatConfig::eStat_Intron] = max_intron;
653 
654  ++stat_idx;
655  }
656 }
657 
658 
660 {
661  {
662  lock_guard<mutex> guard(m_DataMutex);
663  size_t num_bits = m_DataMap.count_range(range.GetFrom(), range.GetTo());
664  if (num_bits < range.GetLength())
665  return -1;
666  }
667 
668  vector<TValType> matches;
669  vector<TValType> mismatches;
670  vector<TValType> gaps;
671  vector<TValType> introns;
672  int total = 0;
673  auto from = range.GetFrom();
674  auto to = range.GetTo();
675  static const int kChunkSize = 1048576;
676  while (from < to) {
677  // read 1MB chunks
678  TSeqRange r(from, min<int>(from + kChunkSize, to));
679  matches.resize(r.GetLength());
680  mismatches.resize(r.GetLength());
681  gaps.resize(r.GetLength());
682  introns.resize(r.GetLength());
683  m_Data["m"]->decode(&matches[0], r.GetFrom(), r.GetLength());
684  m_Data["mm"]->decode(&mismatches[0], r.GetFrom(), r.GetLength());
685  m_Data["g"]->decode(&gaps[0], r.GetFrom(), r.GetLength());
686  m_Data["n"]->decode(&introns[0], r.GetFrom(), r.GetLength());
687 
688  int sub_total = 0;
689  for (size_t i = 0; i < r.GetLength(); ++i) {
690  if (i % 150 == 0) {
691  total += sub_total;
692  sub_total = 0;
693  }
694  sub_total = max<int>(sub_total, static_cast<int>(matches[i] + mismatches[i] + gaps[i] + introns[i]));
695  }
696  total += sub_total;
697  from = r.GetToOpen();
698  }
699  return total;
700 }
701 
ncbi::TMaskedQueryRegions mask
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Serialization / compression of bvector<>. Set theoretical operations on compressed BLOBs.
Serialization for sparse_vector<>
Checksum and hash calculation classes.
@ eStat_Intron
intron (for mRNA-to-genome alignments)
@ eStat_Mismatch
mismatches (A+G+T+C - matches)
@ eStat_Total
total alignment count at this base (A+G+T+C+Gap)
vector< SStatStruct > TStatVec
CChecksum – Checksum calculator.
Definition: checksum.hpp:302
Pool of recycled CCompoundID objects.
CCompoundID NewID(ECompoundIDClass new_id_class)
Create and return a new CCompoundID objects.
CCompoundID FromString(const string &cid)
Unpack the base64-encoded ID and return a CCompoundID object for field extraction.
Base64-encoded ID string that contains extractable typed fields.
void AppendID(Uint8 id)
Append an eCIT_ID field at the end of this compound ID.
Guard that can be used to automatically return object to the pool after leaving some scope.
Definition: obj_pool.hpp:198
void UpdateAlignStats(const TSeqRange &range, float window, CAlnStatGlyph::TStatVec &stats)
CPileUpGraph(const string &data_key)
static string CreateCacheKey(objects::CScope &scope, const objects::CSeq_id &seq_id, const string &remote_path)
CPileUpGraph.
int GetNumberOfReads(const TSeqRange &range)
Note about the "buf_size" parameter for streams in this API.
Definition: rwstream.hpp:122
@ fOwnReader
Own the underlying reader.
Definition: rwstreambuf.hpp:66
@ fOwnWriter
Own the underlying writer.
Definition: rwstreambuf.hpp:67
CScope –.
Definition: scope.hpp:92
CSparseGraph.
string m_RemotePath
path to remote data or srz accession
time_t m_LastChecked
timestamp: last time when m_LastModified was checked
void x_WriteMap(ICache &icache, bm::word_t *tb=0)
string m_NetCacheKey
data access key
static CCompoundID CreateCompoundID(objects::CScope &scope, const objects::CSeq_id &seq_id, const string &remote_path)
CSparseGraph.
void x_Sync(TGraphCachePool &icache_pool)
void x_WriteData(ICache &icache, const string &data_key, TSparseVector &sv)
void x_CheckRemoteData()
virtual void Init(TGraphCachePool &icache_pool)
string m_LastModified
timestamp of the remote data last modified date as reported by www server
void Update(const TSeqRange &range, const TUpdateMap &update)
virtual void Save(TGraphCachePool &icache_pool)
mutex m_DataMutex
data access mutex
map< string, unique_ptr< TSparseVector > > m_Data
map of sparse vectors, the key is used as a part of suffix of NetCache key (m_NetCacheKey + "_" + this ...
void x_ReadMap(ICache &icache, string &tag, string &remote_path, string &last_modified, time_t &last_checked, string &md5, bm::bvector<> &bv, bm::word_t *tb=0)
string m_SeqId
NCBI seq_id, e.g. NC_000001.
CTime m_LastSynced
size_t m_DataSize
size of each vector of m_Data
bm::bvector m_DataMap
bvector shows the regions m_Data exist
void InitFromCompoundID(CCompoundID id)
bool m_IsCacheGood
false value indicates that remote data were updated and the cache was reset
string m_SessionTag
A current session tag, used to separate session netcache blobs from the blobs saved by other sessions...
bool x_ReadData(ICache &icache, const string &data_key, const string &tag, TSparseVector &sv, bm::word_t *tb=0)
void GetMissingRegions(const TSeqRange &range, vector< TSeqRange > &missing_ranges)
CStopWatch –.
Definition: ncbitime.hpp:1937
CTime –.
Definition: ncbitime.hpp:296
Writer-based output stream.
Definition: rwstream.hpp:171
BLOB cache read/write/maintenance interface.
Definition: icache.hpp:64
virtual size_t GetSize(const string &key, TBlobVersion version, const string &subkey)=0
Check if BLOB exists, return BLOB size.
virtual IWriter * GetWriteStream(const string &key, TBlobVersion version, const string &subkey, unsigned int time_to_live=0, const string &owner=kEmptyStr)=0
Return sequential stream interface to write BLOB data.
virtual IReader * GetReadStream(const string &key, TBlobVersion version, const string &subkey)=0
Return sequential stream interface to read BLOB data.
void optimize(bm::word_t *temp_block=0, optmode opt_mode=opt_compress, statistics *stat=0)
Optimize memory bitvector's memory allocation.
Definition: bm.h:3635
size_type count_range(size_type left, size_type right, const rs_index_type &rs_idx) const noexcept
Returns count of 1 bits in the given range [left..right] Uses rank-select index to accelerate the sea...
Definition: bm.h:3516
bvector< Alloc > & set_range(size_type left, size_type right, bool value=true)
Sets all bits in the specified closed interval [left,right] Interval must be inside the bvector's siz...
Definition: bm.h:2368
void clear(const size_type *ids, size_type ids_size, bm::sort_order so=bm::BM_UNKNOWN)
clear list of bits in this bitset
Definition: bm.h:4149
void calc_stat(struct bm::bvector< Alloc >::statistics *st) const noexcept
Calculates bitvector statistics.
Definition: bm.h:3978
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
bvector_type::size_type size_type
Definition: bmsparsevec.h:92
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
static const int chunk_size
@ eCIT_Host
Definition: compound_id.hpp:70
@ eCIT_SeqID
Definition: compound_id.hpp:79
@ eCIT_Integer
Definition: compound_id.hpp:64
@ eCIC_GenericID
Definition: compound_id.hpp:51
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
int offset
Definition: replacements.h:160
static void md5(const char *src, const char *out)
Definition: challenge.c:77
char data[12]
Definition: iconv.c:80
string GetHexSum(void) const
Return string with checksum in hexadecimal form.
Definition: checksum.hpp:353
void AddLine(const char *line, size_t len)
Definition: checksum.hpp:609
void GetMD5Digest(unsigned char digest[16]) const
Return calculated MD5 digest.
Definition: checksum.hpp:637
void AddChars(const char *str, size_t len)
Update current control sum with data provided.
Definition: checksum.hpp:602
CDiagContext_Extra & Print(const string &name, const string &value)
The method does not print the argument, but adds it to the string.
Definition: ncbidiag.cpp:2622
CDiagContext & GetDiagContext(void)
Get diag context instance.
Definition: logging.cpp:818
CDiagContext_Extra Extra(void) const
Create a temporary CDiagContext_Extra object.
Definition: ncbidiag.hpp:2095
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
static bool IfModifiedSince(const string &url, string &last_modified)
Works with HTTP(s) protocol returns true if url was modified since last_modified date if true last_mo...
Definition: url_utils.cpp:111
static void GetLastModified(const string &url, string &last_modified)
HTTP: Returns header's Last-Modified in the last_modified parameter FTP: Returns MD5 of first 512 byt...
Definition: url_utils.cpp:61
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
CConstRef< CSeq_id > GetSeqId(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CSeq_id_Handle GetAccVer(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get accession.version Seq-id Returns null CSeq_id_Handle if the sequence is not found or if it doesn'...
Definition: scope.cpp:413
TSeqPos GetSequenceLength(const CSeq_id &id, TGetFlags flags=0)
Get sequence length Return kInvalidSeqPos if sequence is not found.
Definition: scope.cpp:769
static TPid GetPid(void)
Get process identifier (pid) for the current process.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiEmptyString
Definition: ncbistr.hpp:122
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
double DiffNanoSecond(const CTime &t) const
Difference in nanoseconds from specified time.
Definition: ncbitime.hpp:2411
CTime & AddHour(int hours=1, EDaylight adl=eDaylightDefault)
Add specified hours and adjust for daylight saving time.
Definition: ncbitime.hpp:2371
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1941
size_t serialize(const BV &bv, unsigned char *buf, bm::word_t *temp_block=0, unsigned serialization_flags=0)
Saves bitvector into memory.
Definition: bmserial.h:3071
size_t deserialize(BV &bv, const unsigned char *buf, bm::word_t *temp_block=0, const bm::bv_ref_vector< BV > *ref_vect=0)
Bitvector deserialization from a memory BLOB.
Definition: bmserial.h:3137
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
int sparse_vector_deserialize(SV &sv, const unsigned char *buf, bm::word_t *temp_block=0)
Deserialize sparse vector.
static CStopWatch sw
int i
if(yy_accept[yy_current_state])
const size_t kChunkSize
Definition: na_utils.cpp:587
unsigned int word_t
Definition: bmconst.h:39
bm::id_t bvector_size_type
Definition: bm.h:103
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
T max(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Reader-writer based streams.
USING_SCOPE(objects)
static void s_GenerateSessionTag(string &session_tag)
static void s_GetMD5Tag(bm::bvector<> &bv, string &tag, bm::word_t *tb=0)
Statistical information about bitset's memory allocation details.
Definition: bm.h:125
layout class for serialization buffer structure
const unsigned char * buf() const noexcept
Return serialization buffer pointer.
size_t size() const noexcept
return current serialized size
#define _ASSERT
string GetMD5Digest(const CChecksum &cs)
Definition: wx_utils.cpp:211
Modified on Sat May 18 11:35:31 2024 by modify_doxy.py rev. 669887