NCBI C++ ToolKit
bamindex.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bamindex.cpp 96857 2022-05-19 14:59:20Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description:
29  * Access to BAM index files
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/bam/bamread.hpp> // for CBamException
37 #include <util/compress/zlib.hpp>
38 #include <corelib/rwstream.hpp>
39 #include <util/util_exception.hpp>
40 #include <util/timsort.hpp>
44 
45 #include <strstream>
46 
47 #ifndef NCBI_THROW2_FMT
48 # define NCBI_THROW2_FMT(exception_class, err_code, message, extra) \
49  throw NCBI_EXCEPTION2(exception_class, err_code, FORMAT(message), extra)
50 #endif
51 
52 
54 
55 //#define NCBI_USE_ERRCODE_X BAM2Graph
56 //NCBI_DEFINE_ERR_SUBCODE_X(6);
57 
59 
60 class CSeq_entry;
61 
62 static const size_t kGZipMagicLength = 2;
63 static const char kGZipMagic[] = "\x1f\x8b";
64 
65 static const char kBamExt[] = ".bam";
66 
67 static const size_t kIndexMagicLength = 4;
68 static const char kBaiExt[] = ".bai";
69 static const char kIndexMagicBAI[] = "BAI\1";
70 #ifdef BAM_SUPPORT_CSI
71 static const char kCsiExt[] = ".csi";
72 static const char kIndexMagicCSI[] = "CSI\1";
73 NCBI_PARAM_DECL(bool, BAM, PREFER_CSI);
74 NCBI_PARAM_DEF_EX(bool, BAM, PREFER_CSI, false, eParam_NoThread, BAM_PREFER_CSI);
75 #endif
76 static const float kEstimatedCompression = 0.25;
77 
78 static inline
79 void s_Read(CNcbiIstream& in, char* dst, size_t len)
80 {
81  while ( len ) {
82  in.read(dst, len);
83  if ( !in ) {
84  NCBI_THROW(CIOException, eRead, "Read failure");
85  }
86  size_t cnt = in.gcount();
87  len -= cnt;
88  dst += cnt;
89  }
90 }
91 
92 
93 static inline
94 const char* s_Read(const char*& buffer_ptr, const char* buffer_end, size_t len)
95 {
96  const char* ret_ptr = buffer_ptr;
97  const char* ret_end = ret_ptr + len;
98  if ( ret_end > buffer_end ) {
99  NCBI_THROW(CIOException, eRead, "BAM index EOF");
100  }
101  buffer_ptr = ret_end;
102  return ret_ptr;
103 }
104 
105 
106 static inline
107 void s_Read(CBGZFStream& in, char* dst, size_t len)
108 {
109  while ( len ) {
110  size_t cnt = in.Read(dst, len);
111  len -= cnt;
112  dst += cnt;
113  }
114 }
115 
116 
117 static inline
118 void s_ReadString(CBGZFStream& in, string& ret, size_t len)
119 {
120  ret.resize(len);
121  s_Read(in, &ret[0], len);
122 }
123 
124 
125 static inline
126 void s_ReadMagic(CBGZFStream& in, const char* magic)
127 {
128  _ASSERT(strlen(magic) == 4);
129  char buf[4];
130  s_Read(in, buf, 4);
131  if ( memcmp(buf, magic, 4) != 0 ) {
132  NCBI_THROW_FMT(CBGZFException, eFormatError,
133  "Bad file magic: "<<NStr::PrintableString(string(buf, buf+4)));
134  }
135 }
136 
137 
138 static inline
140 {
141  char buf[4];
142  s_Read(in, buf, 4);
143  return SBamUtil::MakeUint4(buf);
144 }
145 
146 
147 static inline
149 {
150  return int32_t(s_ReadUInt32(in));
151 }
152 
153 
154 static inline
156 {
157  char buf[8];
158  s_Read(in, buf, 8);
159  return SBamUtil::MakeUint8(buf);
160 }
161 
162 
163 static inline
165 {
166  return CBGZFPos(s_ReadUInt64(in));
167 }
168 
169 
170 static inline
172 {
173  CBGZFPos beg = s_ReadFilePos(in);
174  CBGZFPos end = s_ReadFilePos(in);
175  return CBGZFRange(beg, end);
176 }
177 
178 
179 static inline
181 {
182  char buf[4];
183  s_Read(in, buf, 4);
184  return SBamUtil::MakeUint4(buf);
185 }
186 
187 
188 static inline
190 {
191  return int32_t(s_ReadUInt32(in));
192 }
193 
194 
195 /////////////////////////////////////////////////////////////////////////////
196 // SBamIndexBinInfo
197 /////////////////////////////////////////////////////////////////////////////
198 
199 
201  SBamIndexParams params)
202 {
203  m_Bin = s_ReadUInt32(in);
204 #ifdef BAM_SUPPORT_CSI
205  if ( params.is_CSI ) {
207  }
208  else {
209  m_Overlap = CBGZFPos();
210  }
211 #endif
212  int32_t n_chunks = s_ReadInt32(in);
213  m_Chunks.resize(n_chunks);
214  for ( int32_t i_chunk = 0; i_chunk < n_chunks; ++i_chunk ) {
215  m_Chunks[i_chunk] = s_ReadFileRange(in);
216  }
217 }
218 
219 
220 const char* SBamIndexBinInfo::Read(const char* ptr, const char* end,
221  SBamIndexParams params)
222 {
223  size_t n_chunks;
224 #ifdef BAM_SUPPORT_CSI
225  if ( params.is_CSI ) {
226  const char* header = s_Read(ptr, end, 16);
227  m_Bin = SBamUtil::MakeUint4(header);
229  n_chunks = SBamUtil::MakeUint4(header+12);
230  }
231  else {
232  const char* header = s_Read(ptr, end, 8);
233  m_Bin = SBamUtil::MakeUint4(header);
234  m_Overlap = CBGZFPos();
235  n_chunks = SBamUtil::MakeUint4(header+4);
236  }
237 #endif
238  m_Chunks.reserve(n_chunks);
239  const char* data = s_Read(ptr, end, n_chunks*16);
240  for ( size_t i = 0; i < n_chunks; ++i ) {
241  Uint8 start = SBamUtil::MakeUint8(data+i*16);
242  Uint8 end = SBamUtil::MakeUint8(data+i*16+8);
243  m_Chunks.push_back(CBGZFRange(CBGZFPos(start), CBGZFPos(end)));
244  }
245  return ptr;
246 }
247 
248 
249 /////////////////////////////////////////////////////////////////////////////
250 // SBamIndexRefIndex
251 /////////////////////////////////////////////////////////////////////////////
252 
253 
254 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
255 inline
257 {
258  pair<TBinsIter, TBinsIter> ret;
259  if ( level == 0 ) {
260  ret.second = m_Bins.end();
261  }
262  else {
263  ret.second = lower_bound(m_Bins.begin(), m_Bins.end(), GetBinNumberBase(level-1));
264  }
265  ret.first = lower_bound(m_Bins.begin(), ret.second, GetBinNumberBase(level));
266  return ret;
267 }
268 
270  bool operator()(const CBGZFPos p1, const SBamIndexBinInfo& p2) const
271  {
272  return p1 < p2.GetStartFilePos();
273  }
274  bool operator()(const SBamIndexBinInfo& p1, const CBGZFPos p2) const
275  {
276  return p1.GetStartFilePos() < p2;
277  }
278 };
279 
280 
282  bool operator()(const CBGZFPos p1, const CBGZFRange& p2) const
283  {
284  return p1 < p2.second;
285  }
286  bool operator()(const CBGZFRange& p1, const CBGZFPos p2) const
287  {
288  return p1.second < p2;
289  }
290 };
291 
292 
294 {
295  if ( length != kInvalidSeqPos ) {
296  TSeqPos rounded_length = (length+GetMinBinSize()-1)&~(GetMinBinSize()-1);
297  m_EstimatedLength = max(m_EstimatedLength, rounded_length);
298  }
299 }
300 
301 
303 {
304  if ( bin.m_Chunks.size() != 2 ) {
305  NCBI_THROW(CBamException, eInvalidBAIFormat,
306  "Bad unmapped bin format");
307  }
308  m_UnmappedChunk = bin.m_Chunks[0];
309  m_MappedCount = bin.m_Chunks[1].first.GetVirtualPos();
310  m_UnmappedCount = bin.m_Chunks[1].second.GetVirtualPos();
311  bin.m_Chunks.erase(bin.m_Chunks.begin(), bin.m_Chunks.begin()+2);
312  return bin.m_Chunks.empty();
313 }
314 
315 
317 {
318  if ( bin.m_Chunks.empty() ) {
319  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
320  "No chunks in bin "<<bin.m_Bin);
321  }
322  for ( size_t i = 0; i < bin.m_Chunks.size(); ++i ) {
323  auto& range = bin.m_Chunks[i];
324  if ( range.first >= range.second ) {
325  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
326  "Empty BAM BGZF range in bin "<<bin.m_Bin<<
327  ": "<<range.first<<" - "<<range.second);
328  }
329  if ( i && bin.m_Chunks[i-1].second >= range.first ) {
330  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
331  "Overlapping BAM BGZF ranges in bin "<<bin.m_Bin<<
332  ": "<<bin.m_Chunks[i-1].second<<" over "<<range.first);
333  }
334  }
335  auto range = bin.GetSeqRange(*this);
336  TSeqPos min_end = range.GetFrom();
337  if ( range.GetLength() != GetMinBinSize() ) {
338  // at least 1 sub-range
339  min_end += range.GetLength() >> kLevelStepBinShift;
340  }
341  // at least 1 minimal page
342  min_end += GetMinBinSize();
344 }
345 
346 
348  SBamIndexParams params,
349  int32_t ref_index)
350 {
351  SBamIndexParams::operator=(params);
353  size_t bin_count = 0;
354  int32_t n_bin = s_ReadInt32(in);
355  m_Bins.resize(n_bin);
356  const TBin kPseudoBin = GetPseudoBin();
357  for ( int32_t i_bin = 0; i_bin < n_bin; ++i_bin ) {
358  SBamIndexBinInfo& bin = m_Bins[bin_count++];
359  bin.Read(in, *this);
360  if ( bin.m_Bin == kPseudoBin && ProcessPseudoBin(bin) ) {
361  --bin_count;
362  continue;
363  }
364  ProcessBin(bin);
365  }
366  m_Bins.resize(bin_count);
367  gfx::timsort(m_Bins.begin(), m_Bins.end());
368 
369  if ( !is_CSI ) {
370  int32_t n_intv = s_ReadInt32(in);
371  m_Overlaps.resize(n_intv);
372  for ( int32_t i = 0; i < n_intv; ++i ) {
374  }
376  }
378 }
379 
380 
381 const char* SBamIndexRefIndex::Read(const char* buffer_ptr, const char* buffer_end,
382  SBamIndexParams params,
383  int32_t ref_index)
384 {
385  SBamIndexParams::operator=(params);
387  size_t bin_count = 0;
388  size_t n_bin = SBamUtil::MakeUint4(s_Read(buffer_ptr, buffer_end, 4));
389  m_Bins.resize(n_bin);
390  const TBin kPseudoBin = GetPseudoBin();
391  for ( size_t i_bin = 0; i_bin < n_bin; ++i_bin ) {
392  SBamIndexBinInfo& bin = m_Bins[bin_count++];
393  buffer_ptr = bin.Read(buffer_ptr, buffer_end, *this);
394  if ( bin.m_Bin == kPseudoBin && ProcessPseudoBin(bin) ) {
395  --bin_count;
396  continue;
397  }
398  ProcessBin(bin);
399  }
400  m_Bins.resize(bin_count);
401  gfx::timsort(m_Bins.begin(), m_Bins.end());
402 
403  if ( !is_CSI ) {
404  size_t n_intv = SBamUtil::MakeUint4(s_Read(buffer_ptr, buffer_end, 4));
405  m_Overlaps.resize(n_intv);
406  const char* data = s_Read(buffer_ptr, buffer_end, n_intv*8);
407  for ( size_t i = 0; i < n_intv; ++i ) {
409  }
411  }
413  return buffer_ptr;
414 }
415 
416 
417 static
420  const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
421 {
422  if ( iters.first == iters.second ) {
424  }
425  else if ( !params.is_CSI && iters.first->m_Bin == params.kMaxBinNumber ) {
426  // special case for BAI index of too long sequence
428  }
429  else {
430  return iters.first->GetSeqRange(params);
431  }
432 }
433 
434 
435 static
436 CBGZFPos
437 s_GetOverlap(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
438 {
439  if ( iters.first == iters.second ) {
440  return CBGZFPos::GetInvalid();
441  }
442  else {
443  return iters.first->m_Overlap;
444  }
445 }
446 
447 
448 static
449 CBGZFPos
450 s_GetFilePos(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
451 {
452  auto iter = iters.first;
453  if ( iter == iters.second ) {
454  return CBGZFPos::GetInvalid();
455  }
456  return iter->GetStartFilePos();
457 }
458 
459 
460 static
461 CBGZFPos
462 s_GetNextFilePos(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
463 {
464  auto iter = iters.first;
465  if ( iter == iters.second ) {
466  return CBGZFPos::GetInvalid();
467  }
468  ++iter;
469  if ( iter == iters.second ) {
470  return CBGZFPos::GetInvalid();
471  }
472  return iter->GetStartFilePos();
473 }
474 
475 /*
476 static
477 CBGZFPos
478 s_GetFileEnd(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
479 {
480  if ( iters.first == iters.second ) {
481  return CBGZFPos::GetInvalid();
482  }
483  else {
484  return iters.first->GetEndFilePos();
485  }
486 }
487 */
488 
489 NCBI_PARAM_DECL(int, BAM, OVERLAP_MODE);
490 NCBI_PARAM_DEF_EX(int, BAM, OVERLAP_MODE, 2, eParam_NoThread, BAM_OVERLAP_MODE);
491 
492 
493 static int s_GetOverlapMode()
494 {
495  static int value = NCBI_PARAM_TYPE(BAM, OVERLAP_MODE)::GetDefault();
496  return value;
497 }
498 
499 
500 vector<TSeqPos> SBamIndexRefIndex::GetAlnOverStarts() const
501 {
502 if ( s_GetOverlapMode() == 0 ) {
504  vector<TSeqPos> aln_over_starts(nBins);
505  for ( TSeqPos i = 0; i < nBins; ++i ) {
506  // set limits
507  COpenRange<TSeqPos> ref_range;
509  CBGZFRange limit = GetLimitRange(ref_range, eSearchByOverlap);
510  CBGZFPos min_fp = CBGZFPos::GetInvalid();
511  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
512  TBin bin = GetBinNumberBase(level) + (i>>(level*kLevelStepBinShift));
513  auto it = lower_bound(m_Bins.begin(), m_Bins.end(), bin);
514  if ( it != m_Bins.end() && it->m_Bin == bin ) {
515  for ( auto c : it->m_Chunks ) {
516  if ( c.first >= min_fp ) {
517  break;
518  }
519  if ( c.first >= limit.second ) {
520  break;
521  }
522  if ( c.second <= limit.first ) {
523  continue;
524  }
525  if ( c.first < limit.first ) {
526  c.first = limit.first;
527  }
528  _ASSERT(c.first >= limit.first);
529  _ASSERT(c.first < limit.second);
530  _ASSERT(c.first < c.second);
531  if ( c.first < min_fp ) {
532  min_fp = c.first;
533  }
534  break;
535  }
536  }
537  }
538  TSeqPos min_aln_start;
539  if ( min_fp.IsInvalid() ) {
540  min_aln_start = ref_range.GetFrom();
541  }
542  else {
543  min_aln_start = 0;
544  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
545  auto level_bins = GetLevelBins(level);
546  auto it = lower_bound(level_bins.first, level_bins.second, min_fp, PByStartFilePos());
547  if ( it == level_bins.first ) {
548  continue;
549  }
550  --it;
551  min_aln_start = max(min_aln_start, it->GetSeqRange(*this).GetFrom());
552  if ( it->GetEndFilePos() > min_fp ) {
553  // found exact bin containing the alignment
554  // since we start with the narrowest range there is no point to continue
555  break;
556  }
557  }
558  }
559  aln_over_starts[i] = min_aln_start;
560  }
561  return aln_over_starts;
562 }
563 else if ( s_GetOverlapMode() == 1 ) {
564  size_t nBins = m_Overlaps.size();
565  vector<TSeqPos> aln_over_starts(nBins);
566  // next_bin_it points to a low-level bin that starts after current position
567  auto bin_it_start = GetLevelBins(0).first, next_bin_it = bin_it_start;
568  for ( size_t i = 0; i < nBins; ++i ) {
569  TSeqPos ref_pos = TSeqPos(i * GetMinBinSize());
570  CBGZFPos min_fp = m_Overlaps[i];
571  if ( !min_fp ) {
572  // no overspan
573  aln_over_starts[i] = ref_pos;
574  continue;
575  }
576  // update next_bin_it to point to the next bin after current refseq position
577  while ( next_bin_it != m_Bins.end() && next_bin_it->GetStartFilePos() <= min_fp ) {
578  ++next_bin_it;
579  }
580  TSeqPos min_aln_start = i? aln_over_starts[i-1]: 0;
581  bool inside_min_bin = false;
582  if ( next_bin_it != bin_it_start ) {
583  auto& bin = next_bin_it[-1];
584  _ASSERT(bin.GetStartFilePos() <= min_fp);
585  inside_min_bin = bin.GetEndFilePos() > min_fp;
586  min_aln_start = max(min_aln_start, (bin.m_Bin-GetBinNumberBase(0))*GetMinBinSize());
587  }
588  if ( min_aln_start+GetMinBinSize() < ref_pos && !inside_min_bin ) {
589  // more than 1 page before -> lookup all levels for better estimate
590  for ( TIndexLevel level = 1; level <= GetMaxIndexLevel(); ++level ) {
591  auto level_bins = GetLevelBins(level);
592  auto it = upper_bound(level_bins.first, level_bins.second, min_fp, PByStartFilePos());
593  if ( it == level_bins.first ) {
594  continue;
595  }
596  --it;
597  min_aln_start = max(min_aln_start, it->GetSeqRange(*this).GetFrom());
598  if ( it->GetEndFilePos() > min_fp ) {
599  // found exact bin containing the alignment
600  // since we start with the narrowest range there is no point to continue
601  break;
602  }
603  }
604  }
605  if ( min_aln_start > ref_pos ) {
606  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
607  "Inconsistent linear index at ref pos "<<ref_pos<<
608  ": align starts after end bin start "<<min_aln_start);
609  }
610  aln_over_starts[i] = min_aln_start;
611  }
612  return aln_over_starts;
613 }
614 else {
616  vector<TSeqPos> aln_over_starts(nBins);
617  vector<pair<TBinsIter, TBinsIter>> levelBins;
618  vector<COpenRange<TSeqPos>> levelBinSeqRange;
619  vector<CBGZFPos> levelPrevOverlap;
620  if ( is_CSI ) {
621  levelPrevOverlap.resize(GetMaxIndexLevel()+1);
622  }
623  CBGZFPos minfp = CBGZFPos::GetInvalid();
624  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
625  levelBins.push_back(GetLevelBins(level));
626  levelBinSeqRange.push_back(s_GetSeqRange(*this, levelBins.back()));
627  minfp = min(minfp, s_GetFilePos(levelBins.back()));
628  }
629  if ( minfp.IsInvalid() ) {
630  // no file data -> no overlaps
631  return aln_over_starts;
632  }
633  map<TSeqPos, CBGZFPos> sp2minfp; // map seqpos to the earliest filepos it could appear
634  for ( auto& bin : m_Bins ) {
635  auto sp = bin.GetSeqRange(*this).GetFrom();
636  auto fp = bin.GetStartFilePos();
637  auto ins = sp2minfp.insert(make_pair(sp, fp));
638  if ( !ins.second ) {
639  // update with minimum
640  auto& minfp = ins.first->second;
641  minfp = min(minfp, fp);
642  }
643  }
644  map<CBGZFPos, TSeqPos> fp2sp; // map filepos to seqpos that certainly appear at or after
645  for ( auto p : sp2minfp ) {
646  auto ins = fp2sp.insert(make_pair(p.second, p.first));
647  if ( ins.second ) {
648  auto iter = ins.first;
649  ++iter;
650  while ( iter != fp2sp.end() && iter->second < p.first ) {
651  iter = fp2sp.erase(iter);
652  }
653  }
654  }
655  for ( TSeqPos b = 0; b < nBins; ++b ) {
656  TSeqPos seqPos = b << GetMinLevelBinShift();
657  CBGZFPos overlap_fp = CBGZFPos::GetInvalid();
658  if ( b < m_Overlaps.size() && m_Overlaps[b] ) { // BAI overlap table
659  overlap_fp = m_Overlaps[b];
660  }
661  CBGZFPos prev_overlap_fp; // max overlap of previous bins on all levels
662  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
663  // advance to next bin on level if necessary
664  while ( levelBinSeqRange[level].GetToOpen() <= seqPos ) {
665  if ( is_CSI ) {
666  levelPrevOverlap[level] = s_GetOverlap(levelBins[level]);
667  }
668  ++(levelBins[level].first);
669  levelBinSeqRange[level] = s_GetSeqRange(*this, levelBins[level]);
670  }
671  if ( is_CSI ) {
672  CBGZFPos overlap_fp;
673  if ( seqPos >= levelBinSeqRange[level].GetFrom() ) {
674  overlap_fp = s_GetOverlap(levelBins[level]);
675  }
676  else {
677  overlap_fp = levelPrevOverlap[level];
678  }
679  prev_overlap_fp = max(prev_overlap_fp, overlap_fp);
680  }
681  }
682  CBGZFPos found_fp = CBGZFPos::GetInvalid(); // earliest filepos of overlapping alignment
683  CBGZFPos limit_fp = CBGZFPos::GetInvalid(); // filepos after this page to break the lookup
684  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
685  // advance to next bin on level if necessary
686  while ( levelBinSeqRange[level].GetToOpen() <= seqPos ) {
687  if ( is_CSI ) {
688  levelPrevOverlap[level] = s_GetOverlap(levelBins[level]);
689  }
690  ++(levelBins[level].first);
691  levelBinSeqRange[level] = s_GetSeqRange(*this, levelBins[level]);
692  }
693  if ( seqPos < levelBinSeqRange[level].GetFrom() ) {
694  // not in the bin yet
695  continue;
696  }
697  if ( is_CSI && overlap_fp.IsInvalid() ) {
698  // CSI overlap info from bin
699  overlap_fp = max(prev_overlap_fp, levelBins[level].first->m_Overlap);
700  }
701  // update limit file pos from next bin on the level
702  limit_fp = min(limit_fp, s_GetNextFilePos(levelBins[level]));
703  // locate overlapping chunk
704  auto& chunks = levelBins[level].first->m_Chunks;
705  auto it = upper_bound(chunks.begin(), chunks.end(), overlap_fp, PByEndFilePos());
706  if ( it != chunks.end() && it->first < min(found_fp, limit_fp) ) {
707  // found suitable chunk
708  found_fp = max(it->first, overlap_fp);
709  if ( found_fp <= overlap_fp ) {
710  // found minimum, no more searching
711  break;
712  }
713  }
714  }
715  if ( found_fp.IsInvalid() ) {
716  aln_over_starts[b] = seqPos;
717  }
718  else {
719  // find minimal seq pos at this file pos
720  auto iter = fp2sp.upper_bound(found_fp);
721  _ASSERT(iter != fp2sp.begin());
722  // it could be after current page
723  auto osp = min(seqPos, prev(iter)->second);
724  if ( b > 0 ) {
725  // overlap may overshoot in case of empty previous bins
726  // that were explicitly marked as having no overlap
727  osp = max(osp, aln_over_starts[b-1]);
728  }
729  aln_over_starts[b] = osp;
730  }
731  }
732  return aln_over_starts;
733 }
734 }
735 
736 
737 vector<TSeqPos> SBamIndexRefIndex::GetAlnOverEnds() const
738 {
739  TSeqPos bin_size = GetMinBinSize();
740  vector<TSeqPos> starts = GetAlnOverStarts();
741  TSeqPos count = TSeqPos(starts.size());
742  vector<TSeqPos> ends(count);
743  TSeqPos si = 0, ei = 0;
744  for ( ; ei < count; ++ei ) {
745  while ( si*bin_size < starts[ei] ) {
746  ends[si++] = ei*bin_size-1;
747  }
748  }
749  while ( si < count ) {
750  ends[si++] = ei*bin_size-1;
751  }
752  return ends;
753 }
754 
755 
756 inline
758 {
760 }
761 
762 
763 inline
764 Uint8 s_EstimatedSize(CBGZFPos file_pos1, CBGZFPos file_pos2)
765 {
766  if ( file_pos1 >= file_pos2 ) {
767  // empty file region
768  return 0;
769  }
770  Uint8 pos1 = s_EstimatedPos(file_pos1);
771  Uint8 pos2 = s_EstimatedPos(file_pos2);
772  if ( pos1 < pos2 )
773  return pos2 - pos1;
774  else
775  return 1; // report non-zero size of non-empty region
776 }
777 
778 
779 inline
781 {
782  return s_EstimatedSize(range.first, range.second);
783 }
784 
785 
787  ESearchMode search_mode) const
788 {
789  CBGZFRange limit;
790  if ( m_EstimatedLength < ref_range.GetToOpen() ) {
791  ref_range.SetToOpen(m_EstimatedLength);
792  }
793  if ( ref_range.Empty() ) {
794  return limit;
795  }
796 
797  if ( search_mode == eSearchByOverlap ) {
798  if ( !m_Overlaps.empty() ) {
799  TBin beg_bin_offset = GetBinNumberOffset(ref_range.GetFrom(), 0);
800  // start limit is from intervals and beg position
801  if ( beg_bin_offset < m_Overlaps.size() ) {
802  limit.first = m_Overlaps[beg_bin_offset];
803  }
804  }
805 #ifdef BAM_SUPPORT_CSI
806  else if ( is_CSI ) {
807  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
808  TBin bin_num = GetBinNumber(ref_range.GetFrom(), level);
809  TBin bin_num_last = GetBinNumber(ref_range.GetTo(), level);
810  auto bins = GetLevelBins(level);
811  auto it = lower_bound(bins.first, bins.second, bin_num);
812  if ( it != bins.second && it->m_Bin <= bin_num_last ) {
813  if ( it->m_Overlap ) {
814  if ( !limit.first || it->m_Overlap < limit.first ) {
815  limit.first = it->m_Overlap;
816  }
817  }
818  if ( it->m_Bin == bin_num ) {
819  break;
820  }
821  }
822  }
823  }
824 #endif
825  }
826  else {
827  // start limit is determined by alignment start position
828  // for each level we'll take end position of previous existing bin
829  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
830  TBin bin_num = GetBinNumber(ref_range.GetFrom(), level);
831  auto bins = GetLevelBins(level);
832  auto it = lower_bound(bins.first, bins.second, bin_num);
833  if ( it != bins.first ) {
834  limit.first = max(limit.first, prev(it)->GetEndFilePos());
835  }
836  }
837  }
838  limit.second = CBGZFPos::GetInvalid();
839  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
840  // next bin on each level is clearly after the range
841  TBin bin_num = GetBinNumber(ref_range.GetTo(), level)+1;
842  auto bins = GetLevelBins(level);
843  auto it = lower_bound(bins.first, bins.second, bin_num);
844  if ( it != bins.second ) {
845  limit.second = min(limit.second, it->GetStartFilePos());
846  }
847  }
848  return limit;
849 }
850 
851 
852 pair<SBamIndexRefIndex::TBin, SBamIndexRefIndex::TBin>
854  TIndexLevel index_level) const
855 {
856  pair<TBin, TBin> bin_range;
857  bin_range.first = GetBinNumber(ref_range.GetFrom(), index_level);
858  if ( IsOverflowBin(bin_range.first, index_level) ) {
859  // position is beyond index limit (can happen with BAI index)
860  // only min and max levels exist, and max level is always root bin
861  if ( index_level == GetMaxIndexLevel() ) {
862  bin_range.first = kMaxBinNumber;
863  bin_range.second = kMaxBinNumber;
864  return bin_range;
865  }
866  else if ( index_level != kMinBinIndexLevel ) {
867  // start bin is neither min nor max level - no bins to scan
868  bin_range.second = bin_range.first-1;
869  return bin_range;
870  }
871  }
872  bin_range.second = GetBinNumber(ref_range.GetTo(), index_level);
873  if ( IsOverflowBin(bin_range.second, index_level) ) {
874  // position is beyond index limit (can happen with BAI index)
875  // only min and max levels exist, and max level is always root bin
876  if ( index_level == GetMaxIndexLevel() ) {
877  bin_range.second = kMaxBinNumber;
878  }
879  else if ( index_level != kMinBinIndexLevel ) {
880  // end bin is neither min nor max level - scan to the end of bins of the level
881  bin_range.second = GetLastBin(index_level);
882  }
883  }
884  return bin_range;
885 }
886 
887 
888 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
889 SBamIndexRefIndex::AddLevelFileRanges(vector<CBGZFRange>& ranges,
890  CBGZFRange limit_file_range,
891  pair<TBin, TBin> bin_range) const
892 {
893  TBinsIter first = lower_bound(m_Bins.begin(), m_Bins.end(), bin_range.first);
894  TBinsIter it = first;
895  for ( ; it != m_Bins.end() && it->m_Bin <= bin_range.second; ++it ) {
896  for ( auto c : it->m_Chunks ) {
897  if ( c.first < limit_file_range.first ) {
898  c.first = limit_file_range.first;
899  }
900  if ( limit_file_range.second && limit_file_range.second < c.second ) {
901  c.second = limit_file_range.second;
902  }
903  if ( c.first < c.second ) {
904  ranges.push_back(c);
905  }
906  }
907  }
908  return make_pair(first, it);
909 }
910 
911 
912 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
913 SBamIndexRefIndex::GetBinsIterRange(pair<TBin, TBin> bin_range) const
914 {
915  TBinsIter first = lower_bound(m_Bins.begin(), m_Bins.end(), bin_range.first);
916  TBinsIter it = upper_bound(first, m_Bins.end(), bin_range.second);
917  return make_pair(first, it);
918 }
919 
920 
922  size_t block_beg, block_end; // range of low-level pages
923  size_t fill_beg_to, fill_end_to; // uncertainty about start and end positions
924  CBGZFPos file_beg, file_end; // included BAM file range
925 
926  static
927  void x_AddDataSize(vector<Uint8>& vv, size_t beg_pos, size_t end_pos,
929  {
930  _ASSERT(beg_pos < vv.size());
931  _ASSERT(beg_pos <= end_pos);
932  _ASSERT(end_pos < vv.size());
933  Uint8 file_size = s_EstimatedSize(file_beg, file_end);
934  if ( !file_size ) {
935  return;
936  }
937  size_t page_count = end_pos - beg_pos + 1;
938  Uint8 add_size = (file_size + page_count/2) / page_count;
939  if ( add_size ) {
940  for ( size_t i = beg_pos; i <= end_pos; ++i ) {
941  vv[i] += add_size;
942  }
943  }
944  else {
945  // rounding produced zero, but the original data size was non-zero,
946  // so make resulting estimated sizes at least non-zero
947  for ( size_t i = beg_pos; i <= end_pos; ++i ) {
948  if ( !vv[i] ) {
949  vv[i] = 1;
950  }
951  }
952  }
953  }
954 
955  void Init(size_t index)
956  {
957  block_beg = block_end = index;
958  }
959 
960  void InitData(vector<Uint8>& vv, const SBamIndexBinInfo& bin)
961  {
962  if ( bin.m_Chunks.empty() ) {
963  return;
964  }
965  size_t i = block_beg;
966  _ASSERT(block_end == i);
967  _ASSERT(!file_end);
969  file_beg = bin.GetStartFilePos();
970  file_end = bin.GetEndFilePos();
973  }
974  void ExpandData(vector<Uint8>& vv, const SBamIndexBinInfo& bin)
975  {
976  if ( bin.m_Chunks.empty() ) {
977  return;
978  }
979  CBGZFPos new_file_beg = bin.GetStartFilePos();
980  CBGZFPos new_file_end = bin.GetEndFilePos();
981  _ASSERT(new_file_beg < new_file_end);
982  if ( !file_end ) {
983  // start BAM file range
984  x_AddDataSize(vv, block_beg, block_end, new_file_beg, new_file_end);
985  file_beg = new_file_beg;
986  file_end = new_file_end;
987  // pages are completely uncertain
988  fill_beg_to = block_end; // beg/end cross assignment is intentional
989  fill_end_to = block_beg; // beg/end cross assignment is intentional
990  }
991  else {
992  // expand BAM file range
993  if ( new_file_beg < file_beg ) {
994  x_AddDataSize(vv, block_beg, fill_beg_to, new_file_beg, file_beg);
995  file_beg = new_file_beg;
996  }
997  if ( new_file_end > file_end ) {
998  x_AddDataSize(vv, fill_end_to, block_end, file_end, new_file_end);
999  file_end = new_file_end;
1000  }
1001  }
1002  }
1003 
1005  {
1006  }
1007  SBamRangeBlock(vector<Uint8>& vv,
1008  const vector<SBamRangeBlock>& bb, size_t bb_beg, size_t bb_end)
1009  {
1010  for ( size_t i = bb_beg; i <= bb_end; ++i ) {
1011  const SBamRangeBlock& b = bb[i];
1012  if ( !b.file_end ) {
1013  continue;
1014  }
1015  if ( !file_end ) {
1016  // start BAM file range
1017  *this = b;
1018  }
1019  else {
1020  // include gap
1021  _ASSERT(file_end <= b.file_beg);
1022  x_AddDataSize(vv, fill_end_to, b.fill_beg_to, file_end, b.file_beg);
1023  fill_end_to = b.fill_end_to;
1024  file_end = b.file_end;
1025  }
1026  }
1027  block_beg = bb[bb_beg].block_beg;
1028  block_end = bb[bb_end].block_end;
1029  }
1030 };
1031 
1032 
1034 {
1035  CBGZFRange range;
1036  range.first = CBGZFPos::GetInvalid();
1037  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
1038  auto bins = GetLevelBins(level);
1039  if ( bins.first != bins.second ) {
1040  CBGZFPos pos_beg = bins.first->GetStartFilePos();
1041  CBGZFPos pos_end = prev(bins.second)->GetEndFilePos();
1042  if ( pos_beg < range.first ) {
1043  range.first = pos_beg;
1044  }
1045  if ( pos_end > range.second ) {
1046  range.second = pos_end;
1047  }
1048  }
1049  }
1050  if ( range.first.IsInvalid() ) {
1051  range.first = CBGZFPos();
1052  }
1053  return range;
1054 }
1055 
1056 
1058 {
1059  size_t bin_count;
1060  if ( seqlen == kInvalidSeqPos ) {
1061  seqlen = m_EstimatedLength;
1062  }
1063  else {
1064  seqlen = max(seqlen, m_EstimatedLength);
1065  }
1066  bin_count = (seqlen+GetMinBinSize()-1) >> GetMinLevelBinShift();
1067  _ASSERT(bin_count);
1068  vector<Uint8> vv(bin_count);
1069  // init blocks
1070  vector<SBamRangeBlock> bb(bin_count);
1071  size_t bb_end = bin_count-1;
1072  for ( size_t i = 0; i <= bb_end; ++i ) {
1073  bb[i].Init(i);
1074  }
1075  // fill smallest bins
1076  {
1077  TBin bin_number_base = GetBinNumberBase(0);
1078  auto level_bins = GetLevelBins(0);
1079  for ( auto bin_it = level_bins.first; bin_it != level_bins.second; ++bin_it ) {
1080  size_t i = bin_it->m_Bin - bin_number_base;
1081  _ASSERT(i <= bb_end);
1082  bb[i].InitData(vv, *bin_it);
1083  }
1084  }
1085  for ( TIndexLevel level = 1; level <= GetMaxIndexLevel(); ++level ) {
1086 
1087  // merge
1088  for ( size_t i = 0; (i<<kLevelStepBinShift) <= bb_end; ++i ) {
1089  size_t src_beg = i<<kLevelStepBinShift;
1090  size_t src_end = min(bb_end, src_beg+(1<<kLevelStepBinShift)-1);
1091  bb[i] = SBamRangeBlock(vv, bb, src_beg, src_end);
1092  }
1093  bb_end >>= kLevelStepBinShift;
1094 
1095  // add next level bins
1096  TBin bin_number_base = GetBinNumberBase(level);
1097  auto level_bins = GetLevelBins(level);
1098  for ( auto bin_it = level_bins.first; bin_it != level_bins.second; ++bin_it ) {
1099  size_t i = bin_it->m_Bin - bin_number_base;
1100  _ASSERT(i <= bb_end);
1101  bb[i].ExpandData(vv, *bin_it);
1102  }
1103  }
1104  _ASSERT(bb_end == 0);
1105  return vv;
1106 }
1107 
1108 
1110  TIndexLevel max_index_level) const
1111 {
1112  vector<uint64_t> vv(((m_EstimatedLength-GetMinBinSize()) >> GetLevelBinShift(min_index_level))+1);
1113  for ( TIndexLevel level = min_index_level; level <= max_index_level; ++level ) {
1114  uint32_t vv_bin_shift = (level-min_index_level)*kLevelStepBinShift;
1115  uint32_t vv_bin_count = 1 << vv_bin_shift;
1116  auto level_bins = GetLevelBins(level);
1117  TBin bin_base = GetBinNumberBase(level);
1118  for ( auto it = level_bins.first; it != level_bins.second; ++it ) {
1119  uint64_t value = 0;
1120  for ( auto& c : it->m_Chunks ) {
1121  value += s_EstimatedSize(c);
1122  }
1123  if ( !value ) {
1124  continue;
1125  }
1126  uint32_t pos = (it->m_Bin - bin_base) << vv_bin_shift;
1127  _ASSERT(pos < vv.size());
1128  uint64_t add = value;
1129  uint32_t cnt = min(vv_bin_count, uint32_t(vv.size()-pos));
1130  if ( cnt > 1 ) {
1131  // distribute
1132  add = (add+cnt/2)/cnt;
1133  }
1134  if ( !add ) {
1135  for ( uint32_t i = 0; i < cnt; ++i ) {
1136  vv[pos+i] = max(uint64_t(1), vv[pos+i]);
1137  }
1138  }
1139  else {
1140  for ( uint32_t i = 0; i < cnt; ++i ) {
1141  vv[pos+i] += add;
1142  }
1143  }
1144  }
1145  }
1146  return vv;
1147 }
1148 
1149 
1150 /////////////////////////////////////////////////////////////////////////////
1151 // CCached
1152 /////////////////////////////////////////////////////////////////////////////
1153 
1154 
1155 static size_t ReadVDBFile(AutoArray<char>& data, const string& path)
1156 {
1157  CBamVDBFile file(path);
1158  size_t fsz = file.GetSize();
1159  data.reset(new char[fsz]);
1160  file.ReadExactly(0, data.get(), fsz);
1161  return fsz;
1162 }
1163 
1164 
1165 /////////////////////////////////////////////////////////////////////////////
1166 // CBamIndex
1167 /////////////////////////////////////////////////////////////////////////////
1168 
1169 
1171  : m_UnmappedCount(0),
1172  m_TotalReadBytes(0),
1173  m_TotalReadSeconds(0)
1174 {
1175 }
1176 
1177 
1178 CBamIndex::CBamIndex(const string& index_file_name)
1179  : m_UnmappedCount(0),
1180  m_TotalReadBytes(0),
1181  m_TotalReadSeconds(0)
1182 {
1183  Read(index_file_name);
1184 }
1185 
1186 
1188 {
1189 }
1190 
1191 
// Load the index from the file named index_file_name.
// Remembers the file name, drops any previously loaded references and the
// unmapped count, reads the entire file into memory with ReadVDBFile(), and
// hands the buffer to Read(const char*, size_t) for parsing.
1192 void CBamIndex::Read(const string& index_file_name)
1193 {
1194  m_FileName = index_file_name;
1195  m_Refs.clear();
1196  m_UnmappedCount = 0;
1197 
1198  AutoArray<char> data;
1200  size_t size = ReadVDBFile(data, index_file_name);
// At debug level 3 and above, log the index size in MB and a read speed.
// NOTE(review): the speed divides by m_TotalReadSeconds; the statement that
// updates that member is not visible in this excerpt - confirm it is set
// before this point.
1203  if ( CBamDb::GetDebugLevel() >= 3 ) {
1204  LOG_POST("BAM: read index "<<size/double(1<<20)<<" MB"
1205  " speed: "<<size/(m_TotalReadSeconds*(1<<20))<<" MB/s");
1206  }
1207  Read(data.get(), size);
1208 }
1209 
// IReader implementation that serves bytes from a memory range supplied by
// the caller. The class only stores a pointer and the remaining byte count;
// nothing in this class copies or frees the underlying buffer.
1211 class CMemoryReader : public IReader
1212 {
1213 public:
// Start reading at ptr with size bytes available.
1214  CMemoryReader(const char* ptr, size_t size)
1215  : m_Ptr(ptr),
1216  m_Size(size)
1217  {
1218  }
1219 
// Read method (its first signature line, which declares `buf`, is not present
// in this excerpt). Copies at most `count` bytes of the remaining data into
// `buf`, advances the read position, and stores the number of bytes copied in
// *bytes_read when that pointer is non-null.
// Returns eRW_Eof (with *bytes_read = 0) when no data remains, otherwise
// eRW_Success.
1221  size_t count,
1222  size_t* bytes_read)
1223  {
1224  if ( !m_Size ) {
1225  if ( bytes_read ) {
1226  *bytes_read = 0;
1227  }
1228  return eRW_Eof;
1229  }
// never copy more than what is left in the buffer
1230  count = min(m_Size, count);
1231  memcpy(buf, m_Ptr, count);
1232  m_Ptr += count;
1233  m_Size -= count;
1234  if ( bytes_read ) {
1235  *bytes_read = count;
1236  }
1237  return eRW_Success;
1238  }
1239 
// Report the number of bytes still available; count must be non-null
// because it is dereferenced unconditionally.
1240  ERW_Result PendingCount(size_t* count)
1241  {
1242  *count = m_Size;
1243  return eRW_Success;
1244  }
1245 
1246 private:
// current read position inside the caller's buffer
1247  const char* m_Ptr;
// number of bytes remaining after m_Ptr
1248  size_t m_Size;
1249 };
1251 
1253 {
1254 #ifdef BAM_SUPPORT_CSI
1255  is_CSI = false;
1257  depth = kBAI_depth;
1258 #endif
1259 
1260  char magic[kIndexMagicLength];
1261  s_Read(in, magic, kIndexMagicLength);
1262  if ( memcmp(magic, kIndexMagicBAI, kIndexMagicLength) == 0 ) {
1263  // BAI, no extra parameters
1264  }
1265 #ifdef BAM_SUPPORT_CSI
1266  else if ( memcmp(magic, kIndexMagicCSI, kIndexMagicLength) == 0 ) {
1267  // CSI
1268  is_CSI = true;
1270  depth = s_ReadUInt32(in);
1271  size_t l_aux = s_ReadUInt32(in);
1272  while ( l_aux ) {
1273  char buf[256];
1274  size_t count = min(l_aux, sizeof(buf));
1275  s_Read(in, buf, count);
1276  l_aux -= count;
1277  }
1278  }
1279 #endif
1280  else {
1281  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1282  "Bad file magic: "<<NStr::PrintableString(string(magic, magic+kIndexMagicLength)));
1283  }
1284  int32_t n_ref = s_ReadInt32(in);
1285  m_Refs.resize(n_ref);
1286  for ( int32_t i_ref = 0; i_ref < n_ref; ++i_ref ) {
1287  m_Refs[i_ref].Read(in, *this, i_ref);
1288  }
1289  streampos extra_pos = in.tellg();
1290  in.seekg(0, ios::end);
1291  streampos end_pos = in.tellg();
1292  in.seekg(extra_pos);
1293 
1294  if ( end_pos-extra_pos >= 8 ) {
1296  extra_pos += 8;
1297  }
1298  if ( end_pos != extra_pos ) {
1299  ERR_POST(Warning<<
1300  "Extra "<<(end_pos-extra_pos)<<" bytes in BAM index");
1301  }
1302 }
1303 
1304 
// Parse an index image held in memory: buffer_size bytes starting at buffer_ptr.
// If the buffer starts with the gzip magic, it is wrapped in a decompressing
// stream and parsing is delegated to the stream-based Read() overload.
// Otherwise the buffer is parsed in place: file magic (BAI, or CSI when
// BAM_SUPPORT_CSI is defined), reference count, the per-reference indexes, and
// an optional trailing 8-byte unmapped count.
// Throws CBamException(eInvalidBAIFormat) when the magic is not recognized.
// NOTE(review): s_Read() is defined elsewhere; the code below relies on it
// returning a pointer to the requested bytes and advancing buffer_ptr past
// them - presumably it also checks against buffer_end; confirm.
1305 void CBamIndex::Read(const char* buffer_ptr, size_t buffer_size)
1306 {
1307  if ( buffer_size >= kGZipMagicLength &&
1308  memcmp(buffer_ptr, kGZipMagic, kGZipMagicLength) == 0 ) {
1309  // gzipped index
// The CMemoryReader is created with `new` and passed with fOwnReader,
// presumably transferring its ownership to the stream - confirm.
1310  unique_ptr<CNcbiIstream> data_stream =
1311  make_unique<CRStream>(new CMemoryReader(buffer_ptr, buffer_size),
1312  0, nullptr, CRWStreambuf::fOwnReader);
1313  unique_ptr<CNcbiIstream> z_stream =
1314  make_unique<CCompressionIStream>(*data_stream,
1317  Read(*z_stream);
1318  return;
1319  }
1320 
1321  const char* buffer_end = buffer_ptr + buffer_size;
1322 
// start from BAI settings; they are overridden below if the magic says CSI
1323 #ifdef BAM_SUPPORT_CSI
1324  is_CSI = false;
1326  depth = kBAI_depth;
1327 #endif
1328 
1329  const char* magic = s_Read(buffer_ptr, buffer_end, kIndexMagicLength);
1330  if ( memcmp(magic, kIndexMagicBAI, kIndexMagicLength) == 0 ) {
1331  // BAI
1332  }
1333 #ifdef BAM_SUPPORT_CSI
1334  else if ( memcmp(magic, kIndexMagicCSI, kIndexMagicLength) == 0 ) {
1335  // CSI
// CSI header: three 4-byte fields (min_shift, depth, l_aux) followed by
// l_aux bytes of auxiliary data, which are skipped here
1336  is_CSI = true;
1337  const char* header = s_Read(buffer_ptr, buffer_end, 12);
1338  min_shift = SBamUtil::MakeUint4(header);
1339  depth = SBamUtil::MakeUint4(header+4);
1340  auto l_aux = SBamUtil::MakeUint4(header+8);
1341  s_Read(buffer_ptr, buffer_end, l_aux);
1342  }
1343 #endif
1344  else {
1345  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1346  "Bad file magic: "<<NStr::PrintableString(string(magic, magic+kIndexMagicLength)));
1347  }
// 4-byte reference count, then one index record per reference; each
// record's Read() returns the position just past what it consumed
1348  const char* header = s_Read(buffer_ptr, buffer_end, 4);
1349  uint32_t n_ref = SBamUtil::MakeUint4(header);
1350  m_Refs.resize(n_ref);
1351  for ( uint32_t i = 0; i < n_ref; ++i ) {
1352  buffer_ptr = m_Refs[i].Read(buffer_ptr, buffer_end, *this, i);
1353  }
// optional trailer: 8-byte unmapped count, taken only if at least 8 bytes remain
1354  if ( buffer_end - buffer_ptr >= 8 ) {
1355  m_UnmappedCount = SBamUtil::MakeUint8(buffer_ptr);
1356  buffer_ptr += 8;
1357  }
// anything left over is reported as a warning, not an error
1358  if ( buffer_ptr != buffer_end ) {
1359  ERR_POST(Warning<<
1360  "Extra "<<(buffer_end-buffer_ptr)<<" bytes in BAM index");
1361  }
1362 }
1363 
1364 
1365 const SBamIndexRefIndex& CBamIndex::GetRef(size_t ref_index) const
1366 {
1367  if ( ref_index >= GetRefCount() ) {
1368  NCBI_THROW(CBamException, eInvalidArg,
1369  "Bad reference sequence index");
1370  }
1371  return m_Refs[ref_index];
1372 }
1373 
1374 
1376 {
1377  if ( GetRefCount() != header.GetRefCount() ) {
1378  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1379  "Wrong index ref count: "<<
1380  GetRefCount()<<" <> "<<header.GetRefCount());
1381  }
1382  for ( size_t i = 0; i < GetRefCount(); ++i ) {
1383  m_Refs[i].SetLengthFromHeader(header.GetRef(i).m_Length);
1384  }
1385 }
1386 
1387 
1389 {
1390  CBGZFRange total_range(CBGZFPos(-1), CBGZFPos(0));
1391  for ( auto& b : GetRef(ref_index).m_Bins ) {
1392  CBGZFPos start_pos = b.GetStartFilePos();
1393  if ( start_pos < total_range.first )
1394  total_range.first = start_pos;
1395  CBGZFPos end_pos = b.GetEndFilePos();
1396  if ( total_range.second < end_pos )
1397  total_range.second = end_pos;
1398  }
1399  return total_range;
1400 }
1401 
1402 
1403 static void sx_SetTitle(CSeq_graph& graph, CSeq_annot& annot,
1404  string title, string name)
1405 {
1406  if ( name.empty() ) {
1407  name = "BAM coverage";
1408  }
1409  if ( title.empty() ) {
1410  title = name;
1411  }
1412  graph.SetTitle(title);
1413  annot.SetNameDesc(name);
1414 }
1415 
1416 
1419  const string& ref_name,
1420  const string& seq_id,
1421  const string& annot_name,
1422  TIndexLevel min_index_level,
1423  TIndexLevel max_index_level) const
1424 {
1425  CSeq_id id(seq_id);
1426  return MakeEstimatedCoverageAnnot(header, ref_name, id, annot_name, min_index_level, max_index_level);
1427 }
1428 
1429 
1432  const string& ref_name,
1433  const CSeq_id& seq_id,
1434  const string& annot_name,
1435  TIndexLevel min_index_level,
1436  TIndexLevel max_index_level) const
1437 {
1438  size_t ref_index = header.GetRefIndex(ref_name);
1439  if ( ref_index == size_t(-1) ) {
1440  NCBI_THROW_FMT(CBamException, eInvalidArg,
1441  "Cannot find RefSeq: "<<ref_name);
1442  }
1443  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name,
1444  header.GetRefLength(ref_index), min_index_level, max_index_level);
1445 }
1446 
1447 
1450  const string& seq_id,
1451  const string& annot_name,
1452  TSeqPos length,
1453  TIndexLevel min_index_level,
1454  TIndexLevel max_index_level) const
1455 {
1456  CSeq_id id(seq_id);
1457  return MakeEstimatedCoverageAnnot(ref_index, id, annot_name, length, min_index_level, max_index_level);
1458 }
1459 
1460 
1463  const string& seq_id,
1464  const string& annot_name,
1465  TIndexLevel min_index_level,
1466  TIndexLevel max_index_level) const
1467 {
1468  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name, kInvalidSeqPos, min_index_level, max_index_level);
1469 }
1470 
1471 
1474  const CSeq_id& seq_id,
1475  const string& annot_name,
1476  TIndexLevel min_index_level,
1477  TIndexLevel max_index_level) const
1478 {
1479  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name, kInvalidSeqPos, min_index_level, max_index_level);
1480 }
1481 
1482 
1483 vector<uint64_t>
1485  TIndexLevel min_index_level,
1486  TIndexLevel max_index_level) const
1487 {
1488  return GetRef(ref_index).CollectEstimatedCoverage(min_index_level, max_index_level);
1489 }
1490 
1491 
1494  const CSeq_id& seq_id,
1495  const string& annot_name,
1496  TSeqPos length,
1497  TIndexLevel min_index_level,
1498  TIndexLevel max_index_level) const
1499 {
1500  TSeqPos bin_size = GetBinSize(min_index_level);
1501  vector<uint64_t> vv = CollectEstimatedCoverage(ref_index, min_index_level, max_index_level);
1502  if ( vv.empty() ) vv.push_back(0);
1503  uint32_t count = uint32_t(vv.size());
1504  if ( length == 0 || length == kInvalidSeqPos ) {
1505  length = count*bin_size;
1506  }
1507 
1508  CRef<CSeq_annot> annot(new CSeq_annot);
1509  CRef<CSeq_graph> graph(new CSeq_graph);
1510  annot->SetData().SetGraph().push_back(graph);
1511  sx_SetTitle(*graph, *annot, annot_name, annot_name);
1512 
1513  graph->SetLoc().SetInt().SetId().Assign(seq_id);
1514  graph->SetLoc().SetInt().SetFrom(0);
1515  graph->SetLoc().SetInt().SetTo(length-1);
1516  graph->SetComp(bin_size);
1517  graph->SetNumval(count);
1518  CByte_graph& bgraph = graph->SetGraph().SetByte();
1519  vector<char>& bvalues = bgraph.SetValues();
1520  bvalues.resize(count);
1521  Uint1 bmax = 0;
1522  uint64_t max_value = *max_element(vv.begin(), vv.end());
1523  double mul = min(1., 255./max_value);
1524  for ( size_t i = 0; i < count; ++i ) {
1525  if ( auto v = vv[i] ) {
1526  Uint1 b = Uint1(v*mul+.5);
1527  // ensure non-zero value be still non-zero after scaling
1528  if ( !b ) {
1529  b = 1;
1530  }
1531  bvalues[i] = b;
1532  bmax = max(bmax, b);
1533  }
1534  }
1535  bgraph.SetAxis(0);
1536  bgraph.SetMin(1);
1537  bgraph.SetMax(bmax);
1538  if ( mul != 1 ) {
1539  graph->SetA(1/mul);
1540  }
1541  return annot;
1542 }
1543 
1544 
1545 /////////////////////////////////////////////////////////////////////////////
1546 // CBamHeader
1547 /////////////////////////////////////////////////////////////////////////////
1548 
1549 
1551 {
1552 }
1553 
1554 
1555 CBamHeader::CBamHeader(const string& bam_file_name)
1556 {
1557  Read(bam_file_name);
1558 }
1559 
1560 
1562 {
1563 }
1564 
1565 
1567 {
1568  int32_t l_name = s_ReadInt32(in);
1569  s_ReadString(in, m_Name, l_name);
1570  m_Name.resize(l_name-1);
1571  m_Length = s_ReadInt32(in);
1572 }
1573 
1574 
1575 void CBamHeader::Read(const string& bam_file_name)
1576 {
1577  CBGZFFile file(bam_file_name);
1578  CBGZFStream file_stream(file);
1579  Read(file_stream);
1580 }
1581 
1582 
1584 {
1585  m_RefByName.clear();
1586  m_Refs.clear();
1587  s_ReadMagic(stream, "BAM\1");
1588  int32_t l_text = s_ReadInt32(stream);
1589  s_ReadString(stream, m_Text, l_text);
1590  int32_t n_ref = s_ReadInt32(stream);
1591  m_Refs.resize(n_ref);
1592  for ( int32_t i_ref = 0; i_ref < n_ref; ++i_ref ) {
1593  m_Refs[i_ref].Read(stream);
1594  m_RefByName[m_Refs[i_ref].m_Name] = i_ref;
1595  }
1596  m_AlignStart = stream.GetSeekPos();
1597 }
1598 
1599 
1600 const SBamHeaderRefInfo& CBamHeader::GetRef(size_t ref_index) const
1601 {
1602  if ( ref_index >= GetRefCount() ) {
1603  NCBI_THROW(CBamException, eInvalidArg,
1604  "Bad reference sequence index");
1605  }
1606  return m_Refs[ref_index];
1607 }
1608 
1609 
1610 size_t CBamHeader::GetRefIndex(const string& name) const
1611 {
1612  auto iter = m_RefByName.find(name);
1613  if ( iter == m_RefByName.end() ) {
1614  return size_t(-1);
1615  }
1616  return iter->second;
1617 }
1618 
1620 {
1621  CTempString record;
1622  enum { eNone, eTag, eRecord, eValue} state = eNone;
1623  bool state_changed = true;
1624  const char *p, *p0, *pend;
1625 
1626  for (p = m_Text.data(), pend = p + m_Text.size(); p < pend; ++p) {
1627  if (state_changed) {
1628  state_changed = false;
1629  for (; p < pend && iswspace(*p); ++p)
1630  ;
1631  p0 = p;
1632  }
1633  if (*p == '@') {
1634  state = eTag;
1635  p0 = p;
1636  }
1637  else if (*p == ':') {
1638  if (state == eRecord) {
1639  record.assign(p0, p-p0);
1640  state = eValue;
1641  state_changed = true;
1642  p0 = p;
1643  }
1644  }
1645  else if ( iswspace(*p) ) {
1646  if (state == eTag) {
1647  records.push_back( TSBamRecord(string(p0, p-p0), TSBamTags()));
1648  state = eRecord;
1649  state_changed = true;
1650  }
1651  else if (state == eValue) {
1652  records.back().second[record] = string(p0, p-p0);
1653  state = eRecord;
1654  state_changed = true;
1655  }
1656  }
1657  }
1658  if (state == eValue) {
1659  records.back().second[record] = string(p0, p-p0);
1660  }
1661  return records.size();
1662 }
1663 
1664 
1665 /////////////////////////////////////////////////////////////////////////////
1666 // CBamFileRangeSet
1667 /////////////////////////////////////////////////////////////////////////////
1668 
1669 
1671 {
1672 }
1673 
1674 
1676  size_t ref_index,
1677  COpenRange<TSeqPos> ref_range,
1678  ESearchMode search_mode)
1679 {
1680  AddRanges(index, ref_index, ref_range, search_mode);
1681 }
1682 
1683 
1685  size_t ref_index,
1686  COpenRange<TSeqPos> ref_range,
1687  TIndexLevel min_level, TIndexLevel max_level,
1688  ESearchMode search_mode)
1689 {
1690  AddRanges(index, ref_index, ref_range, min_level, max_level, search_mode);
1691 }
1692 
1693 
1695  size_t ref_index,
1696  COpenRange<TSeqPos> ref_range,
1697  EIndexLevel min_level, EIndexLevel max_level,
1698  ESearchMode search_mode)
1699 {
1700  AddRanges(index, ref_index, ref_range, min_level, max_level, search_mode);
1701 }
1702 
1703 
1705 {
1706 }
1707 
1708 
1709 ostream& operator<<(ostream& out, const CBamFileRangeSet& ranges)
1710 {
1711  cout << '(';
1712  for ( auto& r : ranges ) {
1713  cout << " (" << r.first<<" "<<r.second<<")";
1714  }
1715  return cout << " )";
1716 }
1717 
1718 
1719 inline
1720 void CBamFileRangeSet::AddSortedRanges(const vector<CBGZFRange>& ranges)
1721 {
1722  for ( auto iter = ranges.begin(); iter != ranges.end(); ) {
1723  CBGZFPos start = iter->first, end = iter->second;
1724  for ( ++iter; iter != ranges.end() && !(end < iter->first); ++iter ) {
1725  if ( end < iter->second ) {
1726  end = iter->second;
1727  }
1728  }
1729  m_Ranges += CBGZFRange(start, end);
1730  }
1731 }
1732 
1733 
// Integer configuration parameter RANGES_MODE in section BAM; the definition
// passes 1 as the default value. Its value is read through s_GetRangesMode()
// and selects the file-range collection algorithm in AddRanges() (mode 0 vs.
// any other value).
// NOTE(review): the last macro argument (BAM_RANGES_MODE) is presumably the
// environment variable name - confirm against the NCBI_PARAM documentation.
1734 NCBI_PARAM_DECL(int, BAM, RANGES_MODE);
1735 NCBI_PARAM_DEF_EX(int, BAM, RANGES_MODE, 1, eParam_NoThread, BAM_RANGES_MODE);
1736 
1737 
1738 static int s_GetRangesMode()
1739 {
1740  static int value = NCBI_PARAM_TYPE(BAM, RANGES_MODE)::GetDefault();
1741  return value;
1742 }
1743 
1744 
1746  size_t ref_index,
1747  COpenRange<TSeqPos> ref_range,
1748  TIndexLevel min_index_level,
1749  TIndexLevel max_index_level,
1750  ESearchMode search_mode)
1751 {
1752  vector<CBGZFRange> ranges;
1753  const SBamIndexRefIndex& ref = index.GetRef(ref_index);
1754 if ( s_GetRangesMode() == 0 ) {
1755  // set limits
1756  CBGZFRange limit = ref.GetLimitRange(ref_range, search_mode);
1757  if ( ref_range.Empty() ) {
1758  return;
1759  }
1760  for ( TIndexLevel level = min_index_level; level <= index.GetMaxIndexLevel(); ++level ) {
1761  ref.AddLevelFileRanges(ranges, limit, index.GetBinRange(ref_range, level));
1762  }
1763 }
1764 else {
1766  // iterate index levels starting with 0 to set limits correctly
1767  // iterate index levels till the end because alignments may be moved up
1768  TSeqPos set_limit_by_overlap_at = 0;
1769  for ( TIndexLevel level = 0; level <= index.GetMaxIndexLevel(); ++level ) {
1770  // omit ranges from lower index levels because they contain only low-level alignments
1771  auto bin_range = index.GetBinRange(ref_range, level);
1772  pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter> iter_range;
1773  if ( level >= min_index_level ) {
1774  iter_range = ref.AddLevelFileRanges(ranges, limit, bin_range);
1775  _ASSERT(iter_range == ref.GetBinsIterRange(bin_range));
1776  }
1777  else {
1778  iter_range = ref.GetBinsIterRange(bin_range);
1779  }
1780  // set file range limit from overlap fields
1781  // the most limiting overlap is on the lowest existing index level, so set it once
1782  // this limit is valid for both search modes
1783  if ( index.is_CSI ) {
1784  // CSI overlaps are in bins
1785  auto first_bin = iter_range.first;
1786  if ( (first_bin == ref.m_Bins.end() ||
1787  first_bin->m_Bin != bin_range.first) &&
1788  first_bin != ref.m_Bins.begin() ) {
1789  --first_bin;
1790  }
1791  if ( first_bin != ref.m_Bins.end() &&
1792  first_bin->m_Bin <= bin_range.first &&
1793  first_bin->m_Bin >= index.GetFirstBin(level) ) {
1794  // the bin is at or before the first one and at the same level
1795  TSeqPos pos = first_bin->GetSeqRange(index).GetFrom();
1796  if ( pos > set_limit_by_overlap_at ) {
1797  // better limit
1798  set_limit_by_overlap_at = pos;
1799  limit.first = max(limit.first, first_bin->m_Overlap);
1800  }
1801  }
1802  }
1803  else {
1804  // BAI overlaps are in a separate array, reflecting min level
1805  if ( level == kMinBinIndexLevel && !ref.m_Overlaps.empty() ) {
1806  size_t bin_index = bin_range.first-index.GetFirstBin(kMinBinIndexLevel);
1807  if ( bin_index < ref.m_Overlaps.size() ) {
1808  limit.first = max(limit.first, ref.m_Overlaps[bin_index]);
1809  }
1810  }
1811  }
1812  // in eSearchByStart mode we can set lower limit of file positions
1813  // from the end of previous bin on the same level
1814  // these limits are combined
1815  if ( search_mode == eSearchByStart ) {
1816  // set file range limit from previous bins
1817  // limits from all levels matter, choose the most limiting one
1818  auto first_bin = iter_range.first;
1819  if ( first_bin != ref.m_Bins.begin() ) {
1820  auto prev_bin = prev(first_bin);
1821  _ASSERT(prev_bin->m_Bin < bin_range.first);
1822  if ( prev_bin->m_Bin >= index.GetFirstBin(level) ) {
1823  // prev bin is on the same level
1824  limit.first = max(limit.first, prev_bin->GetEndFilePos());
1825  }
1826  }
1827  }
1828  // in all search modes we can limit end of search range by next bin on the same level
1829  // update cutoff file pos from the first next bin
1830  auto next_bin = iter_range.second;
1831  if ( next_bin != ref.m_Bins.end() &&
1832  next_bin->m_Bin < index.GetFirstBin(level-1) ) {
1833  // next bin is on the same level
1834  limit.second = min(limit.second, next_bin->GetStartFilePos());
1835  }
1836  }
1837 }
1838  gfx::timsort(ranges.begin(), ranges.end());
1839  AddSortedRanges(ranges);
1840 }
1841 
1842 
1844  size_t ref_index,
1845  COpenRange<TSeqPos> ref_range,
1846  ESearchMode search_mode)
1847 {
1848  AddRanges(index, ref_index, ref_range, 0, index.GetMaxIndexLevel(), search_mode);
1849 }
1850 
1851 
1853  size_t ref_index,
1854  COpenRange<TSeqPos> ref_range,
1855  TIndexLevel index_level,
1856  ESearchMode search_mode)
1857 {
1858  AddRanges(index, ref_index, ref_range, index_level, index_level, search_mode);
1859 }
1860 
1861 
1863 {
1864  CBGZFRange whole;
1865  whole.first = header.GetAlignStart();
1866  whole.second = CBGZFPos::GetInvalid();
1867  m_Ranges += whole;
1868 }
1869 
1870 
1872 {
1873  Clear();
1874  AddWhole(header);
1875 }
1876 
1877 
1879  size_t ref_index,
1880  COpenRange<TSeqPos> ref_range,
1881  TIndexLevel min_index_level,
1882  TIndexLevel max_index_level,
1883  ESearchMode search_mode)
1884 {
1885  Clear();
1886  AddRanges(index, ref_index, ref_range, min_index_level, max_index_level, search_mode);
1887 }
1888 
1889 
1891  size_t ref_index,
1892  COpenRange<TSeqPos> ref_range,
1893  ESearchMode search_mode)
1894 {
1895  SetRanges(index, ref_index, ref_range, 0, index.GetMaxIndexLevel(), search_mode);
1896 }
1897 
1898 
1900  size_t ref_index,
1901  COpenRange<TSeqPos> ref_range,
1902  TIndexLevel index_level,
1903  ESearchMode search_mode)
1904 {
1905  SetRanges(index, ref_index, ref_range, index_level, index_level, search_mode);
1906 }
1907 
1908 
1910 {
1911  return s_EstimatedSize(range);
1912 }
1913 
1914 
1916 {
1917  Uint8 size = 0;
1918  for ( auto& c : m_Ranges ) {
1919  size += GetFileSize(c);
1920  }
1921  return size;
1922 }
1923 
1924 
1925 /////////////////////////////////////////////////////////////////////////////
1926 // CBamRawDb
1927 /////////////////////////////////////////////////////////////////////////////
1928 
1929 
1931 {
1932 }
1933 
1934 
1935 void CBamRawDb::Open(const string& bam_path)
1936 {
1937  m_File = new CBGZFFile(bam_path);
1938  CBGZFStream stream(*m_File);
1939  m_Header.Read(stream);
1940 }
1941 
1942 
1943 static void s_AddReplacedExt(vector<string>& dst,
1944  const string& base_name,
1945  CTempString old_ext,
1946  CTempString new_ext)
1947 {
1948  if ( NStr::EndsWith(base_name, old_ext) ) {
1949  dst.push_back(base_name.substr(0, base_name.size()-old_ext.size())+new_ext);
1950  }
1951 }
1952 
1953 
// Open a BAM file together with its index.
// When index_path is empty or equal to bam_path, a list of candidate index
// file names is derived from bam_path: the BAM path with an index extension
// appended, and the BAM path with its kBamExt suffix replaced by the index
// extension. With BAM_SUPPORT_CSI defined, the BAM/PREFER_CSI parameter decides
// whether the CSI candidates go before or after the BAI candidates.
// Otherwise index_path is the only candidate.
// Candidates are tried in order until one is read successfully; afterwards
// the BAM file itself is opened and its header is read.
1954 void CBamRawDb::Open(const string& bam_path, const string& index_path)
1955 {
1956  vector<string> index_name_candidates;
1957  if ( index_path.empty() || index_path == bam_path ) {
1958 #ifdef BAM_SUPPORT_CSI
1959  bool prefer_csi = NCBI_PARAM_TYPE(BAM, PREFER_CSI)::GetDefault();
1960  if ( prefer_csi ) {
1961  index_name_candidates.push_back(bam_path+kCsiExt);
1962  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kCsiExt);
1963  }
1964 #endif
1965  index_name_candidates.push_back(bam_path+kBaiExt);
1966  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kBaiExt);
1967 #ifdef BAM_SUPPORT_CSI
1968  if ( !prefer_csi ) {
1969  index_name_candidates.push_back(bam_path+kCsiExt);
1970  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kCsiExt);
1971  }
1972 #endif
1973  }
1974  else {
1975  index_name_candidates.push_back(index_path);
1976  }
1977  for ( size_t i = 0; i < index_name_candidates.size(); ++i ) {
1978  try {
1979  m_Index.Read(index_name_candidates[i]);
// first candidate that loads wins
1980  break;
1981  }
1982  catch ( CBamException& exc ) {
// A failure is swallowed only while further candidates remain; the failure
// on the last candidate is rethrown.
// NOTE(review): the second half of this condition is not present in this
// excerpt - presumably it tests the error code of exc; confirm.
1983  if ( i < index_name_candidates.size()-1 &&
1985  // try next index file name candidate
1986  continue;
1987  }
1988  else {
1989  throw;
1990  }
1991  }
1992  }
1993  m_File = new CBGZFFile(bam_path);
1995  CBGZFStream stream(*m_File);
1996  m_Header.Read(stream);
1998 }
1999 
2000 
2002 {
2003  // adjustments
2004  const double index_read_weight = 10;
2005  const Uint8 add_read_bytes = 100000; // 100KB
2006  const double add_read_bytes_per_second = 80e6; // 80 MBps
2007  const Uint8 add_unzip_bytes = 100000; // 100KB
2008  const double add_unzip_bytes_per_second = 80e6; // 80 MBps
2009 
2010  pair<Uint8, double> index_read_stat = m_Index.GetReadStatistics();
2011  pair<Uint8, double> data_read_stat = m_File->GetReadStatistics();
2012  pair<Uint8, double> data_unzip_stat = m_File->GetUncompressStatistics();
2013  Uint8 read_bytes =
2014  Uint8(index_read_stat.first*index_read_weight) +
2015  data_read_stat.first +
2016  add_read_bytes;
2017  double read_seconds =
2018  index_read_stat.second*index_read_weight +
2019  data_read_stat.second +
2020  add_read_bytes/add_read_bytes_per_second;
2021 
2022  Uint8 unzip_bytes = data_unzip_stat.first + add_unzip_bytes;
2023  double unzip_seconds = data_unzip_stat.second + add_unzip_bytes/add_unzip_bytes_per_second;
2024 
2025  return read_seconds/read_bytes + unzip_seconds/unzip_bytes;
2026 }
2027 
2028 
2029 /////////////////////////////////////////////////////////////////////////////
2030 // SBamAlignInfo
2031 /////////////////////////////////////////////////////////////////////////////
2032 
2034 {
2035  string ret;
2036  if ( uint32_t len = get_read_len() ) {
2037  ret.resize(len);
2038  char* dst = &ret[0];
2039  const char* src = get_read_ptr();
2040  for ( uint32_t len = get_read_len(); len; ) {
2041  char c = *src++;
2042  uint32_t b1 = (c >> 4)&0xf;
2043  uint32_t b2 = (c )&0xf;
2044  *dst = kBaseSymbols[b1];
2045  if ( len == 1 ) {
2046  break;
2047  }
2048  dst[1] = kBaseSymbols[b2];
2049  dst += 2;
2050  len -= 2;
2051  }
2052  }
2053  return ret;
2054 }
2055 
2056 
2058 {
2060  str.reserve(len+1);
2061  str.resize(len);
2062  char* dst = str.data();
2063  const char* src = get_read_ptr();
2064  for ( uint32_t len = get_read_len(); len; ) {
2065  char c = *src++;
2066  uint32_t b1 = (c >> 4)&0xf;
2067  uint32_t b2 = (c )&0xf;
2068  *dst = kBaseSymbols[b1];
2069  if ( len == 1 ) {
2070  break;
2071  }
2072  dst[1] = kBaseSymbols[b2];
2073  dst += 2;
2074  len -= 2;
2075  }
2076 }
2077 
2078 
2080 {
2081  // ignore optional starting hard break
2082  // return optional starting soft break
2083  // or 0 if there is no soft break
2084  const char* ptr = get_cigar_ptr();
2085  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2086  uint32_t op = SBamUtil::MakeUint4(ptr);
2087  ptr += 4;
2088  switch ( op & 0xf ) {
2089  case kCIGAR_H:
2090  continue;
2091  case kCIGAR_S:
2092  return op >> 4;
2093  default:
2094  return 0;
2095  }
2096  }
2097  return 0;
2098 }
2099 
2100 
2102 {
2103  // ignore hard and soft breaks, ignore insertions
2104  // only match/mismatch, deletes, and skips remain
2105  uint32_t ret = 0;
2106  const char* ptr = get_cigar_ptr();
2107  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2108  uint32_t op = SBamUtil::MakeUint4(ptr);
2109  ptr += 4;
2110  uint32_t seglen = op >> 4;
2111  switch ( op & 0xf ) {
2112  case kCIGAR_M:
2113  case kCIGAR_eq:
2114  case kCIGAR_X:
2115  case kCIGAR_D:
2116  case kCIGAR_N:
2117  ret += seglen;
2118  break;
2119  default:
2120  break;
2121  }
2122  }
2123  return ret;
2124 }
2125 
2126 
2128 {
2129  // ignore hard and soft breaks, ignore deletions and skips
2130  // only match/mismatch and inserts remain
2131  uint32_t ret = 0;
2132  const char* ptr = get_cigar_ptr();
2133  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2134  uint32_t op = SBamUtil::MakeUint4(ptr);
2135  ptr += 4;
2136  uint32_t seglen = op >> 4;
2137  switch ( op & 0xf ) {
2138  case kCIGAR_M:
2139  case kCIGAR_eq:
2140  case kCIGAR_X:
2141  case kCIGAR_I:
2142  ret += seglen;
2143  break;
2144  default:
2145  break;
2146  }
2147  }
2148  return ret;
2149 }
2150 
2151 
2152 pair< COpenRange<uint32_t>, COpenRange<uint32_t> > SBamAlignInfo::get_cigar_alignment(void) const
2153 {
2154  // ignore hard and soft breaks, ignore deletions and skips
2155  // only match/mismatch and inserts remain
2156  uint32_t ref_pos = get_ref_pos(), ref_size = 0, read_pos = 0, read_size = 0;
2157  bool first = true;
2158  const char* ptr = get_cigar_ptr();
2159  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2160  uint32_t op = SBamUtil::MakeUint4(ptr);
2161  ptr += 4;
2162  uint32_t seglen = op >> 4;
2163  switch ( op & 0xf ) {
2164  case kCIGAR_M:
2165  case kCIGAR_eq:
2166  case kCIGAR_X:
2167  ref_size += seglen;
2168  read_size += seglen;
2169  break;
2170  case kCIGAR_D:
2171  case kCIGAR_N:
2172  ref_size += seglen;
2173  break;
2174  case kCIGAR_I:
2175  read_size += seglen;
2176  break;
2177  case kCIGAR_S:
2178  if ( first ) {
2179  read_pos = seglen;
2180  }
2181  break;
2182  default:
2183  break;
2184  }
2185  first = false;
2186  }
2187  pair< COpenRange<uint32_t>, COpenRange<uint32_t> > ret;
2188  ret.first.SetFrom(ref_pos).SetLength(ref_size);
2189  ret.second.SetFrom(read_pos).SetLength(read_size);
2190  return ret;
2191 }
2192 
2193 
// CIGAR operation letters, indexed by the low 4 bits of a packed CIGAR op
// (the code in this file indexes it with `op & 0xf`); positions with no
// assigned operation hold '?'.
2194 const char SBamAlignInfo::kCIGARSymbols[] = "MIDNSHP=X???????";
// Base letters, indexed by a 4-bit base code (the code in this file indexes
// it with each nibble of a packed read byte).
2195 const char SBamAlignInfo::kBaseSymbols[] = "=ACMGRSVTWYHKDBN";
2196 
2197 
2199 {
2200  // ignore hard and soft breaks
2201  CNcbiOstrstream ret;
2202  const char* ptr = get_cigar_ptr();
2203  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2204  uint32_t op = SBamUtil::MakeUint4(ptr);
2205  ptr += 4;
2206  switch ( op & 0xf ) {
2207  case kCIGAR_H:
2208  case kCIGAR_S:
2209  continue;
2210  default:
2211  break;
2212  }
2213  uint32_t seglen = op >> 4;
2214  ret << kCIGARSymbols[op & 0xf] << seglen;
2215  }
2216  return CNcbiOstrstreamToString(ret);
2217 }
2218 
2219 
2221 {
2222  const char* ptr = get_cigar_ptr();
2223  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2224  uint32_t op = SBamUtil::MakeUint4(ptr);
2225  ptr += 4;
2226  switch ( op & 0xf ) {
2227  case kCIGAR_M:
2228  return true;
2229  default:
2230  break;
2231  }
2232  }
2233  return false;
2234 }
2235 
2236 
// Write the decimal representation of `v` (no leading zeros, no terminator)
// at `dst` and return the pointer just past the last character written.
static inline char* s_format(char* dst, uint32_t v)
{
    // a 32-bit unsigned value has at most 10 decimal digits
    char reversed[10];
    int digit_count = 0;
    // collect digits least-significant first; always emits at least one digit
    do {
        reversed[digit_count++] = char('0' + v % 10);
        v /= 10;
    } while ( v != 0 );
    // copy them out most-significant first
    while ( digit_count > 0 ) {
        *dst++ = reversed[--digit_count];
    }
    return dst;
}
2251 
2252 
2254 {
2255  // it takes at most 10 symbols per op - op char + 9-symbols number up to 2^28
2256  size_t count = get_cigar_ops_count();
2257  str.reserve(count*10+1);
2258  char* dst = str.data();
2259  const char* src = get_cigar_ptr();
2260  for ( ; count--; ) {
2261  uint32_t op = SBamUtil::MakeUint4(src);
2262  src += 4;
2263  switch ( op & 0xf ) {
2264  case kCIGAR_H:
2265  case kCIGAR_S:
2266  continue;
2267  default:
2268  break;
2269  }
2270  uint32_t seglen = op >> 4;
2271  *dst = kCIGARSymbols[op & 0xf];
2272  dst = s_format(dst+1, seglen);
2273  }
2274  str.resize(dst-str.data());
2275 }
2276 
2277 
2279 {
2280  const char* ptr = m_AuxPtr;
2281  const char* end = m_AuxEnd;
2282  if ( ptr == end ) {
2283  // end of tags
2284  m_AuxData = SBamAuxData();
2285  return;
2286  }
2287  ptr += 3; // skip tag name and type
2288  if ( ptr <= end ) {
2289  m_AuxData.m_Tag[0] = ptr[-3];
2290  m_AuxData.m_Tag[1] = ptr[-2];
2291  m_AuxData.m_DataType = ptr[-1];
2292  m_AuxData.m_IsArray = false;
2294  m_AuxData.m_DataPtr = ptr;
2295  switch ( m_AuxData.m_DataType ) {
2296  case 'A':
2297  case 'c':
2298  case 'C':
2299  // 1-byte value
2300  ptr += 1;
2301  if ( ptr <= end ) {
2302  // fits
2303  m_AuxPtr = ptr;
2304  return;
2305  }
2306  // fallback to error
2307  break;
2308  case 's':
2309  case 'S':
2310  // 2-byte value
2311  ptr += 2;
2312  if ( ptr <= end ) {
2313  // fits
2314  m_AuxPtr = ptr;
2315  return;
2316  }
2317  // fallback to error
2318  break;
2319  case 'i':
2320  case 'I':
2321  case 'f':
2322  // 4-byte value
2323  ptr += 4;
2324  if ( ptr <= end ) {
2325  // fits
2326  m_AuxPtr = ptr;
2327  return;
2328  }
2329  // fallback to error
2330  break;
2331  case 'Z':
2332  case 'H':
2333  // zero-terminated string
2334  ptr = static_cast<const char*>(memchr(ptr, 0, end-ptr));
2335  if ( ptr ) {
2336  // found zero termination
2338  m_AuxPtr = ptr + 1; // skip zero-termination too
2339  return;
2340  }
2341  // fallback to error
2342  break;
2343  case 'B':
2344  // array of fixed-size elements
2345  ptr += 5; // skip element type and count
2346  if ( ptr <= end ) {
2347  m_AuxData.m_IsArray = true;
2348  m_AuxData.m_DataType = ptr[-5];
2350  m_AuxData.m_DataPtr = ptr;
2351  size_t element_size;
2352  switch ( m_AuxData.m_DataType ) {
2353  case 'c':
2354  case 'C':
2355  element_size = 1;
2356  break;
2357  case 's':
2358  case 'S':
2359  element_size = 2;
2360  break;
2361  case 'i':
2362  case 'I':
2363  case 'f':
2364  element_size = 4;
2365  break;
2366  default:
2367  element_size = 0;
2368  break;
2369  }
2370  if ( element_size == 0 ) {
2371  // fallback to error
2372  break;
2373  }
2374  ptr += m_AuxData.m_ElementCount*element_size;
2375  if ( ptr <= end ) {
2376  // fits
2377  m_AuxPtr = ptr;
2378  return;
2379  }
2380  }
2381  // fallback to error
2382  break;
2383  default:
2384  // fallback to error
2385  break;
2386  }
2387  }
2388  // bad aux format, cannot continue parsing aux data
2389  ERR_POST("BAM: Alignment aux tag parse error");
2390  m_AuxData = SBamAuxData();
2391  m_AuxPtr = end;
2392 }
2393 
2394 
2396 {
2397  if ( !IsChar() ) {
2398  NCBI_THROW_FMT(CBamException, eOtherError,
2399  "Conversion error: "
2400  "type "<<GetDataType()<<" cannot be converted to char");
2401  }
2402  return m_DataPtr[0];
2403 }
2404 
2405 
2407 {
2408  if ( !IsString() ) {
2409  NCBI_THROW_FMT(CBamException, eOtherError,
2410  "Conversion error: "
2411  "type "<<GetDataType()<<" cannot be converted to string");
2412  }
2413  return CTempString(m_DataPtr, size());
2414 }
2415 
2416 
2417 Int8 SBamAuxData::GetInt(size_t index) const
2418 {
2419  if ( !IsInt() ) {
2420  NCBI_THROW_FMT(CBamException, eOtherError,
2421  "Conversion error: "
2422  "type "<<GetDataType()<<" cannot be converted to int");
2423  }
2424  if ( index >= size() ) {
2425  NCBI_THROW_FMT(CBamException, eInvalidArg,
2426  "Index overflow: "<<index<<" >= "<<size());
2427  return false;
2428  }
2429  switch ( GetDataType() ) {
2430  case 'c': // signed byte
2431  return Int1(m_DataPtr[index]);
2432  case 'C': // unsigned byte
2433  return Uint1(m_DataPtr[index]);
2434  case 's': // signed 2-byte int
2435  return Int2(SBamUtil::MakeUint2(m_DataPtr+2*index));
2436  case 'S': // unsigned 2-byte int
2437  return Uint2(SBamUtil::MakeUint2(m_DataPtr+2*index));
2438  case 'i': // signed 4-byte int
2439  return Int4(SBamUtil::MakeUint4(m_DataPtr+4*index));
2440  case 'I': // unsigned 4-byte int
2441  return Uint4(SBamUtil::MakeUint4(m_DataPtr+4*index));
2442  default:
2443  // couldn't be here because IsInt() == true
2444  return 0;
2445  }
2446 }
2447 
2448 
2449 float SBamAuxData::GetFloat(size_t index) const
2450 {
2451  if ( !IsFloat() ) {
2452  NCBI_THROW_FMT(CBamException, eOtherError,
2453  "Conversion error: "
2454  "type "<<GetDataType()<<" cannot be converted to float");
2455  }
2456  if ( index >= size() ) {
2457  NCBI_THROW_FMT(CBamException, eInvalidArg,
2458  "Index overflow: "<<index<<" >= "<<size());
2459  return false;
2460  }
2461  return SBamUtil::MakeFloat(m_DataPtr+4*index);
2462 }
2463 
2464 
2465 SBamAuxData SBamAlignInfo::get_aux_data(char c1, char c2, bool allow_missing) const
2466 {
2467  for ( CBamAuxIterator iter(get_aux_data_ptr(), get_aux_data_end()); iter; ++iter ) {
2468  if ( iter->IsTag(c1, c2) ) {
2469  return *iter;
2470  }
2471  }
2472  if ( !allow_missing ) {
2473  NCBI_THROW_FMT(CBamException, eNoData,
2474  "Tag "<<c1<<c2<<" not found");
2475  }
2476  return SBamAuxData();
2477 }
2478 
2479 
2481 {
2482  if ( auto data = get_aux_data('R', 'G', true) ) {
2483  return data.GetString();
2484  }
2485  return CTempString();
2486 }
2487 
2488 
2490 {
2491  in.GetNextAvailableBytes(); // update position if it's at the end of block
2492  m_FilePos = in.GetPos();
2493  m_RecordSize = SBamUtil::MakeUint4(in.Read(4));
2494  m_RecordPtr = in.Read(m_RecordSize);
2498 }
2499 
2500 
2501 /////////////////////////////////////////////////////////////////////////////
2502 // CBamRawAlignIterator
2503 /////////////////////////////////////////////////////////////////////////////
2504 
2505 
2507  const string& ref_label,
2508  TSeqPos ref_pos,
2509  TSeqPos window,
2510  ESearchMode search_mode)
2511  : m_Reader(bam_db.GetFile())
2512 {
2513  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2514  if ( window && ref_pos < kInvalidSeqPos-window ) {
2515  ref_range.SetToOpen(ref_pos+window);
2516  }
2517  else {
2518  ref_range.SetToOpen(kInvalidSeqPos);
2519  }
2520  Select(bam_db, ref_label, ref_range, search_mode);
2521 }
2522 
2523 
2525  const string& ref_label,
2526  TSeqPos ref_pos,
2527  TSeqPos window,
2528  TIndexLevel min_index_level,
2529  TIndexLevel max_index_level,
2530  ESearchMode search_mode)
2531  : m_Reader(bam_db.GetFile())
2532 {
2533  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2534  if ( window && ref_pos < kInvalidSeqPos-window ) {
2535  ref_range.SetToOpen(ref_pos+window);
2536  }
2537  else {
2538  ref_range.SetToOpen(kInvalidSeqPos);
2539  }
2540  Select(bam_db, ref_label, ref_range, min_index_level, max_index_level, search_mode);
2541 }
2542 
2543 
2545  const string& ref_label,
2546  TSeqPos ref_pos,
2547  TSeqPos window,
2548  EIndexLevel min_index_level,
2549  EIndexLevel max_index_level,
2550  ESearchMode search_mode)
2551  : m_Reader(bam_db.GetFile())
2552 {
2553  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2554  if ( window && ref_pos < kInvalidSeqPos-window ) {
2555  ref_range.SetToOpen(ref_pos+window);
2556  }
2557  else {
2558  ref_range.SetToOpen(kInvalidSeqPos);
2559  }
2560  Select(bam_db, ref_label, ref_range, min_index_level, max_index_level, search_mode);
2561 }
2562 
2563 
2565 {
2566  m_RefIndex = size_t(-1);
2568  m_Ranges.SetWhole(header);
2570  m_MinIndexLevel = 0;
2571  m_MaxIndexLevel = 0;
2572  if ( x_UpdateRange() ) {
2573  Next();
2574  }
2575 }
2576 
2577 
2579  size_t ref_index,
2580  CRange<TSeqPos> ref_range,
2581  TIndexLevel min_index_level,
2582  TIndexLevel max_index_level,
2583  ESearchMode search_mode)
2584 {
2585  SBamIndexParams::operator=(index);
2586  m_RefIndex = ref_index;
2587  m_QueryRefRange = ref_range;
2588  m_Ranges.SetRanges(index, ref_index, ref_range, min_index_level, max_index_level, search_mode);
2590  m_MinIndexLevel = min_index_level;
2591  m_MaxIndexLevel = max_index_level;
2592  m_SearchMode = search_mode;
2593  if ( x_UpdateRange() ) {
2594  Next();
2595  }
2596 }
2597 
2598 
2600 {
2601  if ( m_NextRange == m_Ranges.end() ) {
2603  return false;
2604  }
2605  else {
2606  m_CurrentRangeEnd = m_NextRange->second;
2607  m_Reader.Seek(m_NextRange->first, m_NextRange->second);
2608  ++m_NextRange;
2609  return true;
2610  }
2611 }
2612 
2613 
2615 {
2616  _ASSERT(*this);
2618  if ( m_RefIndex != size_t(-1) ) {
2619  // check for alignment validity
2620  if ( size_t(m_AlignInfo.get_ref_index()) != m_RefIndex ) {
2621  // wrong reference sequence
2622  return true;
2623  }
2624  if ( !IsMapped() ) {
2625  // unaligned read
2626  return true;
2627  }
2628  if ( GetCIGAROpsCount() == 0 ) {
2629  // empty CIGAR string
2630  return true;
2631  }
2632  }
2633  auto alignment = m_AlignInfo.get_cigar_alignment();
2634  m_AlignRefRange = alignment.first;
2635  m_AlignReadRange = alignment.second;
2636  if ( m_RefIndex == size_t(-1) ) {
2637  // unfiltered alignments
2638  return false;
2639  }
2641  // after search range
2642  x_Stop();
2643  return false;
2644  }
2645  if ( m_SearchMode == eSearchByOverlap ) {
2646  // any overlapping alignment
2648  // before search range
2649  return true;
2650  }
2651  }
2652  else {
2653  // only starting within the range
2655  // before search range
2656  return true;
2657  }
2658  }
2659  if ( m_MinIndexLevel != 0 || m_MaxIndexLevel != GetMaxIndexLevel() ) {
2660  TIndexLevel index_level = GetIndexLevel();
2661  if ( index_level < m_MinIndexLevel || index_level > m_MaxIndexLevel ) {
2662  // this index level is not requested
2663  return true;
2664  }
2665  }
2666  return false;
2667 }
2668 
2669 
2671 {
2672  while ( x_NextAnnot() && x_NeedToSkip() ) {
2673  // continue
2674  }
2675 }
2676 
2677 
2678 void CBamRawAlignIterator::GetSegments(vector<int>& starts, vector<TSeqPos>& lens) const
2679 {
2680  TSeqPos refpos = GetRefSeqPos();
2681  TSeqPos seqpos = 0;
2682 
2683  // ignore hard breaks
2684  // omit soft breaks in the alignment
2685  const char* ptr = m_AlignInfo.get_cigar_ptr();
2686  for ( uint16_t count = m_AlignInfo.get_cigar_ops_count(); count--; ) {
2687  uint32_t op = SBamUtil::MakeUint4(ptr);
2688  ptr += 4;
2689  TSeqPos seglen = op >> 4;
2690  int refstart, seqstart;
2691  switch ( op & 0xf ) {
2693  case SBamAlignInfo::kCIGAR_P: // ?
2694  continue;
2696  seqpos += seglen;
2697  continue;
2701  refstart = refpos;
2702  refpos += seglen;
2703  seqstart = seqpos;
2704  seqpos += seglen;
2705  break;
2707  refstart = kInvalidSeqPos;
2708  seqstart = seqpos;
2709  seqpos += seglen;
2710  break;
2713  refstart = refpos;
2714  refpos += seglen;
2715  seqstart = kInvalidSeqPos;
2716  break;
2717  default:
2718  NCBI_THROW_FMT(CBamException, eBadCIGAR,
2719  "Bad CIGAR segment: " << (op & 0xf) << " in " <<GetCIGAR());
2720  }
2721  if ( seglen == 0 ) {
2722  NCBI_THROW_FMT(CBamException, eBadCIGAR,
2723  "Zero CIGAR segment: in " << GetCIGAR());
2724  }
2725  starts.push_back(refstart);
2726  starts.push_back(seqstart);
2727  lens.push_back(seglen);
2728  }
2729 }
2730 
2731 
static void sx_SetTitle(CSeq_graph &graph, CSeq_annot &annot, string title, string name)
Definition: bamindex.cpp:1403
static char * s_format(char *dst, uint32_t v)
Definition: bamindex.cpp:2237
static const size_t kIndexMagicLength
Definition: bamindex.cpp:67
BEGIN_LOCAL_NAMESPACE
Definition: bamindex.cpp:1210
NCBI_PARAM_DEF_EX(int, BAM, OVERLAP_MODE, 2, eParam_NoThread, BAM_OVERLAP_MODE)
static const char kBamExt[]
Definition: bamindex.cpp:65
static const float kEstimatedCompression
Definition: bamindex.cpp:76
static void s_ReadMagic(CBGZFStream &in, const char *magic)
Definition: bamindex.cpp:126
static CBGZFPos s_ReadFilePos(CNcbiIstream &in)
Definition: bamindex.cpp:164
static int s_GetRangesMode()
Definition: bamindex.cpp:1738
Uint8 s_EstimatedPos(CBGZFPos pos)
Definition: bamindex.cpp:757
static size_t ReadVDBFile(AutoArray< char > &data, const string &path)
Definition: bamindex.cpp:1155
static int32_t s_ReadInt32(CNcbiIstream &in)
Definition: bamindex.cpp:148
static int s_GetOverlapMode()
Definition: bamindex.cpp:493
static CBGZFPos s_GetOverlap(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:437
static void s_AddReplacedExt(vector< string > &dst, const string &base_name, CTempString old_ext, CTempString new_ext)
Definition: bamindex.cpp:1943
END_LOCAL_NAMESPACE
Definition: bamindex.cpp:1250
NCBI_PARAM_DECL(int, BAM, OVERLAP_MODE)
static CBGZFPos s_GetNextFilePos(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:462
static CBGZFPos s_GetFilePos(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:450
static uint32_t s_ReadUInt32(CNcbiIstream &in)
Definition: bamindex.cpp:139
static const char kBaiExt[]
Definition: bamindex.cpp:68
static const size_t kGZipMagicLength
Definition: bamindex.cpp:62
static CBGZFRange s_ReadFileRange(CNcbiIstream &in)
Definition: bamindex.cpp:171
static COpenRange< TSeqPos > s_GetSeqRange(SBamIndexParams params, const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:419
static const char kGZipMagic[]
Definition: bamindex.cpp:63
static const char kIndexMagicBAI[]
Definition: bamindex.cpp:69
static void s_ReadString(CBGZFStream &in, string &ret, size_t len)
Definition: bamindex.cpp:118
ostream & operator<<(ostream &out, const CBamFileRangeSet &ranges)
Definition: bamindex.cpp:1709
static void s_Read(CNcbiIstream &in, char *dst, size_t len)
Definition: bamindex.cpp:79
Uint8 s_EstimatedSize(CBGZFPos file_pos1, CBGZFPos file_pos2)
Definition: bamindex.cpp:764
static uint64_t s_ReadUInt64(CNcbiIstream &in)
Definition: bamindex.cpp:155
pair< CBGZFPos, CBGZFPos > CBGZFRange
Definition: bgzf.hpp:272
void SetPreviousReadStatistics(const pair< Uint8, double > &stats)
Definition: bgzf.hpp:331
pair< Uint8, double > GetReadStatistics() const
Definition: bgzf.hpp:327
pair< Uint8, double > GetUncompressStatistics() const
Definition: bgzf.cpp:402
TByteOffset GetByteOffset() const
Definition: bgzf.hpp:224
TFileBlockPos GetFileBlockPos() const
Definition: bgzf.hpp:220
static CBGZFPos GetInvalid()
Definition: bgzf.hpp:254
bool IsInvalid() const
Definition: bgzf.hpp:258
CBGZFPos GetSeekPos() const
Definition: bgzf.hpp:402
void Seek(CBGZFPos pos, CBGZFPos end_pos=CBGZFPos::GetInvalid())
Definition: bgzf.cpp:462
const char * m_AuxEnd
Definition: bamindex.hpp:1106
SBamAuxData m_AuxData
Definition: bamindex.hpp:1104
const char * m_AuxPtr
Definition: bamindex.hpp:1105
static int GetDebugLevel()
Definition: bamread.cpp:389
@ eFileNotFound
File not found.
virtual TErrCode GetErrCode(void) const
Definition: bamread.cpp:168
void AddWhole(const CBamHeader &header)
Definition: bamindex.cpp:1862
void AddSortedRanges(const vector< CBGZFRange > &ranges)
Definition: bamindex.cpp:1720
const_iterator end() const
Definition: bamindex.hpp:941
TRanges m_Ranges
Definition: bamindex.hpp:953
void SetRanges(const CBamIndex &index, size_t ref_index, COpenRange< TSeqPos > ref_range, ESearchMode search_mode=eSearchByOverlap)
Definition: bamindex.cpp:1890
const_iterator begin() const
Definition: bamindex.hpp:937
void SetWhole(const CBamHeader &header)
Definition: bamindex.cpp:1871
Uint8 GetFileSize() const
Definition: bamindex.cpp:1915
void AddRanges(const CBamIndex &index, size_t ref_index, COpenRange< TSeqPos > ref_range, ESearchMode search_mode=eSearchByOverlap)
Definition: bamindex.cpp:1843
const SBamHeaderRefInfo & GetRef(size_t ref_index) const
Definition: bamindex.cpp:1600
void Read(CBGZFStream &stream)
Definition: bamindex.cpp:1583
TRefs m_Refs
Definition: bamindex.hpp:109
map< string, string > TSBamTags
Definition: bamindex.hpp:72
size_t GetRefCount() const
Definition: bamindex.hpp:84
size_t GetRefIndex(const string &name) const
Definition: bamindex.cpp:1610
size_t GetSBamRecords(TSBamRecords &records) const
Definition: bamindex.cpp:1619
CBGZFPos m_AlignStart
Definition: bamindex.hpp:110
list< TSBamRecord > TSBamRecords
Definition: bamindex.hpp:74
CBGZFPos GetAlignStart() const
Definition: bamindex.hpp:101
string m_Text
Definition: bamindex.hpp:107
pair< string, TSBamTags > TSBamRecord
Definition: bamindex.hpp:73
TSeqPos GetRefLength(size_t index) const
Definition: bamindex.hpp:94
map< string, size_t > m_RefByName
Definition: bamindex.hpp:108
const SBamIndexRefIndex & GetRef(size_t ref_index) const
Definition: bamindex.cpp:1365
Uint8 m_TotalReadBytes
Definition: bamindex.hpp:753
double m_TotalReadSeconds
Definition: bamindex.hpp:754
vector< uint64_t > CollectEstimatedCoverage(size_t ref_index, TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1484
TRefs m_Refs
Definition: bamindex.hpp:751
void Read(const string &index_file_name)
Definition: bamindex.cpp:1192
size_t GetRefCount() const
Definition: bamindex.hpp:461
Uint8 m_UnmappedCount
Definition: bamindex.hpp:752
string m_FileName
Definition: bamindex.hpp:750
pair< Uint8, double > GetReadStatistics() const
Definition: bamindex.hpp:744
CRef< CSeq_annot > MakeEstimatedCoverageAnnot(const CBamHeader &header, const string &ref_name, const string &seq_id, const string &annot_name, TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1418
CBGZFRange GetTotalFileRange(size_t ref_index) const
Definition: bamindex.cpp:1388
void SetLengthFromHeader(const CBamHeader &header)
Definition: bamindex.cpp:1375
Uint2 GetCIGAROpsCount() const
Definition: bamindex.hpp:1498
TSeqPos GetRefSeqPos() const
Definition: bamindex.hpp:1457
void x_Select(const CBamHeader &header)
Definition: bamindex.cpp:2564
SBamAlignInfo m_AlignInfo
Definition: bamindex.hpp:1666
ESearchMode m_SearchMode
Definition: bamindex.hpp:1665
CBamFileRangeSet::const_iterator m_NextRange
Definition: bamindex.hpp:1670
string GetCIGAR() const
Definition: bamindex.hpp:1535
bool IsMapped() const
Definition: bamindex.hpp:1570
void GetSegments(vector< int > &starts, vector< TSeqPos > &lens) const
Definition: bamindex.cpp:2678
TIndexLevel GetIndexLevel() const
Definition: bamindex.hpp:1544
void Select(CBamRawDb &bam_db)
Definition: bamindex.hpp:1367
TIndexLevel m_MaxIndexLevel
Definition: bamindex.hpp:1664
CBamFileRangeSet m_Ranges
Definition: bamindex.hpp:1669
TIndexLevel m_MinIndexLevel
Definition: bamindex.hpp:1664
COpenRange< TSeqPos > m_QueryRefRange
Definition: bamindex.hpp:1663
COpenRange< TSeqPos > m_AlignRefRange
Definition: bamindex.hpp:1667
CBGZFPos m_CurrentRangeEnd
Definition: bamindex.hpp:1671
COpenRange< TSeqPos > m_AlignReadRange
Definition: bamindex.hpp:1668
CBGZFStream m_Reader
Definition: bamindex.hpp:1672
double GetEstimatedSecondsPerByte() const
Definition: bamindex.cpp:2001
CRef< CBGZFFile > m_File
Definition: bamindex.hpp:1023
void Open(const string &bam_path)
Definition: bamindex.cpp:1935
CBamIndex m_Index
Definition: bamindex.hpp:1025
CBamHeader m_Header
Definition: bamindex.hpp:1024
CByte_graph –.
Definition: Byte_graph.hpp:66
CMemoryReader(const char *ptr, size_t size)
Definition: bamindex.cpp:1214
ERW_Result Read(void *buf, size_t count, size_t *bytes_read)
Read as many as "count" bytes into a buffer pointed to by the "buf" argument.
Definition: bamindex.cpp:1220
const char * m_Ptr
Definition: bamindex.cpp:1247
ERW_Result PendingCount(size_t *count)
Via parameter "count" (which is guaranteed to be supplied non-NULL) return the number of bytes that a...
Definition: bamindex.cpp:1240
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
@ fOwnReader
Own the underlying reader.
Definition: rwstreambuf.hpp:66
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
Definition: Seq_entry.hpp:56
CStopWatch –.
Definition: ncbitime.hpp:1938
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
A very basic data-read interface.
void erase(iterator pos)
Definition: map.hpp:167
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator upper_bound(const key_type &key) const
Definition: map.hpp:155
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
char value[7]
Definition: config.c:431
static ulg bb
static const char fp[]
Definition: des.c:87
static const char si[8][64]
Definition: des.c:146
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
std::ofstream out("events_result.xml")
main entry point for tests
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:581
void reset(element_type *p=0)
Reset will delete the old pointer, set content to the new value, and assume the ownership upon the ne...
Definition: ncbimisc.hpp:598
string
Definition: cgiapp.hpp:687
@ fGZip
Set of flags for gzip file support. See each flag description above.
Definition: zlib.hpp:120
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
Definition: ncbiexpt.hpp:719
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
Definition: ncbi_param.hpp:149
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
int8_t Int1
1-byte (8-bit) signed integer
Definition: ncbitype.h:98
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
position_type GetTo(void) const
Definition: range.hpp:142
TThisType & SetToOpen(position_type toOpen)
Definition: range.hpp:175
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFrom(void) const
Definition: range.hpp:134
static TThisType GetEmpty(void)
Definition: range.hpp:306
TThisType & SetLength(position_type length)
Definition: range.hpp:194
bool Empty(void) const
Definition: range.hpp:148
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
ERW_Result
Result codes for I/O operations.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
@ eRW_Eof
End of data, should be considered permanent.
@ eRW_Success
Everything is okay, I/O completed.
static string PrintableString(const CTempString str, TPrintableMode mode=fNewLine_Quote|fNonAscii_Passthru)
Get a printable version of the specified string.
Definition: ncbistr.cpp:3949
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
CTempString & assign(const char *src_str, size_type len)
Assign new values to the content of the a string.
Definition: tempstr.hpp:733
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1942
void SetA(TA value)
Assign a value to A data member.
void SetMin(TMin value)
Assign a value to Min data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
void SetNumval(TNumval value)
Assign a value to Numval data member.
void SetComp(TComp value)
Assign a value to Comp data member.
TValues & SetValues(void)
Assign a value to Values data member.
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
void SetMax(TMax value)
Assign a value to Max data member.
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
void SetAxis(TAxis value)
Assign a value to Axis data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
static CStopWatch sw
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
FILE * file
char * buf
int i
int len
range(_Ty, _Ty) -> range< _Ty >
void timsort(RandomAccessIterator const first, RandomAccessIterator const last)
Same as std::stable_sort(first, last).
Definition: timsort.hpp:650
const struct ncbi::grid::netcache::search::fields::SIZE size
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
@ eRead
Definition: ns_types.hpp:56
static unsigned cnt[256]
static int buffer_size
Definition: pcretest.c:1050
static size_t read_size(CNcbiIstream &stream, const char *name)
Definition: reader_snp.cpp:404
Reader-writer based streams.
static const char * str(char *buf, int n)
Definition: stats.c:84
unsigned short uint16_t
Definition: stdint.h:125
unsigned int uint32_t
Definition: stdint.h:126
signed int int32_t
Definition: stdint.h:123
unsigned __int64 uint64_t
Definition: stdint.h:136
bool operator()(const CBGZFPos p1, const CBGZFRange &p2) const
Definition: bamindex.cpp:282
bool operator()(const CBGZFRange &p1, const CBGZFPos p2) const
Definition: bamindex.cpp:286
bool operator()(const CBGZFPos p1, const SBamIndexBinInfo &p2) const
Definition: bamindex.cpp:270
bool operator()(const SBamIndexBinInfo &p1, const CBGZFPos p2) const
Definition: bamindex.cpp:274
string get_read() const
Definition: bamindex.cpp:2033
const char * get_cigar_ptr() const
Definition: bamindex.hpp:1209
uint32_t get_cigar_read_size() const
Definition: bamindex.cpp:2127
const char * m_ReadPtr
Definition: bamindex.hpp:1278
const char * m_CIGARPtr
Definition: bamindex.hpp:1277
uint32_t get_cigar_pos() const
Definition: bamindex.cpp:2079
CTempString get_short_seq_accession_id() const
Definition: bamindex.cpp:2480
int32_t get_ref_pos() const
Definition: bamindex.hpp:1134
pair< COpenRange< uint32_t >, COpenRange< uint32_t > > get_cigar_alignment(void) const
Definition: bamindex.cpp:2152
uint8_t get_read_name_len() const
Definition: bamindex.hpp:1139
const char * get_read_ptr() const
Definition: bamindex.hpp:1232
const char * get_aux_data_end() const
Definition: bamindex.hpp:1252
bool has_ambiguous_match() const
Definition: bamindex.cpp:2220
uint32_t get_cigar_ref_size() const
Definition: bamindex.cpp:2101
const char * get_read_name_ptr() const
Definition: bamindex.hpp:1201
static const char kBaseSymbols[]
Definition: bamindex.hpp:1261
string get_cigar() const
Definition: bamindex.cpp:2198
const char * m_RecordPtr
Definition: bamindex.hpp:1276
void Read(CBGZFStream &in)
Definition: bamindex.cpp:2489
const char * get_aux_data_ptr() const
Definition: bamindex.hpp:1248
CBGZFPos m_FilePos
Definition: bamindex.hpp:1275
uint16_t get_cigar_ops_count() const
Definition: bamindex.hpp:1163
static const char kCIGARSymbols[]
Definition: bamindex.hpp:1151
Uint4 m_RecordSize
Definition: bamindex.hpp:1279
int32_t get_ref_index() const
Definition: bamindex.hpp:1130
uint32_t get_read_len() const
Definition: bamindex.hpp:1185
SBamAuxData get_aux_data(char c1, char c2, bool allow_missing=false) const
Definition: bamindex.cpp:2465
char GetChar() const
Definition: bamindex.cpp:2395
char m_DataType
Definition: bamindex.hpp:1067
uint32_t m_ElementCount
Definition: bamindex.hpp:1069
bool IsChar() const
Definition: bamindex.hpp:1053
float GetFloat(size_t index=0) const
Definition: bamindex.cpp:2449
bool IsInt() const
Definition: bamindex.hpp:1056
char m_Tag[2]
Definition: bamindex.hpp:1066
char GetDataType() const
Definition: bamindex.hpp:1048
Int8 GetInt(size_t index=0) const
Definition: bamindex.cpp:2417
size_t size() const
Definition: bamindex.hpp:1051
bool IsFloat() const
Definition: bamindex.hpp:1055
bool m_IsArray
Definition: bamindex.hpp:1068
CTempString GetString() const
Definition: bamindex.cpp:2406
const char * m_DataPtr
Definition: bamindex.hpp:1070
bool IsString() const
Definition: bamindex.hpp:1054
void Read(CBGZFStream &in)
Definition: bamindex.cpp:1566
TSeqPos m_Length
Definition: bamindex.hpp:52
CBGZFPos GetEndFilePos() const
Definition: bamindex.hpp:358
CBGZFPos m_Overlap
Definition: bamindex.hpp:350
vector< CBGZFRange > m_Chunks
Definition: bamindex.hpp:352
CBGZFPos GetStartFilePos() const
Definition: bamindex.hpp:354
void Read(CNcbiIstream &in, SBamIndexParams params)
Definition: bamindex.cpp:200
COpenRange< TSeqPos > GetSeqRange(SBamIndexParams params) const
Definition: bamindex.hpp:343
uint32_t TBin
Definition: bamindex.hpp:120
static const TShift kLevelStepBinShift
Definition: bamindex.hpp:137
uint8_t TIndexLevel
Definition: bamindex.hpp:121
static const TShift kBAI_min_shift
Definition: bamindex.hpp:138
static const TIndexLevel kMinBinIndexLevel
Definition: bamindex.hpp:134
static const TIndexLevel kBAI_depth
Definition: bamindex.hpp:139
static const TBin kMaxBinNumber
Definition: bamindex.hpp:133
constexpr TSeqPos GetBinSize(TIndexLevel level) const
Definition: bamindex.hpp:196
constexpr TBin GetPseudoBin() const
Definition: bamindex.hpp:265
TIndexLevel depth
Definition: bamindex.hpp:157
constexpr TBin GetBinNumberBase(int level) const
Definition: bamindex.hpp:239
constexpr TBin GetFirstBin(TIndexLevel level) const
Definition: bamindex.hpp:257
TShift min_shift
Definition: bamindex.hpp:156
pair< TBin, TBin > GetBinRange(COpenRange< TSeqPos > ref_range, TIndexLevel index_level) const
Definition: bamindex.cpp:853
TBin GetBinNumber(TSeqPos pos, TIndexLevel level) const
Definition: bamindex.hpp:285
TBin GetBinNumberOffset(TSeqPos pos, TIndexLevel level) const
Definition: bamindex.hpp:277
bool IsOverflowBin(TBin bin, TIndexLevel level=0) const
Definition: bamindex.hpp:269
constexpr TShift GetMinLevelBinShift() const
Definition: bamindex.hpp:158
constexpr TSeqPos GetMinBinSize() const
Definition: bamindex.hpp:208
constexpr TShift GetLevelBinShift(TIndexLevel level) const
Definition: bamindex.hpp:187
constexpr TIndexLevel GetMaxIndexLevel() const
Definition: bamindex.hpp:162
constexpr TBin GetLastBin(TIndexLevel level) const
Definition: bamindex.hpp:261
vector< TSeqPos > GetAlnOverStarts(void) const
Definition: bamindex.cpp:500
pair< TBinsIter, TBinsIter > GetBinsIterRange(pair< TBin, TBin > bin_range) const
Definition: bamindex.cpp:913
vector< Uint8 > EstimateDataSizeByAlnStartPos(TSeqPos seqlen=kInvalidSeqPos) const
Definition: bamindex.cpp:1057
TSeqPos m_EstimatedLength
Definition: bamindex.hpp:435
CBGZFRange GetFileRange() const
Definition: bamindex.cpp:1033
pair< TBinsIter, TBinsIter > GetLevelBins(TIndexLevel level) const
Definition: bamindex.cpp:256
bool ProcessPseudoBin(SBamIndexBinInfo &bin)
Definition: bamindex.cpp:302
TBins::const_iterator TBinsIter
Definition: bamindex.hpp:411
CBGZFRange GetLimitRange(COpenRange< TSeqPos > &ref_range, ESearchMode search_mode) const
Definition: bamindex.cpp:786
const char * Read(const char *buffer_ptr, const char *buffer_end, SBamIndexParams params, int32_t ref_index)
Definition: bamindex.cpp:381
vector< uint64_t > CollectEstimatedCoverage(TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1109
CBGZFRange m_UnmappedChunk
Definition: bamindex.hpp:430
vector< CBGZFPos > m_Overlaps
Definition: bamindex.hpp:433
void SetLengthFromHeader(TSeqPos length)
Definition: bamindex.cpp:293
pair< TBinsIter, TBinsIter > AddLevelFileRanges(vector< CBGZFRange > &ranges, CBGZFRange limit_file_range, pair< TBin, TBin > bin_range) const
Definition: bamindex.cpp:889
void ProcessBin(const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:316
vector< TSeqPos > GetAlnOverEnds(void) const
Definition: bamindex.cpp:737
static void x_AddDataSize(vector< Uint8 > &vv, size_t beg_pos, size_t end_pos, CBGZFPos file_beg, CBGZFPos file_end)
Definition: bamindex.cpp:927
size_t block_end
Definition: bamindex.cpp:922
size_t fill_beg_to
Definition: bamindex.cpp:923
void Init(size_t index)
Definition: bamindex.cpp:955
size_t block_beg
Definition: bamindex.cpp:922
CBGZFPos file_end
Definition: bamindex.cpp:924
void InitData(vector< Uint8 > &vv, const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:960
CBGZFPos file_beg
Definition: bamindex.cpp:924
size_t fill_end_to
Definition: bamindex.cpp:923
SBamRangeBlock(vector< Uint8 > &vv, const vector< SBamRangeBlock > &bb, size_t bb_beg, size_t bb_end)
Definition: bamindex.cpp:1007
void ExpandData(vector< Uint8 > &vv, const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:974
static Uint8 MakeUint8(const char *buf)
Definition: bgzf.hpp:167
static Uint4 MakeUint4(const char *buf)
Definition: bgzf.hpp:159
static Uint2 MakeUint2(const char *buf)
Definition: bgzf.hpp:153
static float MakeFloat(const char *buf)
Definition: bgzf.hpp:183
#define _ASSERT
ZLib Compression API.
Modified on Thu Dec 07 10:09:18 2023 by modify_doxy.py rev. 669887