NCBI C++ ToolKit
bamindex.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bamindex.cpp 101886 2024-02-28 18:12:53Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description:
29  * Access to BAM index files
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <sra/readers/bam/bamread.hpp> // for CBamException
37 #include <util/compress/zlib.hpp>
38 #include <corelib/rwstream.hpp>
39 #include <util/util_exception.hpp>
40 #include <util/timsort.hpp>
44 
45 #ifndef NCBI_THROW2_FMT
46 # define NCBI_THROW2_FMT(exception_class, err_code, message, extra) \
47  throw NCBI_EXCEPTION2(exception_class, err_code, FORMAT(message), extra)
48 #endif
49 
50 
52 
53 //#define NCBI_USE_ERRCODE_X BAM2Graph
54 //NCBI_DEFINE_ERR_SUBCODE_X(6);
55 
57 
58 class CSeq_entry;
59 
// Magic byte sequence of gzip data and its length in bytes.
60 static const size_t kGZipMagicLength = 2;
61 static const char kGZipMagic[] = "\x1f\x8b";
62 
// File name extension of BAM data files.
63 static const char kBamExt[] = ".bam";
64 
// Index files: magic length, BAI file extension and BAI magic bytes.
65 static const size_t kIndexMagicLength = 4;
66 static const char kBaiExt[] = ".bai";
67 static const char kIndexMagicBAI[] = "BAI\1";
68 #ifdef BAM_SUPPORT_CSI
// CSI index flavor: file extension, magic bytes, and the BAM/PREFER_CSI
// configuration parameter (default false).
69 static const char kCsiExt[] = ".csi";
70 static const char kIndexMagicCSI[] = "CSI\1";
71 NCBI_PARAM_DECL(bool, BAM, PREFER_CSI);
72 NCBI_PARAM_DEF_EX(bool, BAM, PREFER_CSI, false, eParam_NoThread, BAM_PREFER_CSI);
73 #endif
// NOTE(review): presumably the assumed compression ratio used by the size
// estimators; its use site is not visible in this extract - confirm.
74 static const float kEstimatedCompression = 0.25;
75 
76 static inline
77 void s_Read(CNcbiIstream& in, char* dst, size_t len)
78 {
79  while ( len ) {
80  in.read(dst, len);
81  if ( !in ) {
82  NCBI_THROW(CIOException, eRead, "Read failure");
83  }
84  size_t cnt = in.gcount();
85  len -= cnt;
86  dst += cnt;
87  }
88 }
89 
90 
91 static inline
92 const char* s_Read(const char*& buffer_ptr, const char* buffer_end, size_t len)
93 {
94  const char* ret_ptr = buffer_ptr;
95  const char* ret_end = ret_ptr + len;
96  if ( ret_end > buffer_end ) {
97  NCBI_THROW(CIOException, eRead, "BAM index EOF");
98  }
99  buffer_ptr = ret_end;
100  return ret_ptr;
101 }
102 
103 
104 static inline
105 void s_Read(CBGZFStream& in, char* dst, size_t len)
106 {
107  while ( len ) {
108  size_t cnt = in.Read(dst, len);
109  len -= cnt;
110  dst += cnt;
111  }
112 }
113 
114 
115 static inline
116 void s_ReadString(CBGZFStream& in, string& ret, size_t len)
117 {
118  ret.resize(len);
119  s_Read(in, &ret[0], len);
120 }
121 
122 
123 static inline
124 void s_ReadMagic(CBGZFStream& in, const char* magic)
125 {
126  _ASSERT(strlen(magic) == 4);
127  char buf[4];
128  s_Read(in, buf, 4);
129  if ( memcmp(buf, magic, 4) != 0 ) {
130  NCBI_THROW_FMT(CBGZFException, eFormatError,
131  "Bad file magic: "<<NStr::PrintableString(string(buf, buf+4)));
132  }
133 }
134 
135 
// Reads four bytes from `in` with s_Read() and returns the value decoded
// from them by SBamUtil::MakeUint4().
// NOTE(review): the declarator line (listing line 137) is missing from this
// extract; restore it from the repository.
136 static inline
138 {
139  char buf[4];
140  s_Read(in, buf, 4);
141  return SBamUtil::MakeUint4(buf);
142 }
143 
144 
// Returns the result of s_ReadUInt32(in) converted to int32_t.
// NOTE(review): the declarator line (listing line 146) is missing from this
// extract; restore it from the repository.
145 static inline
147 {
148  return int32_t(s_ReadUInt32(in));
149 }
150 
151 
// Reads eight bytes from `in` with s_Read() and returns the value decoded
// from them by SBamUtil::MakeUint8().
// NOTE(review): the declarator line (listing line 153) is missing from this
// extract; restore it from the repository.
152 static inline
154 {
155  char buf[8];
156  s_Read(in, buf, 8);
157  return SBamUtil::MakeUint8(buf);
158 }
159 
160 
// Reads a 64-bit value with s_ReadUInt64() and wraps it into a CBGZFPos.
// NOTE(review): the declarator line (listing line 162) is missing from this
// extract; restore it from the repository.
161 static inline
163 {
164  return CBGZFPos(s_ReadUInt64(in));
165 }
166 
167 
// Reads two consecutive file positions (begin first, then end) and returns
// them as a CBGZFRange.
// NOTE(review): the declarator line (listing line 169) is missing from this
// extract; restore it from the repository.
168 static inline
170 {
171  CBGZFPos beg = s_ReadFilePos(in);
172  CBGZFPos end = s_ReadFilePos(in);
173  return CBGZFRange(beg, end);
174 }
175 
176 
// Reads four bytes from `in` with s_Read() and returns the value decoded
// from them by SBamUtil::MakeUint4().
// NOTE(review): the declarator line (listing line 178) is missing from this
// extract, so the stream type that distinguishes this overload from the
// identical body above cannot be seen here; restore it from the repository.
177 static inline
179 {
180  char buf[4];
181  s_Read(in, buf, 4);
182  return SBamUtil::MakeUint4(buf);
183 }
184 
185 
// Returns the result of s_ReadUInt32(in) converted to int32_t.
// NOTE(review): the declarator line (listing line 187) is missing from this
// extract; restore it from the repository.
186 static inline
188 {
189  return int32_t(s_ReadUInt32(in));
190 }
191 
192 
193 /////////////////////////////////////////////////////////////////////////////
194 // SBamIndexBinInfo
195 /////////////////////////////////////////////////////////////////////////////
196 
197 
199  SBamIndexParams params)
200 {
    // Reads one bin record from `in`: the bin number, (with CSI support
    // compiled in) the overlap position, then a chunk count followed by
    // that many file ranges which replace the contents of m_Chunks.
    // NOTE(review): the first line of the signature (listing line 198) and
    // the body of the CSI branch (listing line 204) are missing from this
    // extract; restore them from the repository.
201  m_Bin = s_ReadUInt32(in);
202 #ifdef BAM_SUPPORT_CSI
203  if ( params.is_CSI ) {
205  }
206  else {
    // non-CSI index: the overlap position is set to a default CBGZFPos
207  m_Overlap = CBGZFPos();
208  }
209 #endif
    // NOTE(review): n_chunks is signed; a negative value read from the file
    // would be converted to a very large size in resize() - confirm whether
    // that needs an explicit check.
210  int32_t n_chunks = s_ReadInt32(in);
211  m_Chunks.resize(n_chunks);
212  for ( int32_t i_chunk = 0; i_chunk < n_chunks; ++i_chunk ) {
213  m_Chunks[i_chunk] = s_ReadFileRange(in);
214  }
215 }
216 
217 
// Parses one bin record from the memory range [ptr, end) and returns the
// position just past the record.  s_Read() throws CIOException if the range
// is too short for the header or for the chunk table.
//   CSI record: 16-byte header, bin number at offset 0, chunk count at 12.
//   BAI record:  8-byte header, bin number at offset 0, chunk count at 4.
// Every chunk takes 16 bytes: two 8-byte values (start, end) that are
// appended to m_Chunks as a CBGZFRange.
// NOTE(review): n_chunks is assigned only inside the BAM_SUPPORT_CSI section,
// so it would be read uninitialized if that macro were not defined.
// NOTE(review): listing line 226 (between the m_Bin and n_chunks assignments
// of the CSI branch) is missing from this extract.
218 const char* SBamIndexBinInfo::Read(const char* ptr, const char* end,
219  SBamIndexParams params)
220 {
221  size_t n_chunks;
222 #ifdef BAM_SUPPORT_CSI
223  if ( params.is_CSI ) {
224  const char* header = s_Read(ptr, end, 16);
225  m_Bin = SBamUtil::MakeUint4(header);
227  n_chunks = SBamUtil::MakeUint4(header+12);
228  }
229  else {
230  const char* header = s_Read(ptr, end, 8);
231  m_Bin = SBamUtil::MakeUint4(header);
232  m_Overlap = CBGZFPos();
233  n_chunks = SBamUtil::MakeUint4(header+4);
234  }
235 #endif
    // the reservation happens before s_Read() verifies that the buffer
    // really holds n_chunks entries
236  m_Chunks.reserve(n_chunks);
237  const char* data = s_Read(ptr, end, n_chunks*16);
238  for ( size_t i = 0; i < n_chunks; ++i ) {
239  Uint8 start = SBamUtil::MakeUint8(data+i*16);
    // this local `end` shadows the function parameter of the same name
240  Uint8 end = SBamUtil::MakeUint8(data+i*16+8);
241  m_Chunks.push_back(CBGZFRange(CBGZFPos(start), CBGZFPos(end)));
242  }
243  return ptr;
244 }
245 
246 
247 /////////////////////////////////////////////////////////////////////////////
248 // SBamIndexRefIndex
249 /////////////////////////////////////////////////////////////////////////////
250 
251 
// Returns the iterator range [first, second) of the entries of m_Bins that
// belong to index level `level`.  The range starts at the first bin not less
// than GetBinNumberBase(level) and ends at m_Bins.end() for level 0, or at
// the first bin not less than GetBinNumberBase(level-1) otherwise.
// The lower_bound() calls rely on m_Bins being sorted.
// NOTE(review): the declarator line (listing line 254) is missing from this
// extract; restore it from the repository.
252 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
253 inline
255 {
256  pair<TBinsIter, TBinsIter> ret;
257  if ( level == 0 ) {
258  ret.second = m_Bins.end();
259  }
260  else {
261  ret.second = lower_bound(m_Bins.begin(), m_Bins.end(), GetBinNumberBase(level-1));
262  }
263  ret.first = lower_bound(m_Bins.begin(), ret.second, GetBinNumberBase(level));
264  return ret;
265 }
266 
    // Comparison functor members: order a file position against the
    // GetStartFilePos() of a bin, in both argument orders, so the functor
    // can be passed to lower_bound()/upper_bound() over bins.
    // NOTE(review): the struct header (listing line 267) is missing from
    // this extract; restore it from the repository.
268  bool operator()(const CBGZFPos p1, const SBamIndexBinInfo& p2) const
269  {
270  return p1 < p2.GetStartFilePos();
271  }
272  bool operator()(const SBamIndexBinInfo& p1, const CBGZFPos p2) const
273  {
274  return p1.GetStartFilePos() < p2;
275  }
276 };
277 
278 
    // Comparison functor members: order a file position against the end
    // (`second`) of a file range, in both argument orders, so the functor
    // can be passed to lower_bound()/upper_bound() over chunk ranges.
    // NOTE(review): the struct header (listing line 279) is missing from
    // this extract; restore it from the repository.
280  bool operator()(const CBGZFPos p1, const CBGZFRange& p2) const
281  {
282  return p1 < p2.second;
283  }
284  bool operator()(const CBGZFRange& p1, const CBGZFPos p2) const
285  {
286  return p1.second < p2;
287  }
288 };
289 
290 
292 {
    // Raises m_EstimatedLength to `length` rounded up to a multiple of
    // GetMinBinSize(); a `length` of kInvalidSeqPos is ignored.
    // The mask arithmetic assumes GetMinBinSize() is a power of two -
    // TODO confirm.
    // NOTE(review): the declarator line (listing line 291) is missing from
    // this extract; restore it from the repository.
293  if ( length != kInvalidSeqPos ) {
294  TSeqPos rounded_length = (length+GetMinBinSize()-1)&~(GetMinBinSize()-1);
295  m_EstimatedLength = max(m_EstimatedLength, rounded_length);
296  }
297 }
298 
299 
301 {
    // Extracts statistics from a pseudo bin, which must hold exactly two
    // chunks (CBamException/eInvalidBAIFormat otherwise):
    //   chunk 0 is stored as m_UnmappedChunk,
    //   chunk 1 carries two counters in its virtual positions:
    //   first -> m_MappedCount, second -> m_UnmappedCount.
    // Both chunks are then removed and the function returns whether the bin
    // has no chunks left.
    // NOTE(review): the declarator line (listing line 300) is missing from
    // this extract; restore it from the repository.
302  if ( bin.m_Chunks.size() != 2 ) {
303  NCBI_THROW(CBamException, eInvalidBAIFormat,
304  "Bad unmapped bin format");
305  }
306  m_UnmappedChunk = bin.m_Chunks[0];
307  m_MappedCount = bin.m_Chunks[1].first.GetVirtualPos();
308  m_UnmappedCount = bin.m_Chunks[1].second.GetVirtualPos();
309  bin.m_Chunks.erase(bin.m_Chunks.begin(), bin.m_Chunks.begin()+2);
310  return bin.m_Chunks.empty();
311 }
312 
313 
315 {
    // Validates the chunk list of a regular bin and throws
    // CBamException/eInvalidBAIFormat when
    //   - the bin has no chunks,
    //   - a chunk is empty or reversed (first >= second),
    //   - a chunk starts at or before the end of the previous chunk.
    // NOTE(review): the declarator line (listing line 314) is missing from
    // this extract; restore it from the repository.
316  if ( bin.m_Chunks.empty() ) {
317  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
318  "No chunks in bin "<<bin.m_Bin);
319  }
320  for ( size_t i = 0; i < bin.m_Chunks.size(); ++i ) {
321  auto& range = bin.m_Chunks[i];
322  if ( range.first >= range.second ) {
323  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
324  "Empty BAM BGZF range in bin "<<bin.m_Bin<<
325  ": "<<range.first<<" - "<<range.second);
326  }
327  if ( i && bin.m_Chunks[i-1].second >= range.first ) {
328  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
329  "Overlapping BAM BGZF ranges in bin "<<bin.m_Bin<<
330  ": "<<bin.m_Chunks[i-1].second<<" over "<<range.first);
331  }
332  }
    // min_end: start of the bin's sequence range, plus one sub-range of the
    // next lower level when the bin is wider than a minimal page, plus one
    // minimal page.
    // NOTE(review): the statement that consumes min_end (listing line 341)
    // is missing from this extract.
333  auto range = bin.GetSeqRange(*this);
334  TSeqPos min_end = range.GetFrom();
335  if ( range.GetLength() != GetMinBinSize() ) {
336  // at least 1 sub-range
337  min_end += range.GetLength() >> kLevelStepBinShift;
338  }
339  // at least 1 minimal page
340  min_end += GetMinBinSize();
342 }
343 
344 
346  SBamIndexParams params,
347  int32_t ref_index)
348 {
    // Loads the index of one reference sequence from a stream: copies the
    // index parameters into the SBamIndexParams base, reads n_bin bin
    // records into m_Bins, validates them with ProcessBin(), and sorts
    // m_Bins.  For a non-CSI index an interval count is read next and
    // m_Overlaps is resized to it.
    // NOTE(review): the first signature line (listing line 345) and listing
    // lines 350, 371, 373 and 375 of the body are missing from this extract.
349  SBamIndexParams::operator=(params);
351  size_t bin_count = 0;
352  int32_t n_bin = s_ReadInt32(in);
353  m_Bins.resize(n_bin);
354  const TBin kPseudoBin = GetPseudoBin();
355  for ( int32_t i_bin = 0; i_bin < n_bin; ++i_bin ) {
356  SBamIndexBinInfo& bin = m_Bins[bin_count++];
357  bin.Read(in, *this);
358  if ( bin.m_Bin == kPseudoBin && ProcessPseudoBin(bin) ) {
    // pseudo bin fully consumed: give its slot back so that the next
    // record overwrites it
359  --bin_count;
360  continue;
361  }
362  ProcessBin(bin);
363  }
    // drop the slots released by consumed pseudo bins
364  m_Bins.resize(bin_count);
365  gfx::timsort(m_Bins.begin(), m_Bins.end());
366 
367  if ( !is_CSI ) {
368  int32_t n_intv = s_ReadInt32(in);
369  m_Overlaps.resize(n_intv);
370  for ( int32_t i = 0; i < n_intv; ++i ) {
372  }
374  }
376 }
377 
378 
// Loads the index of one reference sequence from the memory range
// [buffer_ptr, buffer_end) and returns the position just past the parsed
// data.  The steps mirror the record layout: a 4-byte bin count, the bin
// records, and - for a non-CSI index - a 4-byte interval count followed by
// 8 bytes per interval.  s_Read() throws CIOException when the buffer ends
// too early.
// NOTE(review): listing lines 384, 406, 408 and 410 of the body are missing
// from this extract; restore them from the repository.
379 const char* SBamIndexRefIndex::Read(const char* buffer_ptr, const char* buffer_end,
380  SBamIndexParams params,
381  int32_t ref_index)
382 {
383  SBamIndexParams::operator=(params);
385  size_t bin_count = 0;
386  size_t n_bin = SBamUtil::MakeUint4(s_Read(buffer_ptr, buffer_end, 4));
387  m_Bins.resize(n_bin);
388  const TBin kPseudoBin = GetPseudoBin();
389  for ( size_t i_bin = 0; i_bin < n_bin; ++i_bin ) {
390  SBamIndexBinInfo& bin = m_Bins[bin_count++];
391  buffer_ptr = bin.Read(buffer_ptr, buffer_end, *this);
392  if ( bin.m_Bin == kPseudoBin && ProcessPseudoBin(bin) ) {
    // pseudo bin fully consumed: give its slot back so that the next
    // record overwrites it
393  --bin_count;
394  continue;
395  }
396  ProcessBin(bin);
397  }
    // drop the slots released by consumed pseudo bins
398  m_Bins.resize(bin_count);
399  gfx::timsort(m_Bins.begin(), m_Bins.end());
400 
401  if ( !is_CSI ) {
402  size_t n_intv = SBamUtil::MakeUint4(s_Read(buffer_ptr, buffer_end, 4));
403  m_Overlaps.resize(n_intv);
404  const char* data = s_Read(buffer_ptr, buffer_end, n_intv*8);
405  for ( size_t i = 0; i < n_intv; ++i ) {
407  }
409  }
411  return buffer_ptr;
412 }
413 
414 
// Sequence range of the bin that `iters.first` points at.  Three cases are
// distinguished: exhausted iterator range, a non-CSI bin whose number equals
// params.kMaxBinNumber (special case for a BAI index of a too long
// sequence), and the regular case which returns GetSeqRange(params) of the
// bin.
// NOTE(review): the return type and first parameter (listing lines 416-417)
// and the results of the first two cases (listing lines 421 and 425) are
// missing from this extract; restore them from the repository.
415 static
418  const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
419 {
420  if ( iters.first == iters.second ) {
422  }
423  else if ( !params.is_CSI && iters.first->m_Bin == params.kMaxBinNumber ) {
424  // special case for BAI index of too long sequence
426  }
427  else {
428  return iters.first->GetSeqRange(params);
429  }
430 }
431 
432 
433 static
434 CBGZFPos
435 s_GetOverlap(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
436 {
437  if ( iters.first == iters.second ) {
438  return CBGZFPos::GetInvalid();
439  }
440  else {
441  return iters.first->m_Overlap;
442  }
443 }
444 
445 
446 static
447 CBGZFPos
448 s_GetFilePos(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
449 {
450  auto iter = iters.first;
451  if ( iter == iters.second ) {
452  return CBGZFPos::GetInvalid();
453  }
454  return iter->GetStartFilePos();
455 }
456 
457 
458 static
459 CBGZFPos
460 s_GetNextFilePos(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
461 {
462  auto iter = iters.first;
463  if ( iter == iters.second ) {
464  return CBGZFPos::GetInvalid();
465  }
466  ++iter;
467  if ( iter == iters.second ) {
468  return CBGZFPos::GetInvalid();
469  }
470  return iter->GetStartFilePos();
471 }
472 
473 /*
474 static
475 CBGZFPos
476 s_GetFileEnd(const pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>& iters)
477 {
478  if ( iters.first == iters.second ) {
479  return CBGZFPos::GetInvalid();
480  }
481  else {
482  return iters.first->GetEndFilePos();
483  }
484 }
485 */
486 
487 NCBI_PARAM_DECL(int, BAM, OVERLAP_MODE);
488 NCBI_PARAM_DEF_EX(int, BAM, OVERLAP_MODE, 2, eParam_NoThread, BAM_OVERLAP_MODE);
489 
490 
491 static int s_GetOverlapMode()
492 {
493  static int value = NCBI_PARAM_TYPE(BAM, OVERLAP_MODE)::GetDefault();
494  return value;
495 }
496 
497 
// Returns one sequence position per minimal-size page of the reference
// (presumably the start of the earliest alignment that overlaps the page -
// confirm against the header documentation).  A page without any overlap
// information gets its own start position.
// The algorithm is selected by s_GetOverlapMode():
//   0 - derive the value from bin chunks clipped by GetLimitRange();
//   1 - derive the value from the linear index table m_Overlaps;
//   any other value - combined scan of all index levels that handles both
//       the m_Overlaps table and per-bin m_Overlap values (is_CSI).
// NOTE(review): listing lines 501, 506 and 613 (the definitions of nBins and
// the initialization of ref_range) are missing from this extract.
498 vector<TSeqPos> SBamIndexRefIndex::GetAlnOverStarts() const
499 {
500 if ( s_GetOverlapMode() == 0 ) {
502  vector<TSeqPos> aln_over_starts(nBins);
503  for ( TSeqPos i = 0; i < nBins; ++i ) {
504  // set limits
505  COpenRange<TSeqPos> ref_range;
507  CBGZFRange limit = GetLimitRange(ref_range, eSearchByOverlap);
    // min_fp: smallest clipped chunk start found in the bins that cover
    // page i on any level
508  CBGZFPos min_fp = CBGZFPos::GetInvalid();
509  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
510  TBin bin = GetBinNumberBase(level) + (i>>(level*kLevelStepBinShift));
511  auto it = lower_bound(m_Bins.begin(), m_Bins.end(), bin);
512  if ( it != m_Bins.end() && it->m_Bin == bin ) {
    // `c` is a copy on purpose: it is clipped below
513  for ( auto c : it->m_Chunks ) {
514  if ( c.first >= min_fp ) {
515  break;
516  }
517  if ( c.first >= limit.second ) {
518  break;
519  }
520  if ( c.second <= limit.first ) {
521  continue;
522  }
523  if ( c.first < limit.first ) {
524  c.first = limit.first;
525  }
526  _ASSERT(c.first >= limit.first);
527  _ASSERT(c.first < limit.second);
528  _ASSERT(c.first < c.second);
529  if ( c.first < min_fp ) {
530  min_fp = c.first;
531  }
532  break;
533  }
534  }
535  }
536  TSeqPos min_aln_start;
537  if ( min_fp.IsInvalid() ) {
538  min_aln_start = ref_range.GetFrom();
539  }
540  else {
    // translate the file position back to a sequence position using, on
    // each level, the last bin that starts before min_fp
541  min_aln_start = 0;
542  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
543  auto level_bins = GetLevelBins(level);
544  auto it = lower_bound(level_bins.first, level_bins.second, min_fp, PByStartFilePos());
545  if ( it == level_bins.first ) {
546  continue;
547  }
548  --it;
549  min_aln_start = max(min_aln_start, it->GetSeqRange(*this).GetFrom());
550  if ( it->GetEndFilePos() > min_fp ) {
551  // found exact bin containing the alignment
552  // since we start with the narrowest range there is no point to continue
553  break;
554  }
555  }
556  }
557  aln_over_starts[i] = min_aln_start;
558  }
559  return aln_over_starts;
560 }
561 else if ( s_GetOverlapMode() == 1 ) {
562  size_t nBins = m_Overlaps.size();
563  vector<TSeqPos> aln_over_starts(nBins);
564  // next_bin_it points to a low-level bin that starts after current position
565  auto bin_it_start = GetLevelBins(0).first, next_bin_it = bin_it_start;
566  for ( size_t i = 0; i < nBins; ++i ) {
567  TSeqPos ref_pos = TSeqPos(i * GetMinBinSize());
568  CBGZFPos min_fp = m_Overlaps[i];
569  if ( !min_fp ) {
570  // no overspan
571  aln_over_starts[i] = ref_pos;
572  continue;
573  }
574  // update next_bin_it to point to the next bin after current refseq position
575  while ( next_bin_it != m_Bins.end() && next_bin_it->GetStartFilePos() <= min_fp ) {
576  ++next_bin_it;
577  }
    // the result never decreases from one page to the next
578  TSeqPos min_aln_start = i? aln_over_starts[i-1]: 0;
579  bool inside_min_bin = false;
580  if ( next_bin_it != bin_it_start ) {
581  auto& bin = next_bin_it[-1];
582  _ASSERT(bin.GetStartFilePos() <= min_fp);
583  inside_min_bin = bin.GetEndFilePos() > min_fp;
584  min_aln_start = max(min_aln_start, (bin.m_Bin-GetBinNumberBase(0))*GetMinBinSize());
585  }
586  if ( min_aln_start+GetMinBinSize() < ref_pos && !inside_min_bin ) {
587  // more than 1 page before -> lookup all levels for better estimate
588  for ( TIndexLevel level = 1; level <= GetMaxIndexLevel(); ++level ) {
589  auto level_bins = GetLevelBins(level);
590  auto it = upper_bound(level_bins.first, level_bins.second, min_fp, PByStartFilePos());
591  if ( it == level_bins.first ) {
592  continue;
593  }
594  --it;
595  min_aln_start = max(min_aln_start, it->GetSeqRange(*this).GetFrom());
596  if ( it->GetEndFilePos() > min_fp ) {
597  // found exact bin containing the alignment
598  // since we start with the narrowest range there is no point to continue
599  break;
600  }
601  }
602  }
603  if ( min_aln_start > ref_pos ) {
604  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
605  "Inconsistent linear index at ref pos "<<ref_pos<<
606  ": align starts after end bin start "<<min_aln_start);
607  }
608  aln_over_starts[i] = min_aln_start;
609  }
610  return aln_over_starts;
611 }
612 else {
614  vector<TSeqPos> aln_over_starts(nBins);
    // per-level scan state: remaining bins of the level, the sequence range
    // of the current bin, and (is_CSI only) the overlap of the bin that was
    // current before the last advance
615  vector<pair<TBinsIter, TBinsIter>> levelBins;
616  vector<COpenRange<TSeqPos>> levelBinSeqRange;
617  vector<CBGZFPos> levelPrevOverlap;
618  if ( is_CSI ) {
619  levelPrevOverlap.resize(GetMaxIndexLevel()+1);
620  }
    // minfp: smallest start file position among the first bins of all levels
621  CBGZFPos minfp = CBGZFPos::GetInvalid();
622  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
623  levelBins.push_back(GetLevelBins(level));
624  levelBinSeqRange.push_back(s_GetSeqRange(*this, levelBins.back()));
625  minfp = min(minfp, s_GetFilePos(levelBins.back()));
626  }
627  if ( minfp.IsInvalid() ) {
628  // no file data -> no overlaps
629  return aln_over_starts;
630  }
631  map<TSeqPos, CBGZFPos> sp2minfp; // map seqpos to the earliest filepos it could appear
632  for ( auto& bin : m_Bins ) {
633  auto sp = bin.GetSeqRange(*this).GetFrom();
634  auto fp = bin.GetStartFilePos();
635  auto ins = sp2minfp.insert(make_pair(sp, fp));
636  if ( !ins.second ) {
637  // update with minimum
    // (this reference named minfp shadows the outer variable minfp)
638  auto& minfp = ins.first->second;
639  minfp = min(minfp, fp);
640  }
641  }
642  map<CBGZFPos, TSeqPos> fp2sp; // map filepos to seqpos that certainly appear at or after
643  for ( auto p : sp2minfp ) {
644  auto ins = fp2sp.insert(make_pair(p.second, p.first));
645  if ( ins.second ) {
    // remove later file positions that map to a smaller sequence position
646  auto iter = ins.first;
647  ++iter;
648  while ( iter != fp2sp.end() && iter->second < p.first ) {
649  iter = fp2sp.erase(iter);
650  }
651  }
652  }
653  for ( TSeqPos b = 0; b < nBins; ++b ) {
654  TSeqPos seqPos = b << GetMinLevelBinShift();
655  CBGZFPos overlap_fp = CBGZFPos::GetInvalid();
656  if ( b < m_Overlaps.size() && m_Overlaps[b] ) { // BAI overlap table
657  overlap_fp = m_Overlaps[b];
658  }
659  CBGZFPos prev_overlap_fp; // max overlap of previous bins on all levels
660  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
661  // advance to next bin on level if necessary
662  while ( levelBinSeqRange[level].GetToOpen() <= seqPos ) {
663  if ( is_CSI ) {
664  levelPrevOverlap[level] = s_GetOverlap(levelBins[level]);
665  }
666  ++(levelBins[level].first);
667  levelBinSeqRange[level] = s_GetSeqRange(*this, levelBins[level]);
668  }
669  if ( is_CSI ) {
    // this local overlap_fp shadows the per-page overlap_fp declared above;
    // only prev_overlap_fp leaves this block
670  CBGZFPos overlap_fp;
671  if ( seqPos >= levelBinSeqRange[level].GetFrom() ) {
672  overlap_fp = s_GetOverlap(levelBins[level]);
673  }
674  else {
675  overlap_fp = levelPrevOverlap[level];
676  }
677  prev_overlap_fp = max(prev_overlap_fp, overlap_fp);
678  }
679  }
680  CBGZFPos found_fp = CBGZFPos::GetInvalid(); // earliest filepos of overlapping alignment
681  CBGZFPos limit_fp = CBGZFPos::GetInvalid(); // filepos after this page to break the lookup
682  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
683  // advance to next bin on level if necessary
684  while ( levelBinSeqRange[level].GetToOpen() <= seqPos ) {
685  if ( is_CSI ) {
686  levelPrevOverlap[level] = s_GetOverlap(levelBins[level]);
687  }
688  ++(levelBins[level].first);
689  levelBinSeqRange[level] = s_GetSeqRange(*this, levelBins[level]);
690  }
691  if ( seqPos < levelBinSeqRange[level].GetFrom() ) {
692  // not in the bin yet
693  continue;
694  }
695  if ( is_CSI && overlap_fp.IsInvalid() ) {
696  // CSI overlap info from bin
697  overlap_fp = max(prev_overlap_fp, levelBins[level].first->m_Overlap);
698  }
699  // update limit file pos from next bin on the level
700  limit_fp = min(limit_fp, s_GetNextFilePos(levelBins[level]));
701  // locate overlapping chunk
702  auto& chunks = levelBins[level].first->m_Chunks;
703  auto it = upper_bound(chunks.begin(), chunks.end(), overlap_fp, PByEndFilePos());
704  if ( it != chunks.end() && it->first < min(found_fp, limit_fp) ) {
705  // found suitable chunk
706  found_fp = max(it->first, overlap_fp);
707  if ( found_fp <= overlap_fp ) {
708  // found minimum, no more searching
709  break;
710  }
711  }
712  }
713  if ( found_fp.IsInvalid() ) {
714  aln_over_starts[b] = seqPos;
715  }
716  else {
717  // find minimal seq pos at this file pos
718  auto iter = fp2sp.upper_bound(found_fp);
719  _ASSERT(iter != fp2sp.begin());
720  // it could be after current page
721  auto osp = min(seqPos, prev(iter)->second);
722  if ( b > 0 ) {
723  // overlap may overshoot in case of empty previous bins
724  // that were explicitly marked as having no overlap
725  osp = max(osp, aln_over_starts[b-1]);
726  }
727  aln_over_starts[b] = osp;
728  }
729  }
730  return aln_over_starts;
731 }
732 }
733 
734 
735 vector<TSeqPos> SBamIndexRefIndex::GetAlnOverEnds() const
736 {
737  TSeqPos bin_size = GetMinBinSize();
738  vector<TSeqPos> starts = GetAlnOverStarts();
739  TSeqPos count = TSeqPos(starts.size());
740  vector<TSeqPos> ends(count);
741  TSeqPos si = 0, ei = 0;
742  for ( ; ei < count; ++ei ) {
743  while ( si*bin_size < starts[ei] ) {
744  ends[si++] = ei*bin_size-1;
745  }
746  }
747  while ( si < count ) {
748  ends[si++] = ei*bin_size-1;
749  }
750  return ends;
751 }
752 
753 
754 inline
756 {
758 }
759 
760 
761 inline
762 Uint8 s_EstimatedSize(CBGZFPos file_pos1, CBGZFPos file_pos2)
763 {
764  if ( file_pos1 >= file_pos2 ) {
765  // empty file region
766  return 0;
767  }
768  Uint8 pos1 = s_EstimatedPos(file_pos1);
769  Uint8 pos2 = s_EstimatedPos(file_pos2);
770  if ( pos1 < pos2 )
771  return pos2 - pos1;
772  else
773  return 1; // report non-zero size of non-empty region
774 }
775 
776 
// Convenience overload: estimated size of the region between range.first and
// range.second.
// NOTE(review): the declarator line (listing line 778) is missing from this
// extract; restore it from the repository.
777 inline
779 {
780  return s_EstimatedSize(range.first, range.second);
781 }
782 
783 
785  ESearchMode search_mode) const
786 {
    // Computes the file position limits [limit.first, limit.second) for
    // scanning alignments of ref_range.  ref_range is clipped to
    // m_EstimatedLength first; an empty range yields a default-constructed
    // CBGZFRange.
    // NOTE(review): the first signature line (listing line 784) is missing
    // from this extract; restore it from the repository.
787  CBGZFRange limit;
788  if ( m_EstimatedLength < ref_range.GetToOpen() ) {
789  ref_range.SetToOpen(m_EstimatedLength);
790  }
791  if ( ref_range.Empty() ) {
792  return limit;
793  }
794 
795  if ( search_mode == eSearchByOverlap ) {
796  if ( !m_Overlaps.empty() ) {
797  TBin beg_bin_offset = GetBinNumberOffset(ref_range.GetFrom(), 0);
798  // start limit is from intervals and beg position
799  if ( beg_bin_offset < m_Overlaps.size() ) {
800  limit.first = m_Overlaps[beg_bin_offset];
801  }
802  }
803 #ifdef BAM_SUPPORT_CSI
804  else if ( is_CSI ) {
    // no linear table: take the smallest non-zero m_Overlap of the first
    // existing bin within the range on each level, stopping at the level
    // where the bin containing the range start exists
805  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
806  TBin bin_num = GetBinNumber(ref_range.GetFrom(), level);
807  TBin bin_num_last = GetBinNumber(ref_range.GetTo(), level);
808  auto bins = GetLevelBins(level);
809  auto it = lower_bound(bins.first, bins.second, bin_num);
810  if ( it != bins.second && it->m_Bin <= bin_num_last ) {
811  if ( it->m_Overlap ) {
812  if ( !limit.first || it->m_Overlap < limit.first ) {
813  limit.first = it->m_Overlap;
814  }
815  }
816  if ( it->m_Bin == bin_num ) {
817  break;
818  }
819  }
820  }
821  }
822 #endif
823  }
824  else {
825  // start limit is determined by alignment start position
826  // for each level we'll take end position of previous existing bin
827  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
828  TBin bin_num = GetBinNumber(ref_range.GetFrom(), level);
829  auto bins = GetLevelBins(level);
830  auto it = lower_bound(bins.first, bins.second, bin_num);
831  if ( it != bins.first ) {
832  limit.first = max(limit.first, prev(it)->GetEndFilePos());
833  }
834  }
835  }
    // end limit: smallest start file position among the first bins that lie
    // after the range on each level; stays invalid when there is none
836  limit.second = CBGZFPos::GetInvalid();
837  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
838  // next bin on each level is clearly after the range
839  TBin bin_num = GetBinNumber(ref_range.GetTo(), level)+1;
840  auto bins = GetLevelBins(level);
841  auto it = lower_bound(bins.first, bins.second, bin_num);
842  if ( it != bins.second ) {
843  limit.second = min(limit.second, it->GetStartFilePos());
844  }
845  }
846  return limit;
847 }
848 
849 
// Returns the inclusive pair (first, last) of bin numbers on `index_level`
// that cover ref_range.  Positions beyond the index limit (IsOverflowBin)
// are handled per level:
//   - on the maximal level the bin is kMaxBinNumber;
//   - on intermediate levels an overflowing start yields an empty range
//     (second == first-1) and an overflowing end is replaced by
//     GetLastBin(index_level);
//   - on the minimal level the computed bin numbers are used as is.
// NOTE(review): the signature line with the function name and first parameter
// (listing line 851) is missing from this extract; restore it from the
// repository.
850 pair<SBamIndexRefIndex::TBin, SBamIndexRefIndex::TBin>
852  TIndexLevel index_level) const
853 {
854  pair<TBin, TBin> bin_range;
855  bin_range.first = GetBinNumber(ref_range.GetFrom(), index_level);
856  if ( IsOverflowBin(bin_range.first, index_level) ) {
857  // position is beyond index limit (can happen with BAI index)
858  // only min and max levels exist, and max level is always root bin
859  if ( index_level == GetMaxIndexLevel() ) {
860  bin_range.first = kMaxBinNumber;
861  bin_range.second = kMaxBinNumber;
862  return bin_range;
863  }
864  else if ( index_level != kMinBinIndexLevel ) {
865  // start bin is neither min nor max level - no bins to scan
866  bin_range.second = bin_range.first-1;
867  return bin_range;
868  }
869  }
870  bin_range.second = GetBinNumber(ref_range.GetTo(), index_level);
871  if ( IsOverflowBin(bin_range.second, index_level) ) {
872  // position is beyond index limit (can happen with BAI index)
873  // only min and max levels exist, and max level is always root bin
874  if ( index_level == GetMaxIndexLevel() ) {
875  bin_range.second = kMaxBinNumber;
876  }
877  else if ( index_level != kMinBinIndexLevel ) {
878  // end bin is neither min nor max level - scan to the end of bins of the level
879  bin_range.second = GetLastBin(index_level);
880  }
881  }
882  return bin_range;
883 }
884 
885 
886 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
887 SBamIndexRefIndex::AddLevelFileRanges(vector<CBGZFRange>& ranges,
888  CBGZFRange limit_file_range,
889  pair<TBin, TBin> bin_range) const
890 {
891  TBinsIter first = lower_bound(m_Bins.begin(), m_Bins.end(), bin_range.first);
892  TBinsIter it = first;
893  for ( ; it != m_Bins.end() && it->m_Bin <= bin_range.second; ++it ) {
894  for ( auto c : it->m_Chunks ) {
895  if ( c.first < limit_file_range.first ) {
896  c.first = limit_file_range.first;
897  }
898  if ( limit_file_range.second && limit_file_range.second < c.second ) {
899  c.second = limit_file_range.second;
900  }
901  if ( c.first < c.second ) {
902  ranges.push_back(c);
903  }
904  }
905  }
906  return make_pair(first, it);
907 }
908 
909 
910 pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter>
911 SBamIndexRefIndex::GetBinsIterRange(pair<TBin, TBin> bin_range) const
912 {
913  TBinsIter first = lower_bound(m_Bins.begin(), m_Bins.end(), bin_range.first);
914  TBinsIter it = upper_bound(first, m_Bins.end(), bin_range.second);
915  return make_pair(first, it);
916 }
917 
918 
920  size_t block_beg, block_end; // range of low-level pages
921  size_t fill_beg_to, fill_end_to; // uncertainty about start and end positions
922  CBGZFPos file_beg, file_end; // included BAM file range
923 
    // Distributes the estimated size of the file region
    // [file_beg, file_end) evenly (rounded to nearest) over the pages
    // vv[beg_pos] .. vv[end_pos], both inclusive.  Nothing is added for an
    // empty region.  If the share per page rounds down to zero, pages that
    // are still zero are set to 1 instead.
    // NOTE(review): the second signature line (listing line 926) is missing
    // from this extract; restore it from the repository.
924  static
925  void x_AddDataSize(vector<Uint8>& vv, size_t beg_pos, size_t end_pos,
927  {
928  _ASSERT(beg_pos < vv.size());
929  _ASSERT(beg_pos <= end_pos);
930  _ASSERT(end_pos < vv.size());
931  Uint8 file_size = s_EstimatedSize(file_beg, file_end);
932  if ( !file_size ) {
933  return;
934  }
935  size_t page_count = end_pos - beg_pos + 1;
936  Uint8 add_size = (file_size + page_count/2) / page_count;
937  if ( add_size ) {
938  for ( size_t i = beg_pos; i <= end_pos; ++i ) {
939  vv[i] += add_size;
940  }
941  }
942  else {
943  // rounding produced zero, but the original data size was non-zero,
944  // so make resulting estimated sizes at least non-zero
945  for ( size_t i = beg_pos; i <= end_pos; ++i ) {
946  if ( !vv[i] ) {
947  vv[i] = 1;
948  }
949  }
950  }
951  }
952 
953  void Init(size_t index)
954  {
955  block_beg = block_end = index;
956  }
957 
    // Attaches the file range of a lowest-level bin to this single-page
    // block: file_beg/file_end are taken from the bin's start and end file
    // positions.  A bin without chunks is ignored.  The block must still
    // cover exactly one page and must not have a file range yet.
    // NOTE(review): listing lines 966, 969 and 970 of the body are missing
    // from this extract, so the use of `vv` and `i` cannot be seen here.
958  void InitData(vector<Uint8>& vv, const SBamIndexBinInfo& bin)
959  {
960  if ( bin.m_Chunks.empty() ) {
961  return;
962  }
963  size_t i = block_beg;
964  _ASSERT(block_end == i);
965  _ASSERT(!file_end);
967  file_beg = bin.GetStartFilePos();
968  file_end = bin.GetEndFilePos();
971  }
972  void ExpandData(vector<Uint8>& vv, const SBamIndexBinInfo& bin)
973  {
974  if ( bin.m_Chunks.empty() ) {
975  return;
976  }
977  CBGZFPos new_file_beg = bin.GetStartFilePos();
978  CBGZFPos new_file_end = bin.GetEndFilePos();
979  _ASSERT(new_file_beg < new_file_end);
980  if ( !file_end ) {
981  // start BAM file range
982  x_AddDataSize(vv, block_beg, block_end, new_file_beg, new_file_end);
983  file_beg = new_file_beg;
984  file_end = new_file_end;
985  // pages are completely uncertain
986  fill_beg_to = block_end; // beg/end cross assignment is intentional
987  fill_end_to = block_beg; // beg/end cross assignment is intentional
988  }
989  else {
990  // expand BAM file range
991  if ( new_file_beg < file_beg ) {
992  x_AddDataSize(vv, block_beg, fill_beg_to, new_file_beg, file_beg);
993  file_beg = new_file_beg;
994  }
995  if ( new_file_end > file_end ) {
996  x_AddDataSize(vv, fill_end_to, block_end, file_end, new_file_end);
997  file_end = new_file_end;
998  }
999  }
1000  }
1001 
1003  {
1004  }
1005  SBamRangeBlock(vector<Uint8>& vv,
1006  const vector<SBamRangeBlock>& bb, size_t bb_beg, size_t bb_end)
1007  {
1008  for ( size_t i = bb_beg; i <= bb_end; ++i ) {
1009  const SBamRangeBlock& b = bb[i];
1010  if ( !b.file_end ) {
1011  continue;
1012  }
1013  if ( !file_end ) {
1014  // start BAM file range
1015  *this = b;
1016  }
1017  else {
1018  // include gap
1019  _ASSERT(file_end <= b.file_beg);
1020  x_AddDataSize(vv, fill_end_to, b.fill_beg_to, file_end, b.file_beg);
1021  fill_end_to = b.fill_end_to;
1022  file_end = b.file_end;
1023  }
1024  }
1025  block_beg = bb[bb_beg].block_beg;
1026  block_end = bb[bb_end].block_end;
1027  }
1028 };
1029 
1030 
1032 {
    // Returns the file range spanned by all bins of all index levels:
    // the smallest start file position of the first bin of each level and
    // the largest end file position of the last bin of each level.
    // Without any bins the result is a pair of default CBGZFPos values.
    // NOTE(review): the declarator line (listing line 1031) is missing from
    // this extract; restore it from the repository.
1033  CBGZFRange range;
1034  range.first = CBGZFPos::GetInvalid();
1035  for ( TIndexLevel level = 0; level <= GetMaxIndexLevel(); ++level ) {
1036  auto bins = GetLevelBins(level);
1037  if ( bins.first != bins.second ) {
1038  CBGZFPos pos_beg = bins.first->GetStartFilePos();
1039  CBGZFPos pos_end = prev(bins.second)->GetEndFilePos();
1040  if ( pos_beg < range.first ) {
1041  range.first = pos_beg;
1042  }
1043  if ( pos_end > range.second ) {
1044  range.second = pos_end;
1045  }
1046  }
1047  }
    // the invalid marker was only the start value for the minimum search
1048  if ( range.first.IsInvalid() ) {
1049  range.first = CBGZFPos();
1050  }
1051  return range;
1052 }
1053 
1054 
1056 {
    // Returns one estimated data size per minimal-size page of the sequence.
    // The page count is derived from max(seqlen, m_EstimatedLength), or from
    // m_EstimatedLength alone when seqlen is kInvalidSeqPos.
    // One SBamRangeBlock per page is filled from the lowest-level bins; then,
    // level by level, groups of (1<<kLevelStepBinShift) blocks are merged
    // into one and expanded with the bins of that level.  The sizes are
    // accumulated in vv by the SBamRangeBlock methods.
    // NOTE(review): the declarator line (listing line 1055) is missing from
    // this extract; restore it from the repository.
1057  size_t bin_count;
1058  if ( seqlen == kInvalidSeqPos ) {
1059  seqlen = m_EstimatedLength;
1060  }
1061  else {
1062  seqlen = max(seqlen, m_EstimatedLength);
1063  }
1064  bin_count = (seqlen+GetMinBinSize()-1) >> GetMinLevelBinShift();
1065  _ASSERT(bin_count);
1066  vector<Uint8> vv(bin_count);
1067  // init blocks
1068  vector<SBamRangeBlock> bb(bin_count);
1069  size_t bb_end = bin_count-1;
1070  for ( size_t i = 0; i <= bb_end; ++i ) {
1071  bb[i].Init(i);
1072  }
1073  // fill smallest bins
1074  {
1075  TBin bin_number_base = GetBinNumberBase(0);
1076  auto level_bins = GetLevelBins(0);
1077  for ( auto bin_it = level_bins.first; bin_it != level_bins.second; ++bin_it ) {
1078  size_t i = bin_it->m_Bin - bin_number_base;
1079  _ASSERT(i <= bb_end);
1080  bb[i].InitData(vv, *bin_it);
1081  }
1082  }
1083  for ( TIndexLevel level = 1; level <= GetMaxIndexLevel(); ++level ) {
1084 
1085  // merge
    // the merged block is written to bb[i] while the sources start at
    // i<<kLevelStepBinShift, i.e. at or after the destination index
1086  for ( size_t i = 0; (i<<kLevelStepBinShift) <= bb_end; ++i ) {
1087  size_t src_beg = i<<kLevelStepBinShift;
1088  size_t src_end = min(bb_end, src_beg+(1<<kLevelStepBinShift)-1);
1089  bb[i] = SBamRangeBlock(vv, bb, src_beg, src_end);
1090  }
1091  bb_end >>= kLevelStepBinShift;
1092 
1093  // add next level bins
1094  TBin bin_number_base = GetBinNumberBase(level);
1095  auto level_bins = GetLevelBins(level);
1096  for ( auto bin_it = level_bins.first; bin_it != level_bins.second; ++bin_it ) {
1097  size_t i = bin_it->m_Bin - bin_number_base;
1098  _ASSERT(i <= bb_end);
1099  bb[i].ExpandData(vv, *bin_it);
1100  }
1101  }
1102  _ASSERT(bb_end == 0);
1103  return vv;
1104 }
1105 
1106 
1108  TIndexLevel max_index_level) const
1109 {
1110  vector<uint64_t> vv(((m_EstimatedLength-GetMinBinSize()) >> GetLevelBinShift(min_index_level))+1);
1111  for ( TIndexLevel level = min_index_level; level <= max_index_level; ++level ) {
1112  uint32_t vv_bin_shift = (level-min_index_level)*kLevelStepBinShift;
1113  uint32_t vv_bin_count = 1 << vv_bin_shift;
1114  auto level_bins = GetLevelBins(level);
1115  TBin bin_base = GetBinNumberBase(level);
1116  for ( auto it = level_bins.first; it != level_bins.second; ++it ) {
1117  uint64_t value = 0;
1118  for ( auto& c : it->m_Chunks ) {
1119  value += s_EstimatedSize(c);
1120  }
1121  if ( !value ) {
1122  continue;
1123  }
1124  uint32_t pos = (it->m_Bin - bin_base) << vv_bin_shift;
1125  _ASSERT(pos < vv.size());
1126  uint64_t add = value;
1127  uint32_t cnt = min(vv_bin_count, uint32_t(vv.size()-pos));
1128  if ( cnt > 1 ) {
1129  // distribute
1130  add = (add+cnt/2)/cnt;
1131  }
1132  if ( !add ) {
1133  for ( uint32_t i = 0; i < cnt; ++i ) {
1134  vv[pos+i] = max(uint64_t(1), vv[pos+i]);
1135  }
1136  }
1137  else {
1138  for ( uint32_t i = 0; i < cnt; ++i ) {
1139  vv[pos+i] += add;
1140  }
1141  }
1142  }
1143  }
1144  return vv;
1145 }
1146 
1147 
1148 /////////////////////////////////////////////////////////////////////////////
1149 // CCached
1150 /////////////////////////////////////////////////////////////////////////////
1151 
1152 
1153 static size_t ReadVDBFile(AutoArray<char>& data, const string& path)
1154 {
1155  CBamVDBFile file(path);
1156  size_t fsz = file.GetSize();
1157  data.reset(new char[fsz]);
1158  file.ReadExactly(0, data.get(), fsz);
1159  return fsz;
1160 }
1161 
1162 
1163 /////////////////////////////////////////////////////////////////////////////
1164 // CBamIndex
1165 /////////////////////////////////////////////////////////////////////////////
1166 
1167 
1169  : m_UnmappedCount(0),
1170  m_TotalReadBytes(0),
1171  m_TotalReadSeconds(0)
1172 {
1173 }
1174 
1175 
1176 CBamIndex::CBamIndex(const string& index_file_name)
1177  : m_UnmappedCount(0),
1178  m_TotalReadBytes(0),
1179  m_TotalReadSeconds(0)
1180 {
1181  Read(index_file_name);
1182 }
1183 
1184 
1186 {
1187 }
1188 
1189 
1190 void CBamIndex::Read(const string& index_file_name)
1191 {
1192  m_FileName = index_file_name;
1193  m_Refs.clear();
1194  m_UnmappedCount = 0;
1195 
1198  size_t size = ReadVDBFile(data, index_file_name);
1201  if ( CBamDb::GetDebugLevel() >= 3 ) {
1202  LOG_POST("BAM: read index "<<size/double(1<<20)<<" MB"
1203  " speed: "<<size/(m_TotalReadSeconds*(1<<20))<<" MB/s");
1204  }
1205  Read(data.get(), size);
1206 }
1207 
1209 class CMemoryReader : public IReader
1210 {
1211 public:
1212  CMemoryReader(const char* ptr, size_t size)
1213  : m_Ptr(ptr),
1214  m_Size(size)
1215  {
1216  }
1217 
1219  size_t count,
1220  size_t* bytes_read)
1221  {
1222  if ( !m_Size ) {
1223  if ( bytes_read ) {
1224  *bytes_read = 0;
1225  }
1226  return eRW_Eof;
1227  }
1228  count = min(m_Size, count);
1229  memcpy(buf, m_Ptr, count);
1230  m_Ptr += count;
1231  m_Size -= count;
1232  if ( bytes_read ) {
1233  *bytes_read = count;
1234  }
1235  return eRW_Success;
1236  }
1237 
1239  {
1240  *count = m_Size;
1241  return eRW_Success;
1242  }
1243 
1244 private:
1245  const char* m_Ptr;
1246  size_t m_Size;
1247 };
1249 
1251 {
1252 #ifdef BAM_SUPPORT_CSI
1253  is_CSI = false;
1255  depth = kBAI_depth;
1256 #endif
1257 
1258  char magic[kIndexMagicLength];
1259  s_Read(in, magic, kIndexMagicLength);
1260  if ( memcmp(magic, kIndexMagicBAI, kIndexMagicLength) == 0 ) {
1261  // BAI, no extra parameters
1262  }
1263 #ifdef BAM_SUPPORT_CSI
1264  else if ( memcmp(magic, kIndexMagicCSI, kIndexMagicLength) == 0 ) {
1265  // CSI
1266  is_CSI = true;
1268  depth = s_ReadUInt32(in);
1269  size_t l_aux = s_ReadUInt32(in);
1270  while ( l_aux ) {
1271  char buf[256];
1272  size_t count = min(l_aux, sizeof(buf));
1273  s_Read(in, buf, count);
1274  l_aux -= count;
1275  }
1276  }
1277 #endif
1278  else {
1279  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1280  "Bad file magic: "<<NStr::PrintableString(string(magic, magic+kIndexMagicLength)));
1281  }
1282  int32_t n_ref = s_ReadInt32(in);
1283  m_Refs.resize(n_ref);
1284  for ( int32_t i_ref = 0; i_ref < n_ref; ++i_ref ) {
1285  m_Refs[i_ref].Read(in, *this, i_ref);
1286  }
1287  streampos extra_pos = in.tellg();
1288  in.seekg(0, ios::end);
1289  streampos end_pos = in.tellg();
1290  in.seekg(extra_pos);
1291 
1292  if ( end_pos-extra_pos >= 8 ) {
1294  extra_pos += 8;
1295  }
1296  if ( end_pos != extra_pos ) {
1297  ERR_POST(Warning<<
1298  "Extra "<<(end_pos-extra_pos)<<" bytes in BAM index");
1299  }
1300 }
1301 
1302 
1303 void CBamIndex::Read(const char* buffer_ptr, size_t buffer_size)
1304 {
1305  if ( buffer_size >= kGZipMagicLength &&
1306  memcmp(buffer_ptr, kGZipMagic, kGZipMagicLength) == 0 ) {
1307  // gzipped index
1308  unique_ptr<CNcbiIstream> data_stream =
1309  make_unique<CRStream>(new CMemoryReader(buffer_ptr, buffer_size),
1310  0, nullptr, CRWStreambuf::fOwnReader);
1311  unique_ptr<CNcbiIstream> z_stream =
1312  make_unique<CCompressionIStream>(*data_stream,
1315  Read(*z_stream);
1316  return;
1317  }
1318 
1319  const char* buffer_end = buffer_ptr + buffer_size;
1320 
1321 #ifdef BAM_SUPPORT_CSI
1322  is_CSI = false;
1324  depth = kBAI_depth;
1325 #endif
1326 
1327  const char* magic = s_Read(buffer_ptr, buffer_end, kIndexMagicLength);
1328  if ( memcmp(magic, kIndexMagicBAI, kIndexMagicLength) == 0 ) {
1329  // BAI
1330  }
1331 #ifdef BAM_SUPPORT_CSI
1332  else if ( memcmp(magic, kIndexMagicCSI, kIndexMagicLength) == 0 ) {
1333  // CSI
1334  is_CSI = true;
1335  const char* header = s_Read(buffer_ptr, buffer_end, 12);
1336  min_shift = SBamUtil::MakeUint4(header);
1337  depth = SBamUtil::MakeUint4(header+4);
1338  auto l_aux = SBamUtil::MakeUint4(header+8);
1339  s_Read(buffer_ptr, buffer_end, l_aux);
1340  }
1341 #endif
1342  else {
1343  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1344  "Bad file magic: "<<NStr::PrintableString(string(magic, magic+kIndexMagicLength)));
1345  }
1346  const char* header = s_Read(buffer_ptr, buffer_end, 4);
1347  uint32_t n_ref = SBamUtil::MakeUint4(header);
1348  m_Refs.resize(n_ref);
1349  for ( uint32_t i = 0; i < n_ref; ++i ) {
1350  buffer_ptr = m_Refs[i].Read(buffer_ptr, buffer_end, *this, i);
1351  }
1352  if ( buffer_end - buffer_ptr >= 8 ) {
1353  m_UnmappedCount = SBamUtil::MakeUint8(buffer_ptr);
1354  buffer_ptr += 8;
1355  }
1356  if ( buffer_ptr != buffer_end ) {
1357  ERR_POST(Warning<<
1358  "Extra "<<(buffer_end-buffer_ptr)<<" bytes in BAM index");
1359  }
1360 }
1361 
1362 
1363 const SBamIndexRefIndex& CBamIndex::GetRef(size_t ref_index) const
1364 {
1365  if ( ref_index >= GetRefCount() ) {
1366  NCBI_THROW(CBamException, eInvalidArg,
1367  "Bad reference sequence index");
1368  }
1369  return m_Refs[ref_index];
1370 }
1371 
1372 
1374 {
1375  if ( GetRefCount() != header.GetRefCount() ) {
1376  NCBI_THROW_FMT(CBamException, eInvalidBAIFormat,
1377  "Wrong index ref count: "<<
1378  GetRefCount()<<" <> "<<header.GetRefCount());
1379  }
1380  for ( size_t i = 0; i < GetRefCount(); ++i ) {
1381  m_Refs[i].SetLengthFromHeader(header.GetRef(i).m_Length);
1382  }
1383 }
1384 
1385 
1387 {
1388  CBGZFRange total_range(CBGZFPos(-1), CBGZFPos(0));
1389  for ( auto& b : GetRef(ref_index).m_Bins ) {
1390  CBGZFPos start_pos = b.GetStartFilePos();
1391  if ( start_pos < total_range.first )
1392  total_range.first = start_pos;
1393  CBGZFPos end_pos = b.GetEndFilePos();
1394  if ( total_range.second < end_pos )
1395  total_range.second = end_pos;
1396  }
1397  return total_range;
1398 }
1399 
1400 
1401 static void sx_SetTitle(CSeq_graph& graph, CSeq_annot& annot,
1402  string title, string name)
1403 {
1404  if ( name.empty() ) {
1405  name = "BAM coverage";
1406  }
1407  if ( title.empty() ) {
1408  title = name;
1409  }
1410  graph.SetTitle(title);
1411  annot.SetNameDesc(name);
1412 }
1413 
1414 
1417  const string& ref_name,
1418  const string& seq_id,
1419  const string& annot_name,
1420  TIndexLevel min_index_level,
1421  TIndexLevel max_index_level) const
1422 {
1423  CSeq_id id(seq_id);
1424  return MakeEstimatedCoverageAnnot(header, ref_name, id, annot_name, min_index_level, max_index_level);
1425 }
1426 
1427 
1430  const string& ref_name,
1431  const CSeq_id& seq_id,
1432  const string& annot_name,
1433  TIndexLevel min_index_level,
1434  TIndexLevel max_index_level) const
1435 {
1436  size_t ref_index = header.GetRefIndex(ref_name);
1437  if ( ref_index == size_t(-1) ) {
1438  NCBI_THROW_FMT(CBamException, eInvalidArg,
1439  "Cannot find RefSeq: "<<ref_name);
1440  }
1441  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name,
1442  header.GetRefLength(ref_index), min_index_level, max_index_level);
1443 }
1444 
1445 
1448  const string& seq_id,
1449  const string& annot_name,
1450  TSeqPos length,
1451  TIndexLevel min_index_level,
1452  TIndexLevel max_index_level) const
1453 {
1454  CSeq_id id(seq_id);
1455  return MakeEstimatedCoverageAnnot(ref_index, id, annot_name, length, min_index_level, max_index_level);
1456 }
1457 
1458 
1461  const string& seq_id,
1462  const string& annot_name,
1463  TIndexLevel min_index_level,
1464  TIndexLevel max_index_level) const
1465 {
1466  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name, kInvalidSeqPos, min_index_level, max_index_level);
1467 }
1468 
1469 
1472  const CSeq_id& seq_id,
1473  const string& annot_name,
1474  TIndexLevel min_index_level,
1475  TIndexLevel max_index_level) const
1476 {
1477  return MakeEstimatedCoverageAnnot(ref_index, seq_id, annot_name, kInvalidSeqPos, min_index_level, max_index_level);
1478 }
1479 
1480 
1481 vector<uint64_t>
1483  TIndexLevel min_index_level,
1484  TIndexLevel max_index_level) const
1485 {
1486  return GetRef(ref_index).CollectEstimatedCoverage(min_index_level, max_index_level);
1487 }
1488 
1489 
1492  const CSeq_id& seq_id,
1493  const string& annot_name,
1494  TSeqPos length,
1495  TIndexLevel min_index_level,
1496  TIndexLevel max_index_level) const
1497 {
1498  TSeqPos bin_size = GetBinSize(min_index_level);
1499  vector<uint64_t> vv = CollectEstimatedCoverage(ref_index, min_index_level, max_index_level);
1500  if ( vv.empty() ) vv.push_back(0);
1501  uint32_t count = uint32_t(vv.size());
1502  if ( length == 0 || length == kInvalidSeqPos ) {
1503  length = count*bin_size;
1504  }
1505 
1506  CRef<CSeq_annot> annot(new CSeq_annot);
1507  CRef<CSeq_graph> graph(new CSeq_graph);
1508  annot->SetData().SetGraph().push_back(graph);
1509  sx_SetTitle(*graph, *annot, annot_name, annot_name);
1510 
1511  graph->SetLoc().SetInt().SetId().Assign(seq_id);
1512  graph->SetLoc().SetInt().SetFrom(0);
1513  graph->SetLoc().SetInt().SetTo(length-1);
1514  graph->SetComp(bin_size);
1515  graph->SetNumval(count);
1516  CByte_graph& bgraph = graph->SetGraph().SetByte();
1517  CByte_graph::TValues& bvalues = bgraph.SetValues();
1518  bvalues.resize(count);
1519  Uint1 bmax = 0;
1520  uint64_t max_value = *max_element(vv.begin(), vv.end());
1521  double mul = min(1., 255./max_value);
1522  for ( size_t i = 0; i < count; ++i ) {
1523  if ( auto v = vv[i] ) {
1524  Uint1 b = Uint1(v*mul+.5);
1525  // ensure non-zero value be still non-zero after scaling
1526  if ( !b ) {
1527  b = 1;
1528  }
1529  bvalues[i] = b;
1530  bmax = max(bmax, b);
1531  }
1532  }
1533  bgraph.SetAxis(0);
1534  bgraph.SetMin(1);
1535  bgraph.SetMax(bmax);
1536  if ( mul != 1 ) {
1537  graph->SetA(1/mul);
1538  }
1539  return annot;
1540 }
1541 
1542 
1543 /////////////////////////////////////////////////////////////////////////////
1544 // CBamHeader
1545 /////////////////////////////////////////////////////////////////////////////
1546 
1547 
1549 {
1550 }
1551 
1552 
1553 CBamHeader::CBamHeader(const string& bam_file_name)
1554 {
1555  Read(bam_file_name);
1556 }
1557 
1558 
1560 {
1561 }
1562 
1563 
1565 {
1566  int32_t l_name = s_ReadInt32(in);
1567  s_ReadString(in, m_Name, l_name);
1568  m_Name.resize(l_name-1);
1569  m_Length = s_ReadInt32(in);
1570 }
1571 
1572 
1573 void CBamHeader::Read(const string& bam_file_name)
1574 {
1575  CBGZFFile file(bam_file_name);
1576  CBGZFStream file_stream(file);
1577  Read(file_stream);
1578 }
1579 
1580 
1582 {
1583  m_RefByName.clear();
1584  m_Refs.clear();
1585  s_ReadMagic(stream, "BAM\1");
1586  int32_t l_text = s_ReadInt32(stream);
1587  s_ReadString(stream, m_Text, l_text);
1588  int32_t n_ref = s_ReadInt32(stream);
1589  m_Refs.resize(n_ref);
1590  for ( int32_t i_ref = 0; i_ref < n_ref; ++i_ref ) {
1591  m_Refs[i_ref].Read(stream);
1592  m_RefByName[m_Refs[i_ref].m_Name] = i_ref;
1593  }
1594  m_AlignStart = stream.GetSeekPos();
1595 }
1596 
1597 
1598 const SBamHeaderRefInfo& CBamHeader::GetRef(size_t ref_index) const
1599 {
1600  if ( ref_index >= GetRefCount() ) {
1601  NCBI_THROW(CBamException, eInvalidArg,
1602  "Bad reference sequence index");
1603  }
1604  return m_Refs[ref_index];
1605 }
1606 
1607 
1608 size_t CBamHeader::GetRefIndex(const string& name) const
1609 {
1610  auto iter = m_RefByName.find(name);
1611  if ( iter == m_RefByName.end() ) {
1612  return size_t(-1);
1613  }
1614  return iter->second;
1615 }
1616 
1618 {
1619  CTempString record;
1620  enum { eNone, eTag, eRecord, eValue} state = eNone;
1621  bool state_changed = true;
1622  const char *p, *p0, *pend;
1623 
1624  for (p = m_Text.data(), pend = p + m_Text.size(); p < pend; ++p) {
1625  if (state_changed) {
1626  state_changed = false;
1627  for (; p < pend && iswspace(*p); ++p)
1628  ;
1629  p0 = p;
1630  }
1631  if (*p == '@') {
1632  state = eTag;
1633  p0 = p;
1634  }
1635  else if (*p == ':') {
1636  if (state == eRecord) {
1637  record.assign(p0, p-p0);
1638  state = eValue;
1639  state_changed = true;
1640  p0 = p;
1641  }
1642  }
1643  else if ( iswspace(*p) ) {
1644  if (state == eTag) {
1645  records.push_back( TSBamRecord(string(p0, p-p0), TSBamTags()));
1646  state = eRecord;
1647  state_changed = true;
1648  }
1649  else if (state == eValue) {
1650  records.back().second[record] = string(p0, p-p0);
1651  state = eRecord;
1652  state_changed = true;
1653  }
1654  }
1655  }
1656  if (state == eValue) {
1657  records.back().second[record] = string(p0, p-p0);
1658  }
1659  return records.size();
1660 }
1661 
1662 
1663 /////////////////////////////////////////////////////////////////////////////
1664 // CBamFileRangeSet
1665 /////////////////////////////////////////////////////////////////////////////
1666 
1667 
1669 {
1670 }
1671 
1672 
1674  size_t ref_index,
1675  COpenRange<TSeqPos> ref_range,
1676  ESearchMode search_mode)
1677 {
1678  AddRanges(index, ref_index, ref_range, search_mode);
1679 }
1680 
1681 
1683  size_t ref_index,
1684  COpenRange<TSeqPos> ref_range,
1685  TIndexLevel min_level, TIndexLevel max_level,
1686  ESearchMode search_mode)
1687 {
1688  AddRanges(index, ref_index, ref_range, min_level, max_level, search_mode);
1689 }
1690 
1691 
1693  size_t ref_index,
1694  COpenRange<TSeqPos> ref_range,
1695  EIndexLevel min_level, EIndexLevel max_level,
1696  ESearchMode search_mode)
1697 {
1698  AddRanges(index, ref_index, ref_range, min_level, max_level, search_mode);
1699 }
1700 
1701 
1703 {
1704 }
1705 
1706 
1707 ostream& operator<<(ostream& out, const CBamFileRangeSet& ranges)
1708 {
1709  cout << '(';
1710  for ( auto& r : ranges ) {
1711  cout << " (" << r.first<<" "<<r.second<<")";
1712  }
1713  return cout << " )";
1714 }
1715 
1716 
1718 {
1719  m_Ranges.clear();
1720 }
1721 
1722 
1723 inline
1724 void CBamFileRangeSet::AddSortedRanges(const vector<CBGZFRange>& ranges,
1725  const CBGZFPos* file_pos)
1726 {
1727  CBGZFPos min_pos = file_pos? *file_pos: CBGZFPos();
1728  for ( auto iter = ranges.begin(); iter != ranges.end(); ) {
1729  CBGZFPos start = iter->first, end = iter->second;
1730  for ( ++iter; iter != ranges.end() && !(end < iter->first); ++iter ) {
1731  if ( end < iter->second ) {
1732  end = iter->second;
1733  }
1734  }
1735  if ( start < min_pos ) {
1736  // the range starts before the requested starting file position
1737  if ( end <= min_pos ) {
1738  // the range is fully before the starting file position - skip it
1739  continue;
1740  }
1741  // otherwise trim it
1742  start = min_pos;
1743  }
1744  m_Ranges += CBGZFRange(start, end);
1745  }
1746 }
1747 
1748 
1749 NCBI_PARAM_DECL(int, BAM, RANGES_MODE);
1750 NCBI_PARAM_DEF_EX(int, BAM, RANGES_MODE, 1, eParam_NoThread, BAM_RANGES_MODE);
1751 
1752 
1753 static int s_GetRangesMode()
1754 {
1755  static int value = NCBI_PARAM_TYPE(BAM, RANGES_MODE)::GetDefault();
1756  return value;
1757 }
1758 
1759 
1761  size_t ref_index,
1762  COpenRange<TSeqPos> ref_range,
1763  TIndexLevel min_index_level,
1764  TIndexLevel max_index_level,
1765  ESearchMode search_mode,
1766  const CBGZFPos* file_pos)
1767 {
1768  vector<CBGZFRange> ranges;
1769  const SBamIndexRefIndex& ref = index.GetRef(ref_index);
1770  if ( s_GetRangesMode() == 0 ) {
1771  // set limits
1772  CBGZFRange limit = ref.GetLimitRange(ref_range, search_mode);
1773  if ( ref_range.Empty() ) {
1774  return;
1775  }
1776  for ( TIndexLevel level = min_index_level; level <= index.GetMaxIndexLevel(); ++level ) {
1777  ref.AddLevelFileRanges(ranges, limit, index.GetBinRange(ref_range, level));
1778  }
1779  }
1780  else {
1782  // iterate index levels starting with 0 to set limits correctly
1783  // iterate index levels till the end because alignments may be moved up
1784  TSeqPos set_limit_by_overlap_at = 0;
1785  for ( TIndexLevel level = 0; level <= index.GetMaxIndexLevel(); ++level ) {
1786  // omit ranges from lower index levels because they contain only low-level alignments
1787  auto bin_range = index.GetBinRange(ref_range, level);
1788  pair<SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter> iter_range;
1789  if ( level >= min_index_level ) {
1790  iter_range = ref.AddLevelFileRanges(ranges, limit, bin_range);
1791  _ASSERT(iter_range == ref.GetBinsIterRange(bin_range));
1792  }
1793  else {
1794  iter_range = ref.GetBinsIterRange(bin_range);
1795  }
1796  // set file range limit from overlap fields
1797  // the most limiting overlap is on the lowest existing index level, so set it once
1798  // this limit is valid for both search modes
1799  if ( index.is_CSI ) {
1800  // CSI overlaps are in bins
1801  auto first_bin = iter_range.first;
1802  if ( (first_bin == ref.m_Bins.end() ||
1803  first_bin->m_Bin != bin_range.first) &&
1804  first_bin != ref.m_Bins.begin() ) {
1805  --first_bin;
1806  }
1807  if ( first_bin != ref.m_Bins.end() &&
1808  first_bin->m_Bin <= bin_range.first &&
1809  first_bin->m_Bin >= index.GetFirstBin(level) ) {
1810  // the bin is at or before the first one and at the same level
1811  TSeqPos pos = first_bin->GetSeqRange(index).GetFrom();
1812  if ( pos > set_limit_by_overlap_at ) {
1813  // better limit
1814  set_limit_by_overlap_at = pos;
1815  limit.first = max(limit.first, first_bin->m_Overlap);
1816  }
1817  }
1818  }
1819  else {
1820  // BAI overlaps are in a separate array, reflecting min level
1821  if ( level == kMinBinIndexLevel && !ref.m_Overlaps.empty() ) {
1822  size_t bin_index = bin_range.first-index.GetFirstBin(kMinBinIndexLevel);
1823  if ( bin_index < ref.m_Overlaps.size() ) {
1824  limit.first = max(limit.first, ref.m_Overlaps[bin_index]);
1825  }
1826  }
1827  }
1828  // in eSearchByStart mode we can set lower limit of file positions
1829  // from the end of previous bin on the same level
1830  // these limits are combined
1831  if ( search_mode == eSearchByStart ) {
1832  // set file range limit from previous bins
1833  // limits from all levels matter, choose the most limiting one
1834  auto first_bin = iter_range.first;
1835  if ( first_bin != ref.m_Bins.begin() ) {
1836  auto prev_bin = prev(first_bin);
1837  _ASSERT(prev_bin->m_Bin < bin_range.first);
1838  if ( prev_bin->m_Bin >= index.GetFirstBin(level) ) {
1839  // prev bin is on the same level
1840  limit.first = max(limit.first, prev_bin->GetEndFilePos());
1841  }
1842  }
1843  }
1844  // in all search modes we can limit end of search range by next bin on the same level
1845  // update cutoff file pos from the first next bin
1846  auto next_bin = iter_range.second;
1847  if ( next_bin != ref.m_Bins.end() &&
1848  next_bin->m_Bin < index.GetFirstBin(level-1) ) {
1849  // next bin is on the same level
1850  limit.second = min(limit.second, next_bin->GetStartFilePos());
1851  }
1852  }
1853  }
1854  gfx::timsort(ranges.begin(), ranges.end());
1855  AddSortedRanges(ranges, file_pos);
1856 }
1857 
1858 
1860  size_t ref_index,
1861  COpenRange<TSeqPos> ref_range,
1862  ESearchMode search_mode,
1863  const CBGZFPos* file_pos)
1864 {
1865  AddRanges(index, ref_index, ref_range, 0, index.GetMaxIndexLevel(), search_mode, file_pos);
1866 }
1867 
1868 
1870  size_t ref_index,
1871  COpenRange<TSeqPos> ref_range,
1872  TIndexLevel index_level,
1873  ESearchMode search_mode,
1874  const CBGZFPos* file_pos)
1875 {
1876  AddRanges(index, ref_index, ref_range, index_level, index_level, search_mode, file_pos);
1877 }
1878 
1879 
1881 {
1882  CBGZFRange whole;
1883  whole.first = file_pos;
1884  whole.second = CBGZFPos::GetInvalid();
1885  m_Ranges += whole;
1886 }
1887 
1888 
1890 {
1891  AddFrom(header.GetAlignStart());
1892 }
1893 
1894 
1895 void CBamFileRangeSet::AddFrom(const CBamHeader& header, const CBGZFPos* file_pos)
1896 {
1897  if ( file_pos && *file_pos ) {
1898  AddFrom(*file_pos);
1899  }
1900  else {
1901  AddWhole(header);
1902  }
1903 }
1904 
1905 
1907  size_t ref_index,
1908  COpenRange<TSeqPos> ref_range,
1909  TIndexLevel min_index_level,
1910  TIndexLevel max_index_level,
1911  ESearchMode search_mode,
1912  const CBGZFPos* file_pos)
1913 {
1914  Clear();
1915  AddRanges(index, ref_index, ref_range, min_index_level, max_index_level, search_mode, file_pos);
1916 }
1917 
1918 
1920  size_t ref_index,
1921  COpenRange<TSeqPos> ref_range,
1922  ESearchMode search_mode,
1923  const CBGZFPos* file_pos)
1924 {
1925  SetRanges(index, ref_index, ref_range, 0, index.GetMaxIndexLevel(), search_mode, file_pos);
1926 }
1927 
1928 
1930  size_t ref_index,
1931  COpenRange<TSeqPos> ref_range,
1932  TIndexLevel index_level,
1933  ESearchMode search_mode,
1934  const CBGZFPos* file_pos)
1935 {
1936  SetRanges(index, ref_index, ref_range, index_level, index_level, search_mode, file_pos);
1937 }
1938 
1939 
1941 {
1942  return s_EstimatedSize(range);
1943 }
1944 
1945 
1947 {
1948  Uint8 size = 0;
1949  for ( auto& c : m_Ranges ) {
1950  size += GetFileSize(c);
1951  }
1952  return size;
1953 }
1954 
1955 
1956 /////////////////////////////////////////////////////////////////////////////
1957 // CBamRawDb
1958 /////////////////////////////////////////////////////////////////////////////
1959 
1960 
1962 {
1963 }
1964 
1965 
1966 void CBamRawDb::Open(const string& bam_path)
1967 {
1968  m_File = new CBGZFFile(bam_path);
1969  CBGZFStream stream(*m_File);
1970  m_Header.Read(stream);
1971 }
1972 
1973 
1974 static void s_AddReplacedExt(vector<string>& dst,
1975  const string& base_name,
1976  CTempString old_ext,
1977  CTempString new_ext)
1978 {
1979  if ( NStr::EndsWith(base_name, old_ext) ) {
1980  dst.push_back(base_name.substr(0, base_name.size()-old_ext.size())+new_ext);
1981  }
1982 }
1983 
1984 
1985 void CBamRawDb::Open(const string& bam_path, const string& index_path)
1986 {
1987  vector<string> index_name_candidates;
1988  if ( index_path.empty() || index_path == bam_path ) {
1989 #ifdef BAM_SUPPORT_CSI
1990  bool prefer_csi = NCBI_PARAM_TYPE(BAM, PREFER_CSI)::GetDefault();
1991  if ( prefer_csi ) {
1992  index_name_candidates.push_back(bam_path+kCsiExt);
1993  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kCsiExt);
1994  }
1995 #endif
1996  index_name_candidates.push_back(bam_path+kBaiExt);
1997  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kBaiExt);
1998 #ifdef BAM_SUPPORT_CSI
1999  if ( !prefer_csi ) {
2000  index_name_candidates.push_back(bam_path+kCsiExt);
2001  s_AddReplacedExt(index_name_candidates, bam_path, kBamExt, kCsiExt);
2002  }
2003 #endif
2004  }
2005  else {
2006  index_name_candidates.push_back(index_path);
2007  }
2008  for ( size_t i = 0; i < index_name_candidates.size(); ++i ) {
2009  try {
2010  m_Index.Read(index_name_candidates[i]);
2011  break;
2012  }
2013  catch ( CBamException& exc ) {
2014  if ( i < index_name_candidates.size()-1 &&
2016  // try next index file name candidate
2017  continue;
2018  }
2019  else {
2020  throw;
2021  }
2022  }
2023  }
2024  m_File = new CBGZFFile(bam_path);
2026  CBGZFStream stream(*m_File);
2027  m_Header.Read(stream);
2029 }
2030 
2031 
2033 {
2034  // adjustments
2035  const double index_read_weight = 10;
2036  const Uint8 add_read_bytes = 100000; // 100KB
2037  const double add_read_bytes_per_second = 80e6; // 80 MBps
2038  const Uint8 add_unzip_bytes = 100000; // 100KB
2039  const double add_unzip_bytes_per_second = 80e6; // 80 MBps
2040 
2041  pair<Uint8, double> index_read_stat = m_Index.GetReadStatistics();
2042  pair<Uint8, double> data_read_stat = m_File->GetReadStatistics();
2043  pair<Uint8, double> data_unzip_stat = m_File->GetUncompressStatistics();
2044  Uint8 read_bytes =
2045  Uint8(index_read_stat.first*index_read_weight) +
2046  data_read_stat.first +
2047  add_read_bytes;
2048  double read_seconds =
2049  index_read_stat.second*index_read_weight +
2050  data_read_stat.second +
2051  add_read_bytes/add_read_bytes_per_second;
2052 
2053  Uint8 unzip_bytes = data_unzip_stat.first + add_unzip_bytes;
2054  double unzip_seconds = data_unzip_stat.second + add_unzip_bytes/add_unzip_bytes_per_second;
2055 
2056  return read_seconds/read_bytes + unzip_seconds/unzip_bytes;
2057 }
2058 
2059 
2060 /////////////////////////////////////////////////////////////////////////////
2061 // SBamAlignInfo
2062 /////////////////////////////////////////////////////////////////////////////
2063 
2065 {
2066  string ret;
2067  if ( uint32_t len = get_read_len() ) {
2068  ret.resize(len);
2069  char* dst = &ret[0];
2070  const char* src = get_read_ptr();
2071  for ( uint32_t len = get_read_len(); len; ) {
2072  char c = *src++;
2073  uint32_t b1 = (c >> 4)&0xf;
2074  uint32_t b2 = (c )&0xf;
2075  *dst = kBaseSymbols[b1];
2076  if ( len == 1 ) {
2077  break;
2078  }
2079  dst[1] = kBaseSymbols[b2];
2080  dst += 2;
2081  len -= 2;
2082  }
2083  }
2084  return ret;
2085 }
2086 
2087 
2089 {
2091  str.reserve(len+1);
2092  str.resize(len);
2093  char* dst = str.data();
2094  const char* src = get_read_ptr();
2095  for ( uint32_t len = get_read_len(); len; ) {
2096  char c = *src++;
2097  uint32_t b1 = (c >> 4)&0xf;
2098  uint32_t b2 = (c )&0xf;
2099  *dst = kBaseSymbols[b1];
2100  if ( len == 1 ) {
2101  break;
2102  }
2103  dst[1] = kBaseSymbols[b2];
2104  dst += 2;
2105  len -= 2;
2106  }
2107 }
2108 
2109 
2111 {
2112  // ignore optional starting hard break
2113  // return optional starting soft break
2114  // or 0 if there is no soft break
2115  const char* ptr = get_cigar_ptr();
2116  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2117  uint32_t op = SBamUtil::MakeUint4(ptr);
2118  ptr += 4;
2119  switch ( op & 0xf ) {
2120  case kCIGAR_H:
2121  continue;
2122  case kCIGAR_S:
2123  return op >> 4;
2124  default:
2125  return 0;
2126  }
2127  }
2128  return 0;
2129 }
2130 
2131 
2133 {
2134  // ignore hard and soft breaks, ignore insertions
2135  // only match/mismatch, deletes, and skips remain
2136  uint32_t ret = 0;
2137  const char* ptr = get_cigar_ptr();
2138  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2139  uint32_t op = SBamUtil::MakeUint4(ptr);
2140  ptr += 4;
2141  uint32_t seglen = op >> 4;
2142  switch ( op & 0xf ) {
2143  case kCIGAR_M:
2144  case kCIGAR_eq:
2145  case kCIGAR_X:
2146  case kCIGAR_D:
2147  case kCIGAR_N:
2148  ret += seglen;
2149  break;
2150  default:
2151  break;
2152  }
2153  }
2154  return ret;
2155 }
2156 
2157 
2159 {
2160  // ignore hard and soft breaks, ignore deletions and skips
2161  // only match/mismatch and inserts remain
2162  uint32_t ret = 0;
2163  const char* ptr = get_cigar_ptr();
2164  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2165  uint32_t op = SBamUtil::MakeUint4(ptr);
2166  ptr += 4;
2167  uint32_t seglen = op >> 4;
2168  switch ( op & 0xf ) {
2169  case kCIGAR_M:
2170  case kCIGAR_eq:
2171  case kCIGAR_X:
2172  case kCIGAR_I:
2173  ret += seglen;
2174  break;
2175  default:
2176  break;
2177  }
2178  }
2179  return ret;
2180 }
2181 
2182 
2183 pair< COpenRange<uint32_t>, COpenRange<uint32_t> > SBamAlignInfo::get_cigar_alignment(void) const
2184 {
2185  // ignore hard and soft breaks, ignore deletions and skips
2186  // only match/mismatch and inserts remain
2187  uint32_t ref_pos = get_ref_pos(), ref_size = 0, read_pos = 0, read_size = 0;
2188  bool first = true;
2189  const char* ptr = get_cigar_ptr();
2190  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2191  uint32_t op = SBamUtil::MakeUint4(ptr);
2192  ptr += 4;
2193  uint32_t seglen = op >> 4;
2194  switch ( op & 0xf ) {
2195  case kCIGAR_M:
2196  case kCIGAR_eq:
2197  case kCIGAR_X:
2198  ref_size += seglen;
2199  read_size += seglen;
2200  break;
2201  case kCIGAR_D:
2202  case kCIGAR_N:
2203  ref_size += seglen;
2204  break;
2205  case kCIGAR_I:
2206  read_size += seglen;
2207  break;
2208  case kCIGAR_S:
2209  if ( first ) {
2210  read_pos = seglen;
2211  }
2212  break;
2213  default:
2214  break;
2215  }
2216  first = false;
2217  }
2218  pair< COpenRange<uint32_t>, COpenRange<uint32_t> > ret;
2219  ret.first.SetFrom(ref_pos).SetLength(ref_size);
2220  ret.second.SetFrom(read_pos).SetLength(read_size);
2221  return ret;
2222 }
2223 
2224 
2225 const char SBamAlignInfo::kCIGARSymbols[] = "MIDNSHP=X???????";
2226 const char SBamAlignInfo::kBaseSymbols[] = "=ACMGRSVTWYHKDBN";
2227 
2228 
2230 {
2231  // ignore hard and soft breaks
2232  CNcbiOstrstream ret;
2233  const char* ptr = get_cigar_ptr();
2234  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2235  uint32_t op = SBamUtil::MakeUint4(ptr);
2236  ptr += 4;
2237  switch ( op & 0xf ) {
2238  case kCIGAR_H:
2239  case kCIGAR_S:
2240  continue;
2241  default:
2242  break;
2243  }
2244  uint32_t seglen = op >> 4;
2245  ret << kCIGARSymbols[op & 0xf] << seglen;
2246  }
2247  return CNcbiOstrstreamToString(ret);
2248 }
2249 
2250 
2252 {
2253  const char* ptr = get_cigar_ptr();
2254  for ( uint16_t count = get_cigar_ops_count(); count--; ) {
2255  uint32_t op = SBamUtil::MakeUint4(ptr);
2256  ptr += 4;
2257  switch ( op & 0xf ) {
2258  case kCIGAR_M:
2259  return true;
2260  default:
2261  break;
2262  }
2263  }
2264  return false;
2265 }
2266 
2267 
/// Write the decimal representation of v (no leading zeros, no terminator).
///
/// @param dst  destination buffer; must have room for up to 10 characters
/// @param v    value to format
/// @return pointer just past the last character written
static inline char* s_format(char* dst, uint32_t v)
{
    // a 32-bit unsigned value has at most 10 decimal digits
    char digits[10];
    int digit_count = 0;
    // collect digits starting from the least significant one
    do {
        digits[digit_count++] = char('0' + v % 10);
        v /= 10;
    } while ( v != 0 );
    // emit them in the natural, most-significant-first order
    while ( digit_count > 0 ) {
        *dst++ = digits[--digit_count];
    }
    return dst;
}
2282 
2283 
2285 {
2286  // it takes at most 10 symbols per op - op char + 9-symbols number up to 2^28
2287  size_t count = get_cigar_ops_count();
2288  str.reserve(count*10+1);
2289  char* dst = str.data();
2290  const char* src = get_cigar_ptr();
2291  for ( ; count--; ) {
2292  uint32_t op = SBamUtil::MakeUint4(src);
2293  src += 4;
2294  switch ( op & 0xf ) {
2295  case kCIGAR_H:
2296  case kCIGAR_S:
2297  continue;
2298  default:
2299  break;
2300  }
2301  uint32_t seglen = op >> 4;
2302  *dst = kCIGARSymbols[op & 0xf];
2303  dst = s_format(dst+1, seglen);
2304  }
2305  str.resize(dst-str.data());
2306 }
2307 
2308 
2310 {
      // Parses the aux tag that starts at m_AuxPtr into m_AuxData and moves
      // m_AuxPtr past it. When m_AuxPtr is already at m_AuxEnd, m_AuxData is
      // reset to a default-constructed value. When the tag is malformed or
      // does not fit before m_AuxEnd, an error is posted, m_AuxData is reset
      // and m_AuxPtr is set to m_AuxEnd.
      // NOTE(review): ptr is advanced first and compared with end afterwards;
      // for truncated data this forms a pointer beyond the buffer before the
      // check. Comparing the needed byte count with (end - ptr) would avoid it.
2311  const char* ptr = m_AuxPtr;
2312  const char* end = m_AuxEnd;
2313  if ( ptr == end ) {
2314  // end of tags
2315  m_AuxData = SBamAuxData();
2316  return;
2317  }
2318  ptr += 3; // skip tag name and type
2319  if ( ptr <= end ) {
      // header layout: 2 bytes of tag name, then 1 byte of value type
2320  m_AuxData.m_Tag[0] = ptr[-3];
2321  m_AuxData.m_Tag[1] = ptr[-2];
2322  m_AuxData.m_DataType = ptr[-1];
2323  m_AuxData.m_IsArray = false;
2325  m_AuxData.m_DataPtr = ptr;
2326  switch ( m_AuxData.m_DataType ) {
2327  case 'A':
2328  case 'c':
2329  case 'C':
2330  // 1-byte value
2331  ptr += 1;
2332  if ( ptr <= end ) {
2333  // fits
2334  m_AuxPtr = ptr;
2335  return;
2336  }
2337  // fallback to error
2338  break;
2339  case 's':
2340  case 'S':
2341  // 2-byte value
2342  ptr += 2;
2343  if ( ptr <= end ) {
2344  // fits
2345  m_AuxPtr = ptr;
2346  return;
2347  }
2348  // fallback to error
2349  break;
2350  case 'i':
2351  case 'I':
2352  case 'f':
2353  // 4-byte value
2354  ptr += 4;
2355  if ( ptr <= end ) {
2356  // fits
2357  m_AuxPtr = ptr;
2358  return;
2359  }
2360  // fallback to error
2361  break;
2362  case 'Z':
2363  case 'H':
2364  // zero-terminated string
      // the terminator is searched only within the remaining aux bytes
2365  ptr = static_cast<const char*>(memchr(ptr, 0, end-ptr));
2366  if ( ptr ) {
2367  // found zero termination
2369  m_AuxPtr = ptr + 1; // skip zero-termination too
2370  return;
2371  }
2372  // fallback to error
2373  break;
2374  case 'B':
2375  // array of fixed-size elements
2376  ptr += 5; // skip element type and count
2377  if ( ptr <= end ) {
      // for arrays m_DataType is replaced by the element type and
      // m_DataPtr is moved to the first element
2378  m_AuxData.m_IsArray = true;
2379  m_AuxData.m_DataType = ptr[-5];
2381  m_AuxData.m_DataPtr = ptr;
2382  size_t element_size;
2383  switch ( m_AuxData.m_DataType ) {
2384  case 'c':
2385  case 'C':
2386  element_size = 1;
2387  break;
2388  case 's':
2389  case 'S':
2390  element_size = 2;
2391  break;
2392  case 'i':
2393  case 'I':
2394  case 'f':
2395  element_size = 4;
2396  break;
2397  default:
      // 0 marks an unsupported element type
2398  element_size = 0;
2399  break;
2400  }
2401  if ( element_size == 0 ) {
2402  // fallback to error
2403  break;
2404  }
2405  ptr += m_AuxData.m_ElementCount*element_size;
2406  if ( ptr <= end ) {
2407  // fits
2408  m_AuxPtr = ptr;
2409  return;
2410  }
2411  }
2412  // fallback to error
2413  break;
2414  default:
2415  // fallback to error
2416  break;
2417  }
2418  }
2419  // bad aux format, cannot continue parsing aux data
2420  ERR_POST("BAM: Alignment aux tag parse error");
2421  m_AuxData = SBamAuxData();
2422  m_AuxPtr = end;
2423 }
2424 
2425 
2427 {
      // Returns the first byte of the value data.
      // Throws CBamException (eOtherError) when IsChar() is false.
2428  if ( !IsChar() ) {
2429  NCBI_THROW_FMT(CBamException, eOtherError,
2430  "Conversion error: "
2431  "type "<<GetDataType()<<" cannot be converted to char");
2432  }
2433  return m_DataPtr[0];
2434 }
2435 
2436 
2438 {
      // Returns a CTempString over size() bytes starting at m_DataPtr; the
      // bytes are referenced, not copied.
      // Throws CBamException (eOtherError) when IsString() is false.
2439  if ( !IsString() ) {
2440  NCBI_THROW_FMT(CBamException, eOtherError,
2441  "Conversion error: "
2442  "type "<<GetDataType()<<" cannot be converted to string");
2443  }
2444  return CTempString(m_DataPtr, size());
2445 }
2446 
2447 
2448 Int8 SBamAuxData::GetInt(size_t index) const
2449 {
2450  if ( !IsInt() ) {
2451  NCBI_THROW_FMT(CBamException, eOtherError,
2452  "Conversion error: "
2453  "type "<<GetDataType()<<" cannot be converted to int");
2454  }
2455  if ( index >= size() ) {
2456  NCBI_THROW_FMT(CBamException, eInvalidArg,
2457  "Index overflow: "<<index<<" >= "<<size());
2458  return false;
2459  }
2460  switch ( GetDataType() ) {
2461  case 'c': // signed byte
2462  return Int1(m_DataPtr[index]);
2463  case 'C': // unsigned byte
2464  return Uint1(m_DataPtr[index]);
2465  case 's': // signed 2-byte int
2466  return Int2(SBamUtil::MakeUint2(m_DataPtr+2*index));
2467  case 'S': // unsigned 2-byte int
2468  return Uint2(SBamUtil::MakeUint2(m_DataPtr+2*index));
2469  case 'i': // signed 4-byte int
2470  return Int4(SBamUtil::MakeUint4(m_DataPtr+4*index));
2471  case 'I': // unsigned 4-byte int
2472  return Uint4(SBamUtil::MakeUint4(m_DataPtr+4*index));
2473  default:
2474  // couldn't be here because IsInt() == true
2475  return 0;
2476  }
2477 }
2478 
2479 
2480 float SBamAuxData::GetFloat(size_t index) const
2481 {
2482  if ( !IsFloat() ) {
2483  NCBI_THROW_FMT(CBamException, eOtherError,
2484  "Conversion error: "
2485  "type "<<GetDataType()<<" cannot be converted to float");
2486  }
2487  if ( index >= size() ) {
2488  NCBI_THROW_FMT(CBamException, eInvalidArg,
2489  "Index overflow: "<<index<<" >= "<<size());
2490  return false;
2491  }
2492  return SBamUtil::MakeFloat(m_DataPtr+4*index);
2493 }
2494 
2495 
2496 SBamAuxData SBamAlignInfo::get_aux_data(char c1, char c2, bool allow_missing) const
2497 {
2498  for ( CBamAuxIterator iter(get_aux_data_ptr(), get_aux_data_end()); iter; ++iter ) {
2499  if ( iter->IsTag(c1, c2) ) {
2500  return *iter;
2501  }
2502  }
2503  if ( !allow_missing ) {
2504  NCBI_THROW_FMT(CBamException, eNoData,
2505  "Tag "<<c1<<c2<<" not found");
2506  }
2507  return SBamAuxData();
2508 }
2509 
2510 
2512 {
      // Returns the string value of the "RG" aux tag, or an empty
      // CTempString when the record has no such tag (the lookup is made
      // with allow_missing = true, so a missing tag does not throw).
2513  if ( auto data = get_aux_data('R', 'G', true) ) {
2514  return data.GetString();
2515  }
2516  return CTempString();
2517 }
2518 
2519 
2521 {
      // Reads one record from the stream: remembers the stream position of
      // the record, reads its 4-byte size, then obtains a pointer to that
      // many record bytes.
2522  in.GetNextAvailableBytes(); // update position if it's at the end of block
2523  m_FilePos = in.GetPos();
2524  m_RecordSize = SBamUtil::MakeUint4(in.Read(4));
2525  m_RecordPtr = in.Read(m_RecordSize);
2529 }
2530 
2531 
2532 /////////////////////////////////////////////////////////////////////////////
2533 // CBamRawAlignIterator
2534 /////////////////////////////////////////////////////////////////////////////
2535 
2536 
2538  const string& ref_label,
2539  TSeqPos ref_pos,
2540  TSeqPos window,
2541  ESearchMode search_mode,
2542  const CBGZFPos* file_pos)
2543  : m_Reader(bam_db.GetFile())
2544 {
      // Query range starts at ref_pos. Its open end is ref_pos+window when
      // window is non-zero and ref_pos < kInvalidSeqPos-window; otherwise the
      // open end is kInvalidSeqPos.
2545  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2546  if ( window && ref_pos < kInvalidSeqPos-window ) {
2547  ref_range.SetToOpen(ref_pos+window);
2548  }
2549  else {
2550  ref_range.SetToOpen(kInvalidSeqPos);
2551  }
      // the actual selection is delegated to Select()
2552  Select(bam_db, ref_label, ref_range, search_mode, file_pos);
2553 }
2554 
2555 
2557  const string& ref_label,
2558  TSeqPos ref_pos,
2559  TSeqPos window,
2560  TIndexLevel min_index_level,
2561  TIndexLevel max_index_level,
2562  ESearchMode search_mode,
2563  const CBGZFPos* file_pos)
2564  : m_Reader(bam_db.GetFile())
2565 {
      // Query range starts at ref_pos. Its open end is ref_pos+window when
      // window is non-zero and ref_pos < kInvalidSeqPos-window; otherwise the
      // open end is kInvalidSeqPos.
2566  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2567  if ( window && ref_pos < kInvalidSeqPos-window ) {
2568  ref_range.SetToOpen(ref_pos+window);
2569  }
2570  else {
2571  ref_range.SetToOpen(kInvalidSeqPos);
2572  }
      // the actual selection, including the index level limits, is delegated
      // to Select()
2573  Select(bam_db, ref_label, ref_range, min_index_level, max_index_level, search_mode, file_pos);
2574 }
2575 
2576 
2578  const string& ref_label,
2579  TSeqPos ref_pos,
2580  TSeqPos window,
2581  EIndexLevel min_index_level,
2582  EIndexLevel max_index_level,
2583  ESearchMode search_mode,
2584  const CBGZFPos* file_pos)
2585  : m_Reader(bam_db.GetFile())
2586 {
      // Query range starts at ref_pos. Its open end is ref_pos+window when
      // window is non-zero and ref_pos < kInvalidSeqPos-window; otherwise the
      // open end is kInvalidSeqPos.
2587  CRange<TSeqPos> ref_range(ref_pos, ref_pos);
2588  if ( window && ref_pos < kInvalidSeqPos-window ) {
2589  ref_range.SetToOpen(ref_pos+window);
2590  }
2591  else {
2592  ref_range.SetToOpen(kInvalidSeqPos);
2593  }
      // the actual selection, including the index level limits, is delegated
      // to Select()
2594  Select(bam_db, ref_label, ref_range, min_index_level, max_index_level, search_mode, file_pos);
2595 }
2596 
2597 
2599  const CBGZFPos* file_pos)
2600 {
      // Selection without a reference sequence filter: m_RefIndex is set to
      // size_t(-1), and both index level limits are set to 0.
2601  m_RefIndex = size_t(-1);
      // start from *file_pos when one is given and it tests true, otherwise
      // cover the whole file as described by header
2603  if ( file_pos && *file_pos ) {
2604  m_Ranges.SetFrom(*file_pos);
2605  }
2606  else {
2607  m_Ranges.SetWhole(header);
2608  }
2610  m_MinIndexLevel = 0;
2611  m_MaxIndexLevel = 0;
      // position on the first record only if there is a range to read
2612  if ( x_UpdateRange() ) {
2613  Next();
2614  }
2615 }
2616 
2617 
2619  size_t ref_index,
2620  CRange<TSeqPos> ref_range,
2621  TIndexLevel min_index_level,
2622  TIndexLevel max_index_level,
2623  ESearchMode search_mode,
2624  const CBGZFPos* file_pos)
2625 {
      // Selection limited to one reference sequence and range: copies the
      // index parameters, stores the query (reference index, range, index
      // level limits, search mode) and asks m_Ranges to compute the file
      // ranges to read from the index.
2626  SBamIndexParams::operator=(index);
2627  m_RefIndex = ref_index;
2628  m_QueryRefRange = ref_range;
2629  m_Ranges.SetRanges(index, ref_index, ref_range, min_index_level, max_index_level, search_mode, file_pos);
2631  m_MinIndexLevel = min_index_level;
2632  m_MaxIndexLevel = max_index_level;
2633  m_SearchMode = search_mode;
      // position on the first record only if there is a range to read
2634  if ( x_UpdateRange() ) {
2635  Next();
2636  }
2637 }
2638 
2639 
2641 {
      // Moves reading to the next file range of m_Ranges.
      // Returns false when m_NextRange has reached m_Ranges.end(); otherwise
      // seeks the reader to the range, remembers the range end in
      // m_CurrentRangeEnd, advances m_NextRange and returns true.
2642  if ( m_NextRange == m_Ranges.end() ) {
2644  return false;
2645  }
2646  else {
2647  m_CurrentRangeEnd = m_NextRange->second;
      // the range end is passed to Seek() as its end position argument
2648  m_Reader.Seek(m_NextRange->first, m_NextRange->second);
2649  ++m_NextRange;
2650  return true;
2651  }
2652 }
2653 
2654 
2656 {
      // Decides whether the current alignment is filtered out.
      // Returns true when the alignment must be skipped and false when it is
      // accepted. False is also returned after x_Stop() has been called for
      // an alignment past the search range.
      // Side effect: m_AlignRefRange and m_AlignReadRange are filled from the
      // CIGAR for every alignment that passes the first group of checks.
2657  _ASSERT(*this);
      // m_RefIndex == size_t(-1) means no reference filter is active
2659  if ( m_RefIndex != size_t(-1) ) {
2660  // check for alignment validity
2661  if ( size_t(m_AlignInfo.get_ref_index()) != m_RefIndex ) {
2662  // wrong reference sequence
2663  return true;
2664  }
2665  if ( !IsMapped() ) {
2666  // unaligned read
2667  return true;
2668  }
2669  if ( GetCIGAROpsCount() == 0 ) {
2670  // empty CIGAR string
2671  return true;
2672  }
2673  }
      // first = range on the reference, second = range on the read
2674  auto alignment = m_AlignInfo.get_cigar_alignment();
2675  m_AlignRefRange = alignment.first;
2676  m_AlignReadRange = alignment.second;
2677  if ( m_RefIndex == size_t(-1) ) {
2678  // unfiltered alignments
2679  return false;
2680  }
2682  // after search range
2683  x_Stop();
2684  return false;
2685  }
2686  if ( m_SearchMode == eSearchByOverlap ) {
2687  // any overlapping alignment
2689  // before search range
2690  return true;
2691  }
2692  }
2693  else {
2694  // only starting within the range
2696  // before search range
2697  return true;
2698  }
2699  }
      // the index level check is needed only when the requested level limits
      // differ from the full span 0..GetMaxIndexLevel()
2700  if ( m_MinIndexLevel != 0 || m_MaxIndexLevel != GetMaxIndexLevel() ) {
2701  TIndexLevel index_level = GetIndexLevel();
2702  if ( index_level < m_MinIndexLevel || index_level > m_MaxIndexLevel ) {
2703  // this index level is not requested
2704  return true;
2705  }
2706  }
2707  return false;
2708 }
2709 
2710 
2712 {
      // Advances record by record until x_NextAnnot() reports there is no
      // next record or a record is found that x_NeedToSkip() does not reject.
2713  while ( x_NextAnnot() && x_NeedToSkip() ) {
2714  // continue
2715  }
2716 }
2717 
2718 
// Converts the CIGAR of the current alignment into segments.
// For every reported segment two values are appended to 'starts' (start on
// the reference first, then start on the read) and one value to 'lens'
// (segment length). A side that has no counterpart in the segment gets
// kInvalidSeqPos as its start.
// Throws CBamException (eBadCIGAR) for an unrecognized op code and for a
// reported segment of zero length.
2719 void CBamRawAlignIterator::GetSegments(vector<int>& starts, vector<TSeqPos>& lens) const
2720 {
      // running positions on the reference and on the read
2721  TSeqPos refpos = GetRefSeqPos();
2722  TSeqPos seqpos = 0;
2723 
2724  // ignore hard breaks
2725  // omit soft breaks in the alignment
2726  const char* ptr = m_AlignInfo.get_cigar_ptr();
      // each op is one 4-byte word: low 4 bits = op code, upper bits = length
2728  uint32_t op = SBamUtil::MakeUint4(ptr);
2729  ptr += 4;
2730  TSeqPos seglen = op >> 4;
2731  int refstart, seqstart;
2732  switch ( op & 0xf ) {
2734  case SBamAlignInfo::kCIGAR_P: // ?
2735  continue;
2737  seqpos += seglen;
2738  continue;
      // segment present on both the reference and the read
2742  refstart = refpos;
2743  refpos += seglen;
2744  seqstart = seqpos;
2745  seqpos += seglen;
2746  break;
      // segment present on the read only
2748  refstart = kInvalidSeqPos;
2749  seqstart = seqpos;
2750  seqpos += seglen;
2751  break;
      // segment present on the reference only
2754  refstart = refpos;
2755  refpos += seglen;
2756  seqstart = kInvalidSeqPos;
2757  break;
2758  default:
2759  NCBI_THROW_FMT(CBamException, eBadCIGAR,
2760  "Bad CIGAR segment: " << (op & 0xf) << " in " <<GetCIGAR());
2761  }
      // only ops that produce a segment reach this point
2762  if ( seglen == 0 ) {
2763  NCBI_THROW_FMT(CBamException, eBadCIGAR,
2764  "Zero CIGAR segment: in " << GetCIGAR());
2765  }
2766  starts.push_back(refstart);
2767  starts.push_back(seqstart);
2768  lens.push_back(seglen);
2769  }
2770 }
2771 
2772 
static void sx_SetTitle(CSeq_graph &graph, CSeq_annot &annot, string title, string name)
Definition: bamindex.cpp:1401
static char * s_format(char *dst, uint32_t v)
Definition: bamindex.cpp:2268
static const size_t kIndexMagicLength
Definition: bamindex.cpp:65
BEGIN_LOCAL_NAMESPACE
Definition: bamindex.cpp:1208
NCBI_PARAM_DEF_EX(int, BAM, OVERLAP_MODE, 2, eParam_NoThread, BAM_OVERLAP_MODE)
static const char kBamExt[]
Definition: bamindex.cpp:63
static const float kEstimatedCompression
Definition: bamindex.cpp:74
static void s_ReadMagic(CBGZFStream &in, const char *magic)
Definition: bamindex.cpp:124
static CBGZFPos s_ReadFilePos(CNcbiIstream &in)
Definition: bamindex.cpp:162
static int s_GetRangesMode()
Definition: bamindex.cpp:1753
Uint8 s_EstimatedPos(CBGZFPos pos)
Definition: bamindex.cpp:755
static size_t ReadVDBFile(AutoArray< char > &data, const string &path)
Definition: bamindex.cpp:1153
static int32_t s_ReadInt32(CNcbiIstream &in)
Definition: bamindex.cpp:146
static int s_GetOverlapMode()
Definition: bamindex.cpp:491
static CBGZFPos s_GetOverlap(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:435
static void s_AddReplacedExt(vector< string > &dst, const string &base_name, CTempString old_ext, CTempString new_ext)
Definition: bamindex.cpp:1974
END_LOCAL_NAMESPACE
Definition: bamindex.cpp:1248
NCBI_PARAM_DECL(int, BAM, OVERLAP_MODE)
static CBGZFPos s_GetNextFilePos(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:460
static CBGZFPos s_GetFilePos(const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:448
static uint32_t s_ReadUInt32(CNcbiIstream &in)
Definition: bamindex.cpp:137
static const char kBaiExt[]
Definition: bamindex.cpp:66
static const size_t kGZipMagicLength
Definition: bamindex.cpp:60
static CBGZFRange s_ReadFileRange(CNcbiIstream &in)
Definition: bamindex.cpp:169
static COpenRange< TSeqPos > s_GetSeqRange(SBamIndexParams params, const pair< SBamIndexRefIndex::TBinsIter, SBamIndexRefIndex::TBinsIter > &iters)
Definition: bamindex.cpp:417
static const char kGZipMagic[]
Definition: bamindex.cpp:61
static const char kIndexMagicBAI[]
Definition: bamindex.cpp:67
static void s_ReadString(CBGZFStream &in, string &ret, size_t len)
Definition: bamindex.cpp:116
ostream & operator<<(ostream &out, const CBamFileRangeSet &ranges)
Definition: bamindex.cpp:1707
static void s_Read(CNcbiIstream &in, char *dst, size_t len)
Definition: bamindex.cpp:77
Uint8 s_EstimatedSize(CBGZFPos file_pos1, CBGZFPos file_pos2)
Definition: bamindex.cpp:762
static uint64_t s_ReadUInt64(CNcbiIstream &in)
Definition: bamindex.cpp:153
pair< CBGZFPos, CBGZFPos > CBGZFRange
Definition: bgzf.hpp:272
void SetPreviousReadStatistics(const pair< Uint8, double > &stats)
Definition: bgzf.hpp:331
pair< Uint8, double > GetReadStatistics() const
Definition: bgzf.hpp:327
pair< Uint8, double > GetUncompressStatistics() const
Definition: bgzf.cpp:402
TByteOffset GetByteOffset() const
Definition: bgzf.hpp:224
TFileBlockPos GetFileBlockPos() const
Definition: bgzf.hpp:220
static CBGZFPos GetInvalid()
Definition: bgzf.hpp:254
bool IsInvalid() const
Definition: bgzf.hpp:258
CBGZFPos GetSeekPos() const
Definition: bgzf.hpp:402
void Seek(CBGZFPos pos, CBGZFPos end_pos=CBGZFPos::GetInvalid())
Definition: bgzf.cpp:462
const char * m_AuxEnd
Definition: bamindex.hpp:1137
SBamAuxData m_AuxData
Definition: bamindex.hpp:1135
const char * m_AuxPtr
Definition: bamindex.hpp:1136
static int GetDebugLevel()
Definition: bamread.cpp:389
@ eFileNotFound
File not found.
virtual TErrCode GetErrCode(void) const
Definition: bamread.cpp:168
void SetFrom(CBGZFPos file_pos)
Definition: bamindex.hpp:948
void AddWhole(const CBamHeader &header)
Definition: bamindex.cpp:1889
const_iterator end() const
Definition: bamindex.hpp:971
TRanges m_Ranges
Definition: bamindex.hpp:984
void AddFrom(CBGZFPos file_pos)
Definition: bamindex.cpp:1880
const_iterator begin() const
Definition: bamindex.hpp:967
void AddSortedRanges(const vector< CBGZFRange > &ranges, const CBGZFPos *file_pos=nullptr)
Definition: bamindex.cpp:1724
void AddRanges(const CBamIndex &index, size_t ref_index, COpenRange< TSeqPos > ref_range, ESearchMode search_mode=eSearchByOverlap, const CBGZFPos *file_pos=nullptr)
Definition: bamindex.cpp:1859
void SetWhole(const CBamHeader &header)
Definition: bamindex.hpp:942
Uint8 GetFileSize() const
Definition: bamindex.cpp:1946
void SetRanges(const CBamIndex &index, size_t ref_index, COpenRange< TSeqPos > ref_range, ESearchMode search_mode=eSearchByOverlap, const CBGZFPos *file_pos=nullptr)
Definition: bamindex.cpp:1919
const SBamHeaderRefInfo & GetRef(size_t ref_index) const
Definition: bamindex.cpp:1598
void Read(CBGZFStream &stream)
Definition: bamindex.cpp:1581
TRefs m_Refs
Definition: bamindex.hpp:109
map< string, string > TSBamTags
Definition: bamindex.hpp:72
size_t GetRefCount() const
Definition: bamindex.hpp:84
size_t GetRefIndex(const string &name) const
Definition: bamindex.cpp:1608
size_t GetSBamRecords(TSBamRecords &records) const
Definition: bamindex.cpp:1617
CBGZFPos m_AlignStart
Definition: bamindex.hpp:110
list< TSBamRecord > TSBamRecords
Definition: bamindex.hpp:74
CBGZFPos GetAlignStart() const
Definition: bamindex.hpp:101
string m_Text
Definition: bamindex.hpp:107
pair< string, TSBamTags > TSBamRecord
Definition: bamindex.hpp:73
TSeqPos GetRefLength(size_t index) const
Definition: bamindex.hpp:94
map< string, size_t > m_RefByName
Definition: bamindex.hpp:108
const SBamIndexRefIndex & GetRef(size_t ref_index) const
Definition: bamindex.cpp:1363
Uint8 m_TotalReadBytes
Definition: bamindex.hpp:760
double m_TotalReadSeconds
Definition: bamindex.hpp:761
vector< uint64_t > CollectEstimatedCoverage(size_t ref_index, TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1482
TRefs m_Refs
Definition: bamindex.hpp:758
void Read(const string &index_file_name)
Definition: bamindex.cpp:1190
size_t GetRefCount() const
Definition: bamindex.hpp:468
Uint8 m_UnmappedCount
Definition: bamindex.hpp:759
string m_FileName
Definition: bamindex.hpp:757
pair< Uint8, double > GetReadStatistics() const
Definition: bamindex.hpp:751
CRef< CSeq_annot > MakeEstimatedCoverageAnnot(const CBamHeader &header, const string &ref_name, const string &seq_id, const string &annot_name, TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1416
CBGZFRange GetTotalFileRange(size_t ref_index) const
Definition: bamindex.cpp:1386
void SetLengthFromHeader(const CBamHeader &header)
Definition: bamindex.cpp:1373
Uint2 GetCIGAROpsCount() const
Definition: bamindex.hpp:1560
TSeqPos GetRefSeqPos() const
Definition: bamindex.hpp:1519
SBamAlignInfo m_AlignInfo
Definition: bamindex.hpp:1742
ESearchMode m_SearchMode
Definition: bamindex.hpp:1741
CBamFileRangeSet::const_iterator m_NextRange
Definition: bamindex.hpp:1746
string GetCIGAR() const
Definition: bamindex.hpp:1597
bool IsMapped() const
Definition: bamindex.hpp:1640
void GetSegments(vector< int > &starts, vector< TSeqPos > &lens) const
Definition: bamindex.cpp:2719
void Select(CBamRawDb &bam_db, const CBGZFPos *file_pos=nullptr)
Definition: bamindex.hpp:1412
TIndexLevel GetIndexLevel() const
Definition: bamindex.hpp:1610
void x_Select(const CBamHeader &header, const CBGZFPos *file_pos=nullptr)
Definition: bamindex.cpp:2598
TIndexLevel m_MaxIndexLevel
Definition: bamindex.hpp:1740
CBamFileRangeSet m_Ranges
Definition: bamindex.hpp:1745
TIndexLevel m_MinIndexLevel
Definition: bamindex.hpp:1740
COpenRange< TSeqPos > m_QueryRefRange
Definition: bamindex.hpp:1739
COpenRange< TSeqPos > m_AlignRefRange
Definition: bamindex.hpp:1743
CBGZFPos m_CurrentRangeEnd
Definition: bamindex.hpp:1747
COpenRange< TSeqPos > m_AlignReadRange
Definition: bamindex.hpp:1744
CBGZFStream m_Reader
Definition: bamindex.hpp:1748
double GetEstimatedSecondsPerByte() const
Definition: bamindex.cpp:2032
CRef< CBGZFFile > m_File
Definition: bamindex.hpp:1054
void Open(const string &bam_path)
Definition: bamindex.cpp:1966
CBamIndex m_Index
Definition: bamindex.hpp:1056
CBamHeader m_Header
Definition: bamindex.hpp:1055
CByte_graph –.
Definition: Byte_graph.hpp:66
CMemoryReader(const char *ptr, size_t size)
Definition: bamindex.cpp:1212
ERW_Result Read(void *buf, size_t count, size_t *bytes_read)
Read as many as "count" bytes into a buffer pointed to by the "buf" argument.
Definition: bamindex.cpp:1218
const char * m_Ptr
Definition: bamindex.cpp:1245
ERW_Result PendingCount(size_t *count)
Via parameter "count" (which is guaranteed to be supplied non-NULL) return the number of bytes that a...
Definition: bamindex.cpp:1238
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
@ fOwnReader
Own the underlying reader.
Definition: rwstreambuf.hpp:66
void clear()
Definition: bamindex.hpp:777
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
Definition: Seq_entry.hpp:56
CStopWatch –.
Definition: ncbitime.hpp:1937
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CZipStreamDecompressor – zlib based decompression stream processor.
Definition: zlib.hpp:817
A very basic data-read interface.
void erase(iterator pos)
Definition: map.hpp:167
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator upper_bound(const key_type &key) const
Definition: map.hpp:155
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
static ulg bb
static const char fp[]
Definition: des.c:87
static const char si[8][64]
Definition: des.c:146
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static const char * str(char *buf, int n)
Definition: stats.c:84
char data[12]
Definition: iconv.c:80
Uint8 uint64_t
Int4 int32_t
Uint2 uint16_t
Uint4 uint32_t
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
string
Definition: cgiapp.hpp:690
@ fGZip
Set of flags for gzip file support. See each flag description above.
Definition: zlib.hpp:120
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
Definition: ncbiexpt.hpp:719
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
Definition: ncbi_param.hpp:149
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
int8_t Int1
1-byte (8-bit) signed integer
Definition: ncbitype.h:98
TThisType & SetFrom(position_type from)
Definition: range.hpp:170
position_type GetTo(void) const
Definition: range.hpp:142
TThisType & SetToOpen(position_type toOpen)
Definition: range.hpp:175
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFrom(void) const
Definition: range.hpp:134
static TThisType GetEmpty(void)
Definition: range.hpp:306
TThisType & SetLength(position_type length)
Definition: range.hpp:194
bool Empty(void) const
Definition: range.hpp:148
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
ERW_Result
Result codes for I/O operations.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
@ eRW_Eof
End of data, should be considered permanent.
@ eRW_Success
Everything is okay, I/O completed.
static string PrintableString(const CTempString str, TPrintableMode mode=fNewLine_Quote|fNonAscii_Passthru)
Get a printable version of the specified string.
Definition: ncbistr.cpp:3944
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5424
CTempString & assign(const char *src_str, size_type len)
Assign new values to the content of the a string.
Definition: tempstr.hpp:733
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1941
void SetA(TA value)
Assign a value to A data member.
void SetMin(TMin value)
Assign a value to Min data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
vector< char > TValues
Definition: Byte_graph_.hpp:89
void SetNumval(TNumval value)
Assign a value to Numval data member.
void SetComp(TComp value)
Assign a value to Comp data member.
TValues & SetValues(void)
Assign a value to Values data member.
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
void SetMax(TMax value)
Assign a value to Max data member.
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
void SetAxis(TAxis value)
Assign a value to Axis data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
static CStopWatch sw
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
FILE * file
char * buf
int i
int len
static constexpr streamsize bmax
range(_Ty, _Ty) -> range< _Ty >
void timsort(RandomAccessIterator const first, RandomAccessIterator const last)
Same as std::stable_sort(first, last).
Definition: timsort.hpp:650
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
@ eRead
Definition: ns_types.hpp:56
static unsigned cnt[256]
#define count
static size_t read_size(CNcbiIstream &stream, const char *name)
Definition: reader_snp.cpp:404
Reader-writer based streams.
#define uint32_t
Definition: config.h:42
#define uint64_t
Definition: config.h:48
bool operator()(const CBGZFPos p1, const CBGZFRange &p2) const
Definition: bamindex.cpp:280
bool operator()(const CBGZFRange &p1, const CBGZFPos p2) const
Definition: bamindex.cpp:284
bool operator()(const CBGZFPos p1, const SBamIndexBinInfo &p2) const
Definition: bamindex.cpp:268
bool operator()(const SBamIndexBinInfo &p1, const CBGZFPos p2) const
Definition: bamindex.cpp:272
string get_read() const
Definition: bamindex.cpp:2064
const char * get_cigar_ptr() const
Definition: bamindex.hpp:1240
uint32_t get_cigar_read_size() const
Definition: bamindex.cpp:2158
const char * m_ReadPtr
Definition: bamindex.hpp:1309
const char * m_CIGARPtr
Definition: bamindex.hpp:1308
uint32_t get_cigar_pos() const
Definition: bamindex.cpp:2110
CTempString get_short_seq_accession_id() const
Definition: bamindex.cpp:2511
int32_t get_ref_pos() const
Definition: bamindex.hpp:1165
pair< COpenRange< uint32_t >, COpenRange< uint32_t > > get_cigar_alignment(void) const
Definition: bamindex.cpp:2183
uint8_t get_read_name_len() const
Definition: bamindex.hpp:1170
const char * get_read_ptr() const
Definition: bamindex.hpp:1263
const char * get_aux_data_end() const
Definition: bamindex.hpp:1283
bool has_ambiguous_match() const
Definition: bamindex.cpp:2251
uint32_t get_cigar_ref_size() const
Definition: bamindex.cpp:2132
const char * get_read_name_ptr() const
Definition: bamindex.hpp:1232
static const char kBaseSymbols[]
Definition: bamindex.hpp:1292
string get_cigar() const
Definition: bamindex.cpp:2229
const char * m_RecordPtr
Definition: bamindex.hpp:1307
void Read(CBGZFStream &in)
Definition: bamindex.cpp:2520
const char * get_aux_data_ptr() const
Definition: bamindex.hpp:1279
CBGZFPos m_FilePos
Definition: bamindex.hpp:1306
uint16_t get_cigar_ops_count() const
Definition: bamindex.hpp:1194
static const char kCIGARSymbols[]
Definition: bamindex.hpp:1182
Uint4 m_RecordSize
Definition: bamindex.hpp:1310
int32_t get_ref_index() const
Definition: bamindex.hpp:1161
uint32_t get_read_len() const
Definition: bamindex.hpp:1216
SBamAuxData get_aux_data(char c1, char c2, bool allow_missing=false) const
Definition: bamindex.cpp:2496
char GetChar() const
Definition: bamindex.cpp:2426
char m_DataType
Definition: bamindex.hpp:1098
uint32_t m_ElementCount
Definition: bamindex.hpp:1100
bool IsChar() const
Definition: bamindex.hpp:1084
float GetFloat(size_t index=0) const
Definition: bamindex.cpp:2480
bool IsInt() const
Definition: bamindex.hpp:1087
char m_Tag[2]
Definition: bamindex.hpp:1097
char GetDataType() const
Definition: bamindex.hpp:1079
Int8 GetInt(size_t index=0) const
Definition: bamindex.cpp:2448
size_t size() const
Definition: bamindex.hpp:1082
bool IsFloat() const
Definition: bamindex.hpp:1086
bool m_IsArray
Definition: bamindex.hpp:1099
CTempString GetString() const
Definition: bamindex.cpp:2437
const char * m_DataPtr
Definition: bamindex.hpp:1101
bool IsString() const
Definition: bamindex.hpp:1085
void Read(CBGZFStream &in)
Definition: bamindex.cpp:1564
TSeqPos m_Length
Definition: bamindex.hpp:52
CBGZFPos GetEndFilePos() const
Definition: bamindex.hpp:365
CBGZFPos m_Overlap
Definition: bamindex.hpp:357
vector< CBGZFRange > m_Chunks
Definition: bamindex.hpp:359
CBGZFPos GetStartFilePos() const
Definition: bamindex.hpp:361
void Read(CNcbiIstream &in, SBamIndexParams params)
Definition: bamindex.cpp:198
COpenRange< TSeqPos > GetSeqRange(SBamIndexParams params) const
Definition: bamindex.hpp:350
uint32_t TBin
Definition: bamindex.hpp:120
static const TShift kLevelStepBinShift
Definition: bamindex.hpp:137
uint8_t TIndexLevel
Definition: bamindex.hpp:121
static const TShift kBAI_min_shift
Definition: bamindex.hpp:138
static const TIndexLevel kMinBinIndexLevel
Definition: bamindex.hpp:134
static const TIndexLevel kBAI_depth
Definition: bamindex.hpp:139
static const TBin kMaxBinNumber
Definition: bamindex.hpp:133
constexpr TSeqPos GetBinSize(TIndexLevel level) const
Definition: bamindex.hpp:196
constexpr TBin GetPseudoBin() const
Definition: bamindex.hpp:265
TIndexLevel depth
Definition: bamindex.hpp:157
constexpr TBin GetBinNumberBase(int level) const
Definition: bamindex.hpp:239
constexpr TBin GetFirstBin(TIndexLevel level) const
Definition: bamindex.hpp:257
TShift min_shift
Definition: bamindex.hpp:156
pair< TBin, TBin > GetBinRange(COpenRange< TSeqPos > ref_range, TIndexLevel index_level) const
Definition: bamindex.cpp:851
TBin GetBinNumber(TSeqPos pos, TIndexLevel level) const
Definition: bamindex.hpp:285
TBin GetBinNumberOffset(TSeqPos pos, TIndexLevel level) const
Definition: bamindex.hpp:277
bool IsOverflowBin(TBin bin, TIndexLevel level=0) const
Definition: bamindex.hpp:269
constexpr TShift GetMinLevelBinShift() const
Definition: bamindex.hpp:158
constexpr TSeqPos GetMinBinSize() const
Definition: bamindex.hpp:208
constexpr TShift GetLevelBinShift(TIndexLevel level) const
Definition: bamindex.hpp:187
constexpr TIndexLevel GetMaxIndexLevel() const
Definition: bamindex.hpp:162
constexpr TBin GetLastBin(TIndexLevel level) const
Definition: bamindex.hpp:261
vector< TSeqPos > GetAlnOverStarts(void) const
Definition: bamindex.cpp:498
pair< TBinsIter, TBinsIter > GetBinsIterRange(pair< TBin, TBin > bin_range) const
Definition: bamindex.cpp:911
vector< Uint8 > EstimateDataSizeByAlnStartPos(TSeqPos seqlen=kInvalidSeqPos) const
Definition: bamindex.cpp:1055
TSeqPos m_EstimatedLength
Definition: bamindex.hpp:442
CBGZFRange GetFileRange() const
Definition: bamindex.cpp:1031
pair< TBinsIter, TBinsIter > GetLevelBins(TIndexLevel level) const
Definition: bamindex.cpp:254
bool ProcessPseudoBin(SBamIndexBinInfo &bin)
Definition: bamindex.cpp:300
TBins::const_iterator TBinsIter
Definition: bamindex.hpp:418
CBGZFRange GetLimitRange(COpenRange< TSeqPos > &ref_range, ESearchMode search_mode) const
Definition: bamindex.cpp:784
const char * Read(const char *buffer_ptr, const char *buffer_end, SBamIndexParams params, int32_t ref_index)
Definition: bamindex.cpp:379
vector< uint64_t > CollectEstimatedCoverage(TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1107
CBGZFRange m_UnmappedChunk
Definition: bamindex.hpp:437
vector< CBGZFPos > m_Overlaps
Definition: bamindex.hpp:440
void SetLengthFromHeader(TSeqPos length)
Definition: bamindex.cpp:291
pair< TBinsIter, TBinsIter > AddLevelFileRanges(vector< CBGZFRange > &ranges, CBGZFRange limit_file_range, pair< TBin, TBin > bin_range) const
Definition: bamindex.cpp:887
void ProcessBin(const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:314
vector< TSeqPos > GetAlnOverEnds(void) const
Definition: bamindex.cpp:735
static void x_AddDataSize(vector< Uint8 > &vv, size_t beg_pos, size_t end_pos, CBGZFPos file_beg, CBGZFPos file_end)
Definition: bamindex.cpp:925
size_t block_end
Definition: bamindex.cpp:920
size_t fill_beg_to
Definition: bamindex.cpp:921
void Init(size_t index)
Definition: bamindex.cpp:953
size_t block_beg
Definition: bamindex.cpp:920
CBGZFPos file_end
Definition: bamindex.cpp:922
void InitData(vector< Uint8 > &vv, const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:958
CBGZFPos file_beg
Definition: bamindex.cpp:922
size_t fill_end_to
Definition: bamindex.cpp:921
SBamRangeBlock(vector< Uint8 > &vv, const vector< SBamRangeBlock > &bb, size_t bb_beg, size_t bb_end)
Definition: bamindex.cpp:1005
void ExpandData(vector< Uint8 > &vv, const SBamIndexBinInfo &bin)
Definition: bamindex.cpp:972
static Uint8 MakeUint8(const char *buf)
Definition: bgzf.hpp:167
static Uint4 MakeUint4(const char *buf)
Definition: bgzf.hpp:159
static Uint2 MakeUint2(const char *buf)
Definition: bgzf.hpp:153
static float MakeFloat(const char *buf)
Definition: bgzf.hpp:183
#define _ASSERT
ZLib Compression API.
Modified on Fri Sep 20 14:57:21 2024 by modify_doxy.py rev. 669887