NCBI C++ ToolKit
bamloader_impl.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bamloader_impl.cpp 101549 2024-01-02 16:40:09Z vasilche $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Eugene Vasilchenko
27  *
28  * File Description: BAM file data loader
29  *
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
38 #include <objects/seq/seq__.hpp>
40 
48 #include <serial/objistr.hpp>
49 #include <serial/serial.hpp>
51 
52 #include <sra/error_codes.hpp>
57 
59 
60 #include <algorithm>
61 #include <numeric>
62 #include <cmath>
63 
65 
66 #define NCBI_USE_ERRCODE_X BAMLoader
68 
70 
71 class CDataLoader;
72 
73 static const int kTSEId = 1;
74 static const Uint8 kSingleAlignBytes = 60; // estimated size of a single alignment in bytes
75 //static const size_t kGraphScale = 1000;
76 //static const size_t kGraphPoints = 20;
77 static const size_t kChunkDataSize = 250000; // target chunk size in bytes
78 static const size_t kChunkSize = kChunkDataSize/kSingleAlignBytes; // target chunk align count
80 static const size_t kSplitLevelsChunkDataSize = 2*kChunkDataSize; // split >0 index levels if size in bytes is bigger
81 
82 // With raw index BAM we can position within index bin, so we can split
83 // huge bins into smaller pieces.
84 // We limit the splitting by both minimal annot chunk range (to avoid
85 // too many alignments to overlap with the next chunk), and by expected
86 // alignment count in a single chunk.
87 // parameters for sub-page splitting, available only with raw BAM index access
88 static const Uint8 kDefaultSplitBinDataSize = 4*kChunkDataSize; // split if size in bytes is bigger
89 static const TSeqPos kDefaultSplitBinMinLength = 2048; // stop splitting if ref seq range is this small
90 
91 //#define SKIP_TOO_LONG_ALIGNMENTS
92 #define SEPARATE_PILEUP_READS
93 
94 // Regular chunks have size of one or more whole index bins.
95 // Extremely populated bins may be split into smaller regions.
96 // The pileup chunk for such smaller regions still covers whole bin.
97 // Since the pileup chunk id is smaller than any other chunks for
98 // the same region it will be loaded first by OM.
99 
100 // During the pileup graph generation we remember where each
101 // sub-range alignments start in the BAM file, and load all alignments
102 // that cross sub-range borders.
103 // So, the sub-range chunks have only alignments that are fully within
104 // their range for OM to index.
105 // All border crossing alignments will be loaded by the pileup chunk so
106 // it's reported to OM as having alignments for the whole range.
107 
108 // When chunks for highly populated bin are generated we first split
109 // alignments by their BAM index level, then if necessary split the bin
110 // into smaller chunks.
111 
112 // Regular chunks (eChunk_align) and chunks with alignments from higher
113 // index levels (eChunk_align2) are gathered by their starting positions.
114 // Their total alignment ranges (for OM indexing) may go beyond their
115 // range end.
116 
117 // The chunks with alignments from BAM index level =0 (eChunk_align1)
118 // are gathered only if they end also within the chunk range, so their
119 // OM indexing ranges do not need to be extended
120 
122  eChunk_pileup_graph, // pileup graphs over the range
123  eChunk_align, // all alignments starting in range
124  eChunk_align1, // alignments from BAM index level =0 within the range
125  eChunk_align2, // alignments from BAM index levels >0 starting in range
126  eChunk_short_seq_pileup, // reads for alignments loaded with pileup graph
127  eChunk_short_seq, // reads corresponding to eChunk_align
128  eChunk_short_seq1, // reads corresponding to eChunk_align1
129  eChunk_short_seq2, // reads corresponding to eChunk_align2
131 };
132 
133 #define PILEUP_NAME_SUFFIX "pileup graphs"
134 
135 NCBI_PARAM_DECL(int, BAM_LOADER, DEBUG);
136 NCBI_PARAM_DEF_EX(int, BAM_LOADER, DEBUG, 0,
137  eParam_NoThread, BAM_LOADER_DEBUG);
138 
139 static int GetDebugLevel(void)
140 {
141  static int value = NCBI_PARAM_TYPE(BAM_LOADER, DEBUG)::GetDefault();
142  return value;
143 }
144 
145 
146 NCBI_PARAM_DECL(string, BAM_LOADER, MAPPER_FILE);
147 NCBI_PARAM_DEF_EX(string, BAM_LOADER, MAPPER_FILE, "",
148  eParam_NoThread, BAM_LOADER_MAPPER_FILE);
149 
150 static string GetMapperFileName(void)
151 {
152  static CSafeStatic<NCBI_PARAM_TYPE(BAM_LOADER, MAPPER_FILE)> s_Value;
153  return s_Value->Get();
154 }
155 
156 
157 NCBI_PARAM_DECL(string, BAM_LOADER, MAPPER_CONTEXT);
158 NCBI_PARAM_DEF_EX(string, BAM_LOADER, MAPPER_CONTEXT, "",
159  eParam_NoThread, BAM_LOADER_MAPPER_CONTEXT);
160 
161 static string GetMapperContext(void)
162 {
163  static CSafeStatic<NCBI_PARAM_TYPE(BAM_LOADER, MAPPER_CONTEXT)> s_Value;
164  return s_Value->Get();
165 }
166 
167 
168 NCBI_PARAM_DECL(bool, BAM_LOADER, PILEUP_GRAPHS);
169 NCBI_PARAM_DEF(bool, BAM_LOADER, PILEUP_GRAPHS, true);
170 
172 {
173  return NCBI_PARAM_TYPE(BAM_LOADER, PILEUP_GRAPHS)::GetDefault();
174 }
175 
176 
178 {
179  NCBI_PARAM_TYPE(BAM_LOADER, PILEUP_GRAPHS)::SetDefault(param);
180 }
181 
182 
183 static inline bool GetPileupGraphsParam(void)
184 {
186 }
187 
188 
189 NCBI_PARAM_DECL(bool, BAM_LOADER, SKIP_EMPTY_PILEUP_GRAPHS);
190 NCBI_PARAM_DEF(bool, BAM_LOADER, SKIP_EMPTY_PILEUP_GRAPHS, true);
191 
193 {
194  return NCBI_PARAM_TYPE(BAM_LOADER, SKIP_EMPTY_PILEUP_GRAPHS)::GetDefault();
195 }
196 
197 
199 {
200  NCBI_PARAM_TYPE(BAM_LOADER, SKIP_EMPTY_PILEUP_GRAPHS)::SetDefault(param);
201 }
202 
203 
204 static inline bool GetSkipEmptyPileupGraphsParam(void)
205 {
207 }
208 
209 
210 NCBI_PARAM_DECL(int, BAM_LOADER, GAP_TO_INTRON_THRESHOLD);
211 NCBI_PARAM_DEF(int, BAM_LOADER, GAP_TO_INTRON_THRESHOLD, -1);
212 
214 {
215  static TSeqPos value = NCBI_PARAM_TYPE(BAM_LOADER, GAP_TO_INTRON_THRESHOLD)::GetDefault();
216  return value;
217 }
218 
219 
220 NCBI_PARAM_DECL(bool, BAM_LOADER, INTRON_GRAPH);
221 NCBI_PARAM_DEF(bool, BAM_LOADER, INTRON_GRAPH, false);
222 
223 static bool s_GetMakeIntronGraph(void)
224 {
225  static TSeqPos value = NCBI_PARAM_TYPE(BAM_LOADER, INTRON_GRAPH)::GetDefault();
226  return value;
227 }
228 
229 
230 NCBI_PARAM_DECL(bool, BAM_LOADER, ESTIMATED_COVERAGE_GRAPH);
231 NCBI_PARAM_DEF(bool, BAM_LOADER, ESTIMATED_COVERAGE_GRAPH, true);
232 
234 {
235  return NCBI_PARAM_TYPE(BAM_LOADER, ESTIMATED_COVERAGE_GRAPH)::GetDefault();
236 }
237 
238 
240 {
241  NCBI_PARAM_TYPE(BAM_LOADER, ESTIMATED_COVERAGE_GRAPH)::SetDefault(param);
242 }
243 
244 
245 static inline bool GetEstimatedCoverageGraphParam(void)
246 {
248 }
249 
250 
251 NCBI_PARAM_DECL(bool, BAM_LOADER, PREFER_RAW_INDEX_OVER_COVERAGE_GRAPH);
252 NCBI_PARAM_DEF_EX(bool, BAM_LOADER, PREFER_RAW_INDEX_OVER_COVERAGE_GRAPH, false,
253  eParam_NoThread, BAM_LOADER_PREFER_RAW_INDEX_OVER_COVERAGE_GRAPH);
254 
256 {
257  return NCBI_PARAM_TYPE(BAM_LOADER, PREFER_RAW_INDEX_OVER_COVERAGE_GRAPH)::GetDefault();
258 }
259 
260 
261 NCBI_PARAM_DECL(bool, BAM_LOADER, PREOPEN);
262 NCBI_PARAM_DEF(bool, BAM_LOADER, PREOPEN, false);
263 
265 {
266  return NCBI_PARAM_TYPE(BAM_LOADER, PREOPEN)::GetDefault();
267 }
268 
269 
271 {
272  NCBI_PARAM_TYPE(BAM_LOADER, PREOPEN)::SetDefault(param);
273 }
274 
275 
276 NCBI_PARAM_DECL(string, BAM_LOADER, INCLUDE_ALIGN_TAGS);
277 NCBI_PARAM_DEF_EX(string, BAM_LOADER, INCLUDE_ALIGN_TAGS, "",
278  eParam_NoThread, "");
279 
281 {
282  return NCBI_PARAM_TYPE(BAM_LOADER, INCLUDE_ALIGN_TAGS)::GetDefault();
283 }
284 
285 
287 {
288  NCBI_PARAM_TYPE(BAM_LOADER, INCLUDE_ALIGN_TAGS)::SetDefault(param);
289 }
290 
291 
292 static inline string GetIncludeAlignTagsParam(void)
293 {
295 }
296 
297 
298 NCBI_PARAM_DECL(int, BAM_LOADER, MIN_MAP_QUALITY);
299 NCBI_PARAM_DEF(int, BAM_LOADER, MIN_MAP_QUALITY, 1);
300 
302 {
303  return NCBI_PARAM_TYPE(BAM_LOADER, MIN_MAP_QUALITY)::GetDefault();
304 }
305 
306 
308 {
309  NCBI_PARAM_TYPE(BAM_LOADER, MIN_MAP_QUALITY)::SetDefault(param);
310 }
311 
312 
313 static inline bool GetMinMapQualityParam(void)
314 {
316 }
317 
318 
319 NCBI_PARAM_DECL(Uint8, BAM_LOADER, SPLIT_BIN_DATA_SIZE);
320 NCBI_PARAM_DEF_EX(Uint8, BAM_LOADER, SPLIT_BIN_DATA_SIZE,
322  eParam_NoThread, BAM_LOADER_SPLIT_BIN_DATA_SIZE);
323 
324 static inline Uint8 GetSplitBinDataSize(void)
325 {
326  return NCBI_PARAM_TYPE(BAM_LOADER, SPLIT_BIN_DATA_SIZE)::GetDefault();
327 }
328 
329 
330 NCBI_PARAM_DECL(TSeqPos, BAM_LOADER, SPLIT_BIN_MIN_LENGTH);
331 NCBI_PARAM_DEF_EX(TSeqPos, BAM_LOADER, SPLIT_BIN_MIN_LENGTH,
333  eParam_NoThread, BAM_LOADER_SPLIT_BIN_MIN_LENGTH);
334 
335 static inline TSeqPos GetSplitBinMinLength(void)
336 {
337  return NCBI_PARAM_TYPE(BAM_LOADER, SPLIT_BIN_MIN_LENGTH)::GetDefault();
338 }
339 
340 
341 template<class Call>
343 CallWithRetry(Call&& call,
344  const char* name,
345  int retry_count = 0)
346 {
347  const int kDefaultRetryCount = 4;
348  if ( retry_count == 0 ) {
349  retry_count = kDefaultRetryCount;
350  }
351  for ( int t = 1; t < retry_count; ++ t ) {
352  try {
353  return call();
354  }
355  catch ( CBlobStateException& ) {
356  // no retry
357  throw;
358  }
359  catch ( CException& exc ) {
360  LOG_POST(Warning<<name<<"() try "<<t<<" exception: "<<exc);
361  }
362  catch ( exception& exc ) {
363  LOG_POST(Warning<<name<<"() try "<<t<<" exception: "<<exc.what());
364  }
365  catch ( ... ) {
366  LOG_POST(Warning<<name<<"() try "<<t<<" exception");
367  }
368  if ( t >= 2 ) {
369  //double wait_sec = m_WaitTime.GetTime(t-2);
370  double wait_sec = 1;
371  LOG_POST(Warning<<name<<"(): waiting "<<wait_sec<<"s before retry");
372  SleepMilliSec(Uint4(wait_sec*1000));
373  }
374  }
375  return call();
376 }
377 #define RETRY(expr) CallWithRetry([&]()->auto{return (expr);}, #expr)
378 
379 
380 /////////////////////////////////////////////////////////////////////////////
381 // CBAMBlobId
382 /////////////////////////////////////////////////////////////////////////////
383 
385 {
386  SIZE_TYPE div = str.rfind('/');
387  m_BamName = str.substr(0, div);
388  CSeq_id id(str.substr(div+1));
390 }
391 
392 
393 CBAMBlobId::CBAMBlobId(const string& bam_name, const CSeq_id_Handle& seq_id)
394  : m_BamName(bam_name), m_SeqId(seq_id)
395 {
396 }
397 
398 
400 {
401 }
402 
403 
404 string CBAMBlobId::ToString(void) const
405 {
406  return m_BamName+'/'+m_SeqId.AsString();
407 }
408 
409 
410 bool CBAMBlobId::operator<(const CBlobId& id) const
411 {
412  const CBAMBlobId& bam2 = dynamic_cast<const CBAMBlobId&>(id);
413  return m_SeqId < bam2.m_SeqId ||
414  (m_SeqId == bam2.m_SeqId && m_BamName < bam2.m_BamName);
415 }
416 
417 
418 bool CBAMBlobId::operator==(const CBlobId& id) const
419 {
420  const CBAMBlobId& bam2 = dynamic_cast<const CBAMBlobId&>(id);
421  return m_SeqId == bam2.m_SeqId && m_BamName == bam2.m_BamName;
422 }
423 
424 
425 /////////////////////////////////////////////////////////////////////////////
426 // CBAMDataLoader_Impl
427 /////////////////////////////////////////////////////////////////////////////
428 
429 
431  const CBAMDataLoader::SLoaderParams& params)
432  : m_IdMapper(params.m_IdMapper)
433 {
434  if ( !m_IdMapper ) {
435  string mapper_file_name = GetMapperFileName();
436  if ( !mapper_file_name.empty() ) {
437  CNcbiIfstream in(mapper_file_name.c_str());
439  }
440  }
441  CSrzPath srz_path;
442  m_DirPath = srz_path.FindAccPathNoThrow(params.m_DirPath);
443  if ( m_DirPath.empty() ) {
444  m_DirPath = params.m_DirPath;
445  }
446 
447  if ( !m_DirPath.empty() && *m_DirPath.rbegin() != '/' ) {
448  m_DirPath += '/';
449  }
450  if ( params.m_BamFiles.empty() ||
451  (params.m_BamFiles.size() == 1 &&
452  params.m_BamFiles[0].m_BamName == SRZ_CONFIG_NAME) ) {
453  AddSrzDef();
454  }
455  else {
456  ITERATE (vector<CBAMDataLoader::SBamFileName>, it, params.m_BamFiles) {
457  AddBamFile(*it);
458  }
459  }
461  OpenBAMFiles();
462  }
463 }
464 
465 
466 static double s_CreateTime = 0;
467 static double s_AttachTime = 0;
468 
469 
471 {
472  if ( GetDebugLevel() >= 4 ) {
473  LOG_POST_X(21, Info<<"CBAMDataLoader: "
474  "Total create time: "<<s_CreateTime);
475  LOG_POST_X(22, Info<<"CBAMDataLoader: "
476  "Total attach time: "<<s_AttachTime);
477  }
478 }
479 
480 
482 {
483  AutoPtr<CIdMapper> mapper(new CIdMapper);
484 
485  string def_name = m_DirPath+SRZ_CONFIG_NAME;
486  CNcbiIfstream in(def_name.c_str());
487  if ( !in ) {
488  NCBI_THROW(CLoaderException, eNoData,
489  "CBAMDataLoader: no def file: "+def_name);
490  }
491  string line;
492  vector<string> tokens;
493  while ( getline(in, line) ) {
494  tokens.clear();
495  NStr::Split(line, "\t", tokens);
496  if ( tokens.size() < 4 ) {
497  NCBI_THROW(CLoaderException, eNoData,
498  "CBAMDataLoader: bad def line: \""+line+"\"");
499  }
501  info.m_BamSeqLabel = tokens[0];
502  info.m_Label = tokens[1];
503  if ( tokens[2].empty() ) {
504  info.m_SeqId = CSeq_id_Handle::GetHandle(info.m_Label);
505  }
506  else {
507  info.m_SeqId = CSeq_id_Handle::GetHandle(tokens[2]);
508  }
509  info.m_BamFileName = tokens[3];
510  if ( tokens.size() >= 4 ) {
511  info.m_CovFileName = tokens[4];
512  }
513  info.m_AnnotName = CDirEntry(info.m_BamFileName.m_BamName).GetBase();
514  m_SeqInfos.push_back(info);
515  CRef<CSeq_id> src_id;
516  try {
517  src_id = new CSeq_id(info.m_BamSeqLabel);
518  }
519  catch ( CException& /*ignored*/ ) {
520  src_id = new CSeq_id(CSeq_id::e_Local, info.m_BamSeqLabel);
521  }
522  mapper->AddMapping(CSeq_id_Handle::GetHandle(*src_id), info.m_SeqId);
523  }
524  m_IdMapper.reset(mapper.release());
525 }
526 
527 
529 {
530  CMutexGuard guard(m_Mutex);
531  return !m_BamFiles.empty();
532 }
533 
534 
536 {
537  CMutexGuard guard(m_Mutex);
538  if ( !m_BamFiles.empty() ) {
539  return;
540  }
541  ITERATE ( TSeqInfos, it, m_SeqInfos ) {
542  const SDirSeqInfo& info = *it;
543 
544  CRef<CBamFileInfo> bam_info;
545  auto iter = m_BamFiles.find(info.m_BamFileName.m_BamName);
546  if ( iter == m_BamFiles.end() ) {
547  bam_info = new CBamFileInfo(*this, info.m_BamFileName,
548  info.m_BamSeqLabel, info.m_SeqId);
549  m_BamFiles[info.m_BamFileName.m_BamName] = bam_info;
550  }
551  else {
552  bam_info = iter->second;
553  bam_info->AddRefSeq(info.m_BamSeqLabel, info.m_SeqId);
554  }
555  if ( !info.m_CovFileName.empty() ) {
556  string file_name = m_DirPath + info.m_CovFileName;
557  if ( !CFile(file_name).Exists() ) {
558  ERR_POST_X(2, "CBAMDataLoader: "
559  "no cov file: \""+file_name+"\"");
560  }
561  else {
562  if ( CBamRefSeqInfo* seq_info = bam_info->GetRefSeqInfo(info.m_SeqId) ) {
563  seq_info->SetCovFileName(file_name);
564  }
565  }
566  }
567  }
568 }
569 
570 
572 {
574 }
575 
576 
578 {
580  info.m_BamFileName = bam;
581  info.m_AnnotName = CDirEntry(info.m_BamFileName.m_BamName).GetBase();
582  m_SeqInfos.push_back(info);
583 }
584 
585 
587 {
588  OpenBAMFiles();
590  if ( bit == m_BamFiles.end() ) {
591  return 0;
592  }
593  CBamRefSeqInfo* info = bit->second->GetRefSeqInfo(blob_id.m_SeqId);
594  return info;
595 }
596 
597 
599  CTSE_LoadLock& load_lock)
600 {
601  RETRY(GetRefSeqInfo(blob_id)->LoadMainSplit(load_lock));
602 }
603 
604 
606  CTSE_Chunk_Info& chunk_info)
607 {
608  RETRY(GetRefSeqInfo(blob_id)->LoadChunk(chunk_info));
609 }
610 
611 
613  const CTSE_Chunk_Info& chunk,
614  Uint4 bytes)
615 {
616  return GetRefSeqInfo(blob_id)->EstimateLoadSeconds(chunk, bytes);
617 }
618 
619 
622 {
623  CRef<CBAMBlobId> ret;
624  if ( BAMFilesOpened() ) {
625  ITERATE ( TBamFiles, it, m_BamFiles ) {
626  it->second->GetShortSeqBlobId(ret, idh);
627  }
628  }
629  return ret;
630 }
631 
632 
635 {
636  CRef<CBAMBlobId> ret;
637  OpenBAMFiles();
638  ITERATE ( TBamFiles, it, m_BamFiles ) {
639  it->second->GetRefSeqBlobId(ret, idh);
640  }
641  return ret;
642 }
643 
644 
646 {
648  ITERATE ( TSeqInfos, it, m_SeqInfos ) {
649  const string& name = it->m_AnnotName;
650  if ( name.empty() ) {
651  names.push_back(CAnnotName());
652  }
653  else {
654  names.push_back(CAnnotName(name));
655  }
656  }
657  sort(names.begin(), names.end());
658  names.erase(unique(names.begin(), names.end()), names.end());
659  return names;
660 }
661 
662 
664 {
665  if ( IsShortSeq(idh) ) {
666  ids.push_back(idh);
667  }
668 }
669 
670 
673 {
675  if ( !idh.IsGi() && idh.GetSeqId()->GetTextseq_Id() && IsShortSeq(idh) ) {
676  ret.sequence_found = true;
677  ret.acc_ver = idh;
678  }
679  return ret;
680 }
681 
682 
685 {
687  if ( idh.IsGi() && IsShortSeq(idh) ) {
688  ret.sequence_found = true;
689  ret.gi = idh.GetGi();
690  }
691  return ret;
692 }
693 
694 
696 {
697  if ( IsShortSeq(idh) ) {
698  return objects::GetLabel(idh);
699  }
700  return string();
701 }
702 
703 
705 {
706  if ( IsShortSeq(idh) ) {
707  return ZERO_TAX_ID;
708  }
709  return INVALID_TAX_ID;
710 }
711 
712 
714 {
715  return GetShortSeqBlobId(idh) != null;
716 }
717 
718 
719 /////////////////////////////////////////////////////////////////////////////
720 // CBamFileInfo
721 /////////////////////////////////////////////////////////////////////////////
722 
723 
725  const CBAMDataLoader::SBamFileName& bam,
726  const string& refseq_label,
727  const CSeq_id_Handle& seq_id)
728 {
729  CStopWatch sw;
730  if ( GetDebugLevel() >= 1 ) {
731  sw.Start();
732  }
733  x_Initialize(impl, bam);
734  if ( seq_id ) {
735  AddRefSeq(refseq_label, seq_id);
736  }
737  else {
738  for ( CBamRefSeqIterator rit(m_BamDb); rit; ++rit ) {
739  string refseq_label = rit.GetRefSeqId();
740  CSeq_id_Handle seq_id = CSeq_id_Handle::GetHandle(*rit.GetRefSeq_id());
741  AddRefSeq(refseq_label, seq_id);
742  }
743  }
744  if ( GetDebugLevel() >= 1 ) {
745  LOG_POST_X(16, Info<<"CBAMDataLoader: "
746  "Opened BAM file "<<bam.m_BamName<<" in "<<sw.Elapsed());
747  }
748 }
749 
750 
752  const CBAMDataLoader::SBamFileName& bam)
753 {
754  m_BamName = bam.m_BamName;
756  m_BamDb = CBamDb(impl.m_Mgr,
757  impl.m_DirPath+bam.m_BamName,
758  impl.m_DirPath+(bam.m_IndexName.empty()?
759  bam.m_BamName:
760  bam.m_IndexName));
761  if ( impl.m_IdMapper.get() ) {
762  m_BamDb.SetIdMapper(impl.m_IdMapper.get(), eNoOwnership);
763  }
764  string include_tags = GetIncludeAlignTagsParam();
765  if ( !include_tags.empty() ) {
766  vector<string> tags;
767  NStr::Split(include_tags, ",", tags);
768  for ( auto& tag : tags ) {
770  }
771  }
772 }
773 
774 
775 void CBamFileInfo::AddRefSeq(const string& refseq_label,
776  const CSeq_id_Handle& refseq_id)
777 {
778  if ( GetDebugLevel() >= 2 ) {
779  LOG_POST_X(9, Info << "CBAMDataLoader(" << m_BamName << "): "
780  "Found "<<refseq_label<<" -> "<<refseq_id);
781  }
782  auto& slot = m_RefSeqs[refseq_id];
783  if ( slot ) {
784  ERR_POST_X(15, "CBAMDataLoader::AddSeqRef: "
785  "duplicate Seq-id "<<refseq_id<<" for ref "<<refseq_label<<" in "<<GetBamName());
786  }
787  else {
788  slot = new CBamRefSeqInfo(this, refseq_label, refseq_id);
789  }
790 }
791 
792 
794  const CSeq_id_Handle& idh) const
795 {
796  ITERATE ( TRefSeqs, it, m_RefSeqs ) {
797  it->second->GetShortSeqBlobId(ret, idh);
798  }
799 }
800 
801 
803  const CSeq_id_Handle& idh) const
804 {
805  const CBamRefSeqInfo* info = GetRefSeqInfo(idh);
806  if ( info ) {
807  info->SetBlobId(ret, idh);
808  }
809 }
810 
811 
813 {
815  if ( it == m_RefSeqs.end() ) {
816  return 0;
817  }
818  return it->second.GetNCPointer();
819 }
820 
821 
822 /////////////////////////////////////////////////////////////////////////////
823 // CBamRefSeqInfo
824 /////////////////////////////////////////////////////////////////////////////
825 
826 
829 {
830 public:
831  void AddSpotId(string& short_id, const CBamAlignIterator* iter) {
832  string seq = iter->GetShortSequence();
833  if ( iter->IsSetStrand() && IsReverse(iter->GetStrand()) ) {
835  }
836  CFastMutexGuard guard(m_Mutex);
837  SShortSeqInfo& info = m_ShortSeqs[short_id];
838  if ( info.spot1.empty() ) {
839  info.spot1 = seq;
840  short_id += ".1";
841  return;
842  }
843  if ( info.spot1 == seq ) {
844  short_id += ".1";
845  return;
846  }
847  if ( info.spot2.empty() ) {
848  info.spot2 = seq;
849  short_id += ".2";
850  return;
851  }
852  if ( info.spot2 == seq ) {
853  short_id += ".2";
854  return;
855  }
856  short_id += ".?";
857  }
858 
859 private:
861 
862  struct SShortSeqInfo {
863  string spot1, spot2;
864  };
866 };
867 
869  const string& refseqid,
870  const CSeq_id_Handle& seq_id)
871  : m_File(bam_file),
872  m_RefSeqId(refseqid),
873  m_RefSeq_id(seq_id),
874  m_MinMapQuality(GetMinMapQualityParam()),
875  m_LoadedRanges(false)
876 {
878 }
879 
880 
882  const CSeq_id_Handle& idh) const
883 {
885  if ( ret ) {
886  ERR_POST_X(1, "CBAMDataLoader::GetBlobId: "
887  "Seq-id "<<idh<<" appears in two files: "
888  <<ret->ToString()<<" & "<<id->ToString());
889  }
890  else {
891  ret = id;
892  }
893 }
894 
895 
897  const CSeq_id_Handle& idh) const
898 {
899  bool exists;
900  {{
902  exists = m_Seq2Chunk.find(idh) != m_Seq2Chunk.end();
903  }}
904  if ( exists ) {
905  SetBlobId(ret, idh);
906  }
907 }
908 
909 
910 namespace {
911  struct SRefStat {
912  SRefStat(void)
913  : m_RefPosQuery(0),
914  m_Count(0),
915  m_RefPosFirst(0),
916  m_RefPosLast(0),
917  m_RefPosMax(0),
918  m_RefLenMax(0)
919  {
920  }
921 
922  TSeqPos m_RefPosQuery;
923  unsigned m_Count;
924  TSeqPos m_RefPosFirst;
925  TSeqPos m_RefPosLast;
926  TSeqPos m_RefPosMax;
927  TSeqPos m_RefLenMax;
928 
929  void Collect(const CBamDb& bam_db, const string& ref_id,
930  TSeqPos ref_pos, unsigned count, int min_quality);
931 
932  unsigned GetStatCount(void) const {
933  return m_Count;
934  }
935  double GetStatLen(void) const {
936  return m_RefPosLast - m_RefPosFirst + .5;
937  }
938  };
939 
940 
941  void SRefStat::Collect(const CBamDb& bam_db, const string& ref_id,
942  TSeqPos ref_pos, unsigned count, int min_quality)
943  {
944  m_RefPosQuery = ref_pos;
945  size_t skipped = 0;
946  TSeqPos ref_len = bam_db.GetRefSeqLength(ref_id);
947  CBamAlignIterator ait(bam_db, ref_id, ref_pos);
948  for ( ; ait; ++ait ) {
949  TSeqPos pos = ait.GetRefSeqPos();
950  if ( pos < ref_pos ) {
951  // the alignment starts before current range
952  continue;
953  }
954  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
955  ++skipped;
956  continue;
957  }
958  TSeqPos len = ait.GetCIGARRefSize();
959  TSeqPos max = pos + len;
960  if ( max > ref_len ) {
961  ++skipped;
962  continue;
963  }
964  m_RefPosLast = pos;
965  if ( len > m_RefLenMax ) {
966  m_RefLenMax = len;
967  }
968  if ( max > m_RefPosMax ) {
969  m_RefPosMax = max;
970  }
971  if ( m_Count == 0 ) {
972  m_RefPosFirst = pos;
973  }
974  if ( ++m_Count == count ) {
975  break;
976  }
977  }
978  if ( GetDebugLevel() >= 3 ) {
979  LOG_POST_X(4, Info << "CBAMDataLoader: "
980  "Stat @ "<<m_RefPosQuery<<": "<<m_Count<<" entries: "<<
981  m_RefPosFirst<<"-"<<m_RefPosLast<<
982  "(+"<<m_RefPosMax-m_RefPosLast<<")"<<
983  " max len: "<<m_RefLenMax<<
984  " skipped: "<<skipped);
985  }
986  }
987 };
988 
989 
991 {
992  if ( m_LoadedRanges ) {
993  return;
994  }
995  _TRACE("Loading "<<GetRefSeqId()<<" -> "<<GetRefSeq_id());
996  if ( !x_LoadRangesCov() && !x_LoadRangesEstimated() ) {
998  }
999  _TRACE("Loaded ranges on "<<GetRefSeqId());
1000  m_LoadedRanges = true;
1001 }
1002 
1003 
1004 static const CUser_field& GetIdField(const CUser_field& field, int id)
1005 {
1007  const CUser_field& field = **it;
1008  if ( field.IsSetLabel() &&
1009  field.GetLabel().IsId() &&
1010  field.GetLabel().GetId() == id ) {
1011  return field;
1012  }
1013  }
1014  NCBI_THROW_FMT(CLoaderException, eOtherError, "CBAMDataLoader: "
1015  "outlier value not found: "<<id);
1016 }
1017 
1018 
1020 {
1021  if ( m_CovFileName.empty() ) {
1022  return false;
1023  }
1025  // use more precise index information
1026  return false;
1027  }
1028  try {
1029  CRef<CSeq_entry> entry(new CSeq_entry);
1030  CRef<CSeq_annot> annot;
1031  CRef<CSeq_graph> graph;
1033  m_CovFileName));
1034  *in >> *entry;
1035  const CBioseq::TAnnot& alist = entry->GetSeq().GetAnnot();
1036  if ( alist.size() != 1 ) {
1037  NCBI_THROW_FMT(CLoaderException, eOtherError, "CBAMDataLoader: "
1038  "wrong number of annots in cov entry: "<<
1039  alist.size());
1040  }
1041  annot = alist.front();
1042  const CSeq_annot::TData::TGraph& glist = annot->GetData().GetGraph();
1043  if ( glist.size() != 1 ) {
1044  NCBI_THROW_FMT(CLoaderException, eOtherError, "CBAMDataLoader: "
1045  "wrong number of graphs in cov entry: "<<
1046  glist.size());
1047  }
1048  graph = glist.front();
1049 
1050  CConstRef<CUser_object> params;
1051  ITERATE ( CAnnot_descr::Tdata, it, annot->GetDesc().Get() ) {
1052  if ( (*it)->IsUser() &&
1053  (*it)->GetUser().GetType().GetStr() == "BAM coverage" ) {
1054  params = &(*it)->GetUser();
1055  break;
1056  }
1057  }
1058 
1059  TSeqPos slot = graph->GetComp();
1060  TSeqPos cnt = graph->GetNumval();
1061  CConstRef<CUser_field> outliers = params->GetFieldRef("Outliers");
1062  double vmul = graph->GetA();
1063  double vadd = graph->GetB();
1064  double outliers_mul = 1./slot;
1065 
1066  size_t non_zero_count = 0;
1067  vector<double> cov(cnt);
1068  if ( graph->GetGraph().IsByte() ) {
1069  const CByte_graph& g = graph->GetGraph().GetByte();
1070  const CByte_graph::TValues& vv = g.GetValues();
1071  int vmin = g.GetMin();
1072  int vmax = g.GetMax();
1073  for ( TSeqPos i = 0; i < cnt; ++i ) {
1074  double v;
1075  int vg = Uint1(vv[i]);
1076  if ( vg < vmin ) {
1077  continue;
1078  }
1079  if ( vg > vmax ) {
1080  v = GetIdField(*outliers, i).GetData().GetReal()*outliers_mul;
1081  }
1082  else {
1083  v = vmul*vg+vadd;
1084  }
1085  cov[i] = v;
1086  ++non_zero_count;
1087  }
1088  }
1089  else {
1090  const CInt_graph& g = graph->GetGraph().GetInt();
1091  const CInt_graph::TValues& vv = g.GetValues();
1092  int vmin = g.GetMin();
1093  int vmax = g.GetMax();
1094  for ( TSeqPos i = 0; i < cnt; ++i ) {
1095  double v;
1096  int vg = vv[i];
1097  if ( vg < vmin ) {
1098  continue;
1099  }
1100  if ( vg > vmax ) {
1101  v = GetIdField(*outliers, i).GetData().GetReal()*outliers_mul;
1102  }
1103  else {
1104  v = vmul*vg+vadd;
1105  }
1106  cov[i] = v;
1107  ++non_zero_count;
1108  }
1109  }
1110 
1111  m_MinMapQuality = params->GetField("MinMapQuality").GetData().GetInt();
1112  int align_cnt = params->GetField("AlignCount").GetData().GetInt();
1113  double avg_cov = params->GetField("AvgCoverage").GetData().GetReal();
1114  double total_cov = avg_cov*non_zero_count*slot;
1115  double avg_align_len = total_cov/align_cnt;
1116  int max_align_len;
1117  CConstRef<CUser_field> len_field = params->GetFieldRef("MaxAlignSpan");
1118  if ( len_field ) {
1119  max_align_len = len_field->GetData().GetInt();
1120  }
1121  else {
1122  max_align_len = int(avg_align_len*2+50);
1123  }
1124  double cov_to_align_cnt = (align_cnt*slot)/total_cov;
1125 
1126  TSeqPos cur_first = kInvalidSeqPos, cur_last = kInvalidSeqPos;
1127  double cur_cnt = 0;
1128 
1129  for ( TSeqPos i = 0; i <= cnt; ++i ) {
1130  bool empty = i==cnt || !cov[i];
1131  double next_cnt = i==cnt? 0: cov[i] * cov_to_align_cnt;
1132  if ( cur_first != kInvalidSeqPos &&
1133  (i == cnt ||
1134  cur_cnt >= kChunkSize*1.5 ||
1135  (next_cnt >= kChunkSize*2 && cur_cnt >= kChunkSize*.7) ||
1136  (empty && cur_cnt >= kChunkSize)) ) {
1137  // flush collected slots
1138  if ( GetDebugLevel() >= 3 ) {
1139  LOG_POST_X(8, Info << "CBAMDataLoader:"
1140  " Chunk "<<m_Chunks.size()<<
1141  " Slots "<<cur_first<<"-"<<cur_last<<
1142  " exp: "<<cur_cnt);
1143  }
1144  CBamRefSeqChunkInfo chunk;
1145  chunk.m_AlignCount = Uint8(cur_cnt+1);
1146  chunk.m_RefSeqRange.SetFrom(cur_first*slot);
1147  TSeqPos end = cur_last*slot+slot;
1148  chunk.m_RefSeqRange.SetToOpen(end+max_align_len);
1149  chunk.m_MaxRefSeqFrom = end-1;
1150  m_Chunks.push_back(chunk);
1151  cur_first = kInvalidSeqPos;
1152  cur_last = kInvalidSeqPos;
1153  cur_cnt = 0;
1154  }
1155  if ( empty ) continue;
1156  if ( cur_first == kInvalidSeqPos ) {
1157  cur_first = i;
1158  }
1159  cur_last = i;
1160  cur_cnt += max(next_cnt, 1e-9);
1161  }
1162 
1163  m_CovEntry = entry;
1164  return true;
1165  }
1166  catch ( CException& exc ) {
1167  ERR_POST_X(3, "CBAMDataLoader: "
1168  "failed to load cov file: "<<m_CovFileName<<": "<<exc);
1169  return false;
1170  }
1171 }
1172 
1173 
1174 static inline
1175 TSeqPos s_GetEnd(const vector<TSeqPos>& over_ends, TSeqPos i, TSeqPos bin_size)
1176 {
1177  return i < over_ends.size()? over_ends[i]: i*bin_size+bin_size-1;
1178 }
1179 
1180 
1182 {
1183  if ( !m_File->GetBamDb().UsesRawIndex() ) {
1184  return false;
1185  }
1186 
1187  CBamRawDb& raw_db = m_File->GetBamDb().GetRawDb();
1188  auto refseq_index = raw_db.GetRefIndex(GetRefSeqId());
1189  auto& refseq = raw_db.GetIndex().GetRef(refseq_index);
1190  vector<Uint8> data_sizes = refseq.EstimateDataSizeByAlnStartPos(raw_db.GetRefSeqLength(refseq_index));
1191  vector<Uint4> over_ends = refseq.GetAlnOverEnds();
1192  TSeqPos bin_count = TSeqPos(data_sizes.size());
1193  TSeqPos bin_size = raw_db.GetIndex().GetMinBinSize();
1194  if ( GetDebugLevel() >= 2 ) {
1195  LOG_POST_X(26, Info<<"CBAMDataLoader:"
1196  " Bin size: "<<bin_size<<
1197  " count: "<<data_sizes.size()<<
1198  " length: "<<(bin_size*data_sizes.size()));
1199  LOG_POST_X(26, Info<<"CBAMDataLoader:"
1200  " Total cov: "<<accumulate(data_sizes.begin(), data_sizes.end(), Uint8(0)));
1201  }
1202  static const TSeqPos kZeroBlocks = 8;
1203  static const TSeqPos kMaxChunkLength = 300*1024*1024;
1204 
1205  m_Chunks.clear();
1206  TSeqPos last_pos = 0;
1207  TSeqPos zero_count = 0;
1208  Uint8 cur_data_size = 0;
1209  bool has_pileup = GetPileupGraphsParam();
1210  Uint8 split_bin_data_size = has_pileup? GetSplitBinDataSize(): 0;
1211  TSeqPos split_bin_min_length = has_pileup? GetSplitBinMinLength(): 0;
1212  if ( split_bin_data_size == 0 || split_bin_min_length == 0 ) {
1213  split_bin_data_size = kMax_UI8;
1214  split_bin_min_length = kInvalidSeqPos;
1215  }
1216  for ( TSeqPos i = 0; i <= bin_count; ++i ) {
1217  if ( i < bin_count && !data_sizes[i] ) {
1218  ++zero_count;
1219  continue;
1220  }
1221  TSeqPos pos = i*bin_size;
1222  if ( i == bin_count || zero_count >= kZeroBlocks ) {
1223  // add non-empty chunk at last_pos
1224  if ( cur_data_size ) {
1225  _ASSERT(i > zero_count);
1226  _ASSERT(last_pos < pos-zero_count*bin_size);
1227  _ASSERT(cur_data_size > 0);
1228  TSeqPos non_zero_end = pos - zero_count*bin_size;
1230  info.m_DataSize = cur_data_size;
1231  info.m_RefSeqRange.SetFrom(last_pos);
1232  info.m_RefSeqRange.SetTo(s_GetEnd(over_ends, i-zero_count-1, bin_size));
1233  info.m_MaxRefSeqFrom = non_zero_end-1;
1234  if ( GetDebugLevel() >= 3 ) {
1235  LOG_POST_X(23, Info << "CBAMDataLoader:"
1236  " Chunk "<<m_Chunks.size()<<
1237  " Range "<<info.m_RefSeqRange.GetFrom()<<"-"<<info.m_MaxRefSeqFrom<<
1238  " (.."<<info.m_RefSeqRange.GetTo()<<")"
1239  " size: "<<info.m_DataSize);
1240  }
1241  _ASSERT(info.m_RefSeqRange.GetLength() > 0);
1242  m_Chunks.push_back(info);
1243 
1244  last_pos = non_zero_end;
1245  cur_data_size = 0;
1246  }
1247  // add remaining empty chunk
1248  if ( zero_count > 0 ) {
1249  _ASSERT(i > 0);
1250  _ASSERT(zero_count > 0);
1251  _ASSERT(last_pos == pos-zero_count*bin_size);
1252  _ASSERT(cur_data_size == 0);
1254  info.m_DataSize = 0;
1255  info.m_RefSeqRange.SetFrom(last_pos);
1256  info.m_RefSeqRange.SetTo(s_GetEnd(over_ends, i-zero_count, bin_size));
1257  info.m_MaxRefSeqFrom = last_pos;
1258  if ( GetDebugLevel() >= 3 ) {
1259  LOG_POST_X(24, Info << "CBAMDataLoader:"
1260  " Chunk "<<m_Chunks.size()<<
1261  " Range "<<info.m_RefSeqRange.GetFrom()<<"-"<<info.m_MaxRefSeqFrom<<
1262  " (.."<<info.m_RefSeqRange.GetTo()<<")"
1263  " size: "<<info.m_DataSize);
1264  }
1265  _ASSERT(info.m_RefSeqRange.GetLength() > 0);
1266  m_Chunks.push_back(info);
1267 
1268  last_pos = pos;
1269  zero_count = 0;
1270  }
1271  }
1272  if ( i == bin_count ) {
1273  break;
1274  }
1275  zero_count = 0;
1276  cur_data_size += data_sizes[i];
1277  if ( cur_data_size >= kChunkDataSize ||
1278  pos+bin_size-last_pos >= kMaxChunkLength ||
1279  (i+1 < bin_count && data_sizes[i+1] > split_bin_data_size) ) {
1280  if ( has_pileup &&
1281  data_sizes[i] > split_bin_data_size &&
1282  bin_size > split_bin_min_length ) {
1283  // special split to sub-page size
1284  _ASSERT(last_pos == pos);
1285  _ASSERT(cur_data_size == data_sizes[i]);
1286  int split_shift = 0;
1287  while ( (cur_data_size >> split_shift) > split_bin_data_size &&
1288  (bin_size >> split_shift) > split_bin_min_length ) {
1289  ++split_shift;
1290  }
1291  int sub_chunk_count = 1 << split_shift;
1292  auto sub_chunk_data_size = cur_data_size >> split_shift;
1293  TSeqPos sub_chunk_len = bin_size >> split_shift;
1294  TSeqPos ref_end = s_GetEnd(over_ends, i, bin_size);
1295  if ( GetDebugLevel() >= 3 ) {
1296  LOG_POST_X(27, Info << "CBAMDataLoader:"
1297  " Huge Chunk "<<m_Chunks.size()<<
1298  " Range "<<last_pos<<"-"<<(pos+bin_size)<<
1299  " (.."<<ref_end<<")"<<
1300  " size: "<<cur_data_size);
1301  }
1302  for ( int i = 0; i < sub_chunk_count; ++i ) {
1304  info.m_DataSize = sub_chunk_data_size;
1305  info.m_RefSeqRange.SetFrom(last_pos+i*sub_chunk_len);
1306  info.m_MaxRefSeqFrom = pos+(i+1)*sub_chunk_len-1;
1307  info.m_RefSeqRange.SetTo(i==0? ref_end: info.m_MaxRefSeqFrom);
1308  info.m_PileupChunkCount = i==0? sub_chunk_count: 0;
1309  if ( GetDebugLevel() >= 3 ) {
1310  LOG_POST_X(28, Info << "CBAMDataLoader:"
1311  " Huge Chunk "<<m_Chunks.size()<<
1312  " Range "<<info.m_RefSeqRange.GetFrom()<<"-"<<info.m_MaxRefSeqFrom<<
1313  " (.."<<info.m_RefSeqRange.GetTo()<<")"
1314  " size: "<<info.m_DataSize);
1315  }
1316  _ASSERT(info.m_RefSeqRange.GetLength() > 0);
1317  m_Chunks.push_back(info);
1318  }
1319  last_pos = pos+bin_size;
1320  cur_data_size = 0;
1321  }
1322  else {
1323  // add chunk from last_pos to pos
1324  _ASSERT(last_pos <= pos);
1325  _ASSERT(cur_data_size > 0);
1327  info.m_DataSize = cur_data_size;
1328  info.m_RefSeqRange.SetFrom(last_pos);
1329  info.m_RefSeqRange.SetTo(s_GetEnd(over_ends, i, bin_size));
1330  info.m_MaxRefSeqFrom = pos+bin_size-1;
1331  if ( GetDebugLevel() >= 3 ) {
1332  LOG_POST_X(25, Info << "CBAMDataLoader:"
1333  " Chunk "<<m_Chunks.size()<<
1334  " Range "<<info.m_RefSeqRange.GetFrom()<<"-"<<info.m_MaxRefSeqFrom<<
1335  " (.."<<info.m_RefSeqRange.GetTo()<<")"
1336  " size: "<<info.m_DataSize);
1337  }
1338  _ASSERT(info.m_RefSeqRange.GetLength() > 0);
1339  m_Chunks.push_back(info);
1340  last_pos = pos+bin_size;
1341  cur_data_size = 0;
1342  }
1343  }
1344  }
1345 
1347  CRef<CSeq_entry> entry(new CSeq_entry);
1348  entry->SetSet().SetSeq_set();
1349  CBam2Seq_graph cvt;
1350  cvt.SetRefLabel(GetRefSeqId());
1351  cvt.SetRefId(*GetRefSeq_id().GetSeqId());
1353  cvt.SetEstimated();
1355  entry->SetAnnot().push_back(cvt.MakeSeq_annot(raw_db, m_File->GetBamName()));
1356  m_CovEntry = entry;
1357  }
1358  return true;
1359 }
1360 
1361 
1363 {
// Builds m_Chunks for this reference sequence from sampled statistics.
// A first sample is taken at position 0. If it yields a count different from
// kStatCount, at most one chunk is stored, m_LoadedRanges is set and the
// function returns. Otherwise further samples are taken across the reference,
// an expected alignment count is derived from the sampled density, and the
// span ref_begin..ref_end is cut into `chunks` pieces of equal length.
// NOTE(review): SRefStat::Collect is not visible in this section; presumably
// it gathers up to kStatCount alignments starting at the given position.
1364  const unsigned kNumStat = 10;
1365  const unsigned kStatCount = 1000;
1366  vector<SRefStat> stat(kNumStat);
1367  TSeqPos ref_begin = 0, ref_end_min = 0, ref_end = 0, max_len = 0;
1368  double stat_len = 0, stat_cnt = 0;
1369  const unsigned scan_first = 1;
1370  if ( scan_first ) {
1371  stat[0].Collect(*m_File, GetRefSeqId(), 0,
1372  kStatCount, m_MinMapQuality);
1373  if ( stat[0].m_Count != kStatCount ) {
1374  // single chunk
1375  if ( stat[0].m_Count > 0 ) {
1376  CBamRefSeqChunkInfo chunk;
1377  chunk.m_AlignCount = stat[0].m_Count;
1378  chunk.m_RefSeqRange.SetFrom(stat[0].m_RefPosFirst);
1379  chunk.m_RefSeqRange.SetToOpen(stat[0].m_RefPosMax);
1380  chunk.m_MaxRefSeqFrom = stat[0].m_RefPosLast;
1381  m_Chunks.push_back(chunk);
1382  }
1383  m_LoadedRanges = true;
1384  return;
1385  }
// The first sample was full: seed the running totals from it.
1386  ref_begin = stat[0].m_RefPosFirst;
1387  ref_end_min = stat[0].m_RefPosLast;
1388  max_len = stat[0].m_RefLenMax;
1389  stat_len = stat[0].GetStatLen();
1390  stat_cnt = stat[0].GetStatCount();
1391  }
1392  ref_end = m_File->GetRefSeqLength(GetRefSeqId());
1393  if ( ref_end == kInvalidSeqPos ) {
// Reference length is unknown: bisect between `min` and `max`, probing with
// a CBamAlignIterator at the midpoint, until the gap is within max_len+1.
// NOTE(review): the declaration of `max` is not present in the visible text.
1394  TSeqPos min = ref_end_min;
1396  while ( max > min+max_len+1 ) {
1397  TSeqPos mid = min + (max - min)/2;
1398  _TRACE("binary: "<<min<<"-"<<max<<" -> "<<mid);
1399  if ( CBamAlignIterator(*m_File, GetRefSeqId(), mid) ) {
1400  min = mid;
1401  }
1402  else {
1403  max = mid;
1404  }
1405  }
1406  ref_end = max;
1407  _TRACE("binary: end: "<<max);
1408  }
// Take the remaining samples at evenly spaced positions, never starting
// before the last position reported by the previous sample.
1409  for ( unsigned k = scan_first; k < kNumStat; ++k ) {
1410  TSeqPos ref_pos = ref_begin +
1411  TSeqPos(double(ref_end - ref_begin)*k/kNumStat);
1412  if ( k && ref_pos < stat[k-1].m_RefPosLast ) {
1413  ref_pos = stat[k-1].m_RefPosLast;
1414  }
1415  _TRACE("stat["<<k<<"] @ "<<ref_pos);
1416  stat[k].Collect(*m_File, GetRefSeqId(), ref_pos,
1417  kStatCount, m_MinMapQuality);
1418  stat_len += stat[k].GetStatLen();
1419  stat_cnt += stat[k].GetStatCount();
1420  if ( stat[k].m_RefLenMax > max_len ) {
1421  max_len = stat[k].m_RefLenMax;
1422  }
1423  }
// NOTE(review): stat_len is not checked for zero before this division.
1424  double density = stat_cnt / stat_len;
1425  double exp_count = (ref_end-ref_begin)*density;
// Chunk count: one per kChunkSize expected alignments, capped at
// sqrt(exp_count)+1.
1426  unsigned chunks = unsigned(exp_count/kChunkSize+1);
1427  chunks = min(chunks, unsigned(sqrt(exp_count)+1));
// The doubled maximum length is used below as the overhang allowance.
1428  max_len *= 2;
1429  if ( GetDebugLevel() >= 2 ) {
1430  LOG_POST_X(5, Info << "CBAMDataLoader: "
1431  "Total range: "<<ref_begin<<"-"<<ref_end-1<<
1432  " exp count: "<<exp_count<<" chunks: "<<chunks);
1433  }
// pp[k] is the start of chunk k and pp[chunks] is ref_end; pp[0] keeps the
// vector's value-initialised 0.
1434  vector<TSeqPos> pp(chunks+1);
1435  for ( unsigned k = 1; k < chunks; ++k ) {
1436  TSeqPos pos = ref_begin +
1437  TSeqPos(double(ref_end-ref_begin)*k/chunks);
1438  pp[k] = pos;
1439  }
1440  pp[chunks] = ref_end;
1441  for ( unsigned k = 0; k < chunks; ++k ) {
1442  CBamRefSeqChunkInfo chunk;
1443  chunk.m_AlignCount = 1;
1444  TSeqPos pos = pp[k];
1445  TSeqPos end = pp[k+1];
1446  chunk.m_RefSeqRange.SetFrom(pos);
// The open end extends past `end` by max_len, limited by ref_end and, when a
// following chunk exists, by pp[k+2].
1447  TSeqPos end2 = min(end+max_len, ref_end);
1448  if ( k+1 < chunks ) {
1449  end2 = min(end2, pp[k+2]);
1450  }
1451  chunk.m_RefSeqRange.SetToOpen(end2);
1452  chunk.m_MaxRefSeqFrom = end-1;
1453  m_Chunks.push_back(chunk);
1454  }
1455 }
1456 
1457 
1459 {
1461  vector<TRange> rr;
1462  int min_quality = m_MinMapQuality;
1463  TSeqPos ref_len = m_File->GetRefSeqLength(GetRefSeqId());
1464  for ( CBamAlignIterator ait(*m_File, GetRefSeqId(), 0); ait; ++ait ) {
1465  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
1466  continue;
1467  }
1468  TSeqPos ref_pos = ait.GetRefSeqPos();
1469  TSeqPos ref_end = ref_pos + ait.GetCIGARRefSize() - 1;
1470  if ( ref_end > ref_len ) {
1471  continue;
1472  }
1473  rr.push_back(TRange(ref_pos, ref_end));
1474  }
1475  if ( !rr.empty() ) {
1476  sort(rr.begin(), rr.end());
1477  for ( size_t p = 0; p < rr.size(); ) {
1478  size_t e = min(p+kChunkSize, rr.size())-1;
1479  TSeqPos min_from = rr[p].GetFrom();
1480  TSeqPos max_from = rr[e++].GetFrom();
1481  while ( e < rr.size() && rr[e].GetFrom() == max_from ) {
1482  ++e;
1483  }
1484  TSeqPos max_to_open = max_from;
1485  for ( size_t i = p; i < e; ++i ) {
1486  max_to_open = max(max_to_open, rr[i].GetToOpen());
1487  }
1488  CBamRefSeqChunkInfo chunk;
1489  chunk.AddRefSeqRange(TRange(min_from, max_to_open-1));
1490  chunk.AddRefSeqRange(TRange(max_from, max_to_open-1));
1491  _TRACE("Chunk "<<m_Chunks.size()<<" count: "<<e-p<<
1492  " "<<TRange(min_from, max_from)<<" "<<max_to_open-1);
1493  m_Chunks.push_back(chunk);
1494  p = e;
1495  }
1496  if ( GetDebugLevel() >= 2 ) {
1497  LOG_POST_X(6, Info<<"CBAMDataLoader: "
1498  "Total range: "<<
1499  rr[0].GetFrom()<<"-"<<rr.back().GetTo()<<
1500  " count: "<<rr.size()<<" chunks: "<<m_Chunks.size());
1501  }
1502  }
1503 }
1504 
1505 
1507 {
// Returns `range` for the chunk at range_id, adjusted as follows.
// If the chunk's m_PileupChunkCount is non-zero, the open end is moved to the
// open end of the chunk at range_id + count - 1. If any chunk follows that
// one, the open end is then set to the start of the following chunk.
// NOTE(review): the declaration of `range` is not present in the visible text.
1508  auto& chunk = m_Chunks[range_id];
1510  auto chunk_count = chunk.m_PileupChunkCount;
1511  auto last_range_id = range_id;
1512  if ( chunk_count ) {
1513  last_range_id += chunk_count-1;
1514  range.SetToOpen(m_Chunks[last_range_id].GetRefSeqRange().GetToOpen());
1515  }
1516  if ( last_range_id+1 < m_Chunks.size() ) {
1517  range.SetToOpen(m_Chunks[last_range_id+1].GetRefSeqRange().GetFrom());
1518  }
1519  return range;
1520 }
1521 
1522 
1524 {
1525  LoadRanges();
1526  CRef<CSeq_entry> entry(new CSeq_entry);
1527  entry->SetSet().SetId().SetId(kTSEId);
1528  if ( m_CovEntry ) {
1529  entry->SetSet().SetAnnot() = m_CovEntry->GetAnnot();
1530  }
1531  load_lock->SetSeq_entry(*entry);
1532  CreateChunks(load_lock->GetSplitInfo());
1533 }
1534 
1535 
1537 {
// Installs an empty top-level Bioseq-set (id kTSEId) into load_lock and
// registers one chunk with the split info, declaring on it the annotation
// types that can be found on this reference sequence over whole_range.
// NOTE(review): the lines creating `chunk` and `whole_range`, and the annot
// type arguments of the x_AddAnnotType calls, are not present in the visible text.
1538  CStopWatch sw;
1539  if ( GetDebugLevel() >= 1 ) {
1540  sw.Start();
1541  }
1542  CRef<CSeq_entry> entry(new CSeq_entry);
1543  entry->SetSet().SetId().SetId(kTSEId);
1544  load_lock->SetSeq_entry(*entry);
1545  CTSE_Split_Info& split_info = load_lock->GetSplitInfo();
1546  bool has_pileup = GetPileupGraphsParam();
// Annotation names stay default-constructed when the file has no annot name;
// the pileup name is the base name plus a space and PILEUP_NAME_SUFFIX.
1547  CAnnotName name, pileup_name;
1548  if ( !m_File->GetAnnotName().empty() ) {
1549  string base = m_File->GetAnnotName();
1550  name = CAnnotName(base);
1551  if ( has_pileup ) {
1552  pileup_name = CAnnotName(base + ' ' + PILEUP_NAME_SUFFIX);
1553  }
1554  }
1555 
1558  if ( m_CovEntry || !m_CovFileName.empty() ||
1560  chunk->x_AddAnnotType(name,
1562  GetRefSeq_id(),
1563  whole_range);
1564  }
1565  if ( has_pileup ) {
1566  chunk->x_AddAnnotType(pileup_name,
1568  GetRefSeq_id(),
1569  whole_range);
1570  }
1571  chunk->x_AddAnnotType(name,
1573  GetRefSeq_id(),
1574  whole_range);
1575  split_info.AddChunk(*chunk);
1576  if ( GetDebugLevel() >= 1 ) {
1577  LOG_POST_X(17, Info<<"CBAMDataLoader: "
1578  "Initialized BAM refseq "<<GetRefSeq_id()<<" in "<<sw.Elapsed());
1579  }
1580 }
1581 
1582 
1584 {
// Loads the main chunk: computes the ranges, attaches the annotations of the
// coverage entry (when there is one) to chunk_info, registers the remaining
// chunks with the split info, and marks chunk_info as loaded.
// NOTE(review): the loop header that defines `place` and `it` is not present
// in the visible text.
1585  LoadRanges();
1586  if ( m_CovEntry ) {
1589  chunk_info.x_LoadAnnot(place, **it);
1590  }
1591  }
// GetSplitInfo() is cast to a non-const reference because CreateChunks adds
// chunks to it.
1592  CreateChunks(const_cast<CTSE_Split_Info&>(chunk_info.GetSplitInfo()));
1593  chunk_info.SetLoaded();
1594 }
1595 
1596 
// Fixed processing cost in seconds per byte, added to the per-byte data read
// cost in the Estimate*LoadSeconds functions. The trailing figures are the
// equivalent throughput (1 / seconds-per-byte).
1597 static const double k_make_graph_seconds = 7.5e-9; // 133 MB/s
1598 static const double k_make_align_seconds = 250e-9; // 4 MB/s
1599 static const double k_make_read_seconds = 80e-9; // 12 MB/s
1600 
1601 
1603 {
// Registers with split_info the chunks for every entry of m_Chunks:
//  - a pileup chunk when pileup graphs are enabled and the entry has a
//    non-zero m_PileupChunkCount (plus an extra alignment chunk when that
//    count is greater than 1 and the computed byte size is non-zero);
//  - one or two alignment chunks when the entry has alignments.
// m_Seq2Chunk is cleared at the end.
// NOTE(review): the statements that construct each `chunk`, the annot type
// arguments, and the trailing arguments of the CBamFileRangeSet expressions
// are not present in the visible text.
1604  bool has_pileup = GetPileupGraphsParam();
1605  CAnnotName name, pileup_name;
1606  if ( !m_File->GetAnnotName().empty() ) {
1607  string base = m_File->GetAnnotName();
1608  name = CAnnotName(base);
1609  if ( has_pileup ) {
1610  pileup_name = CAnnotName(base + ' ' + PILEUP_NAME_SUFFIX);
1611  }
1612  }
1613 
// raw_db stays null (and refseq_index stays size_t(-1)) when the BAM database
// does not use the raw index.
1614  CBamRawDb* raw_db = 0;
1615  size_t refseq_index = size_t(-1);
1616  if ( m_File->GetBamDb().UsesRawIndex() ) {
1617  raw_db = &m_File->GetBamDb().GetRawDb();
1618  refseq_index = raw_db->GetRefIndex(GetRefSeqId());
1619  }
1620 
1621  // create chunk info for alignments
1622  for ( size_t range_id = 0; range_id < m_Chunks.size(); ++range_id ) {
1623  int base_id = int(range_id*kChunkIdMul);
1624  auto align_count = m_Chunks[range_id].GetAlignCount();
1625  auto data_size = m_Chunks[range_id].m_DataSize;
// Only one of align_count / data_size may be known; derive the other one
// using kSingleAlignBytes as the per-alignment size.
1626  if ( align_count == 0 && data_size != 0 ) {
1627  align_count = data_size / kSingleAlignBytes + 1;
1628  }
1629  else if ( data_size == 0 && align_count != 0 ) {
1630  data_size = align_count * kSingleAlignBytes;
1631  }
1632  CRange<TSeqPos> wide_range = m_Chunks[range_id].GetAlignRange(); // includes overhang after this range
1633  if ( has_pileup && m_Chunks[range_id].m_PileupChunkCount ) {
1634  _ASSERT(raw_db);
1636  CRange<TSeqPos> pileup_range = GetChunkGraphRange(range_id);
1637  {{
// Load size is the per-entry data size times the pileup chunk count,
// clamped to kMax_UI4 when stored.
1638  Uint8 bytes = data_size*m_Chunks[range_id].m_PileupChunkCount;
1639  chunk->x_SetLoadBytes(Uint4(min<size_t>(bytes, kMax_UI4)));
1640  chunk->x_AddAnnotType(pileup_name,
1642  GetRefSeq_id(),
1643  pileup_range);
1644  if ( GetDebugLevel() >= 2 ) {
1645  LOG_POST_X(13, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1646  "Pileup Chunk id="<<chunk->GetChunkId()<<": "<<pileup_range<<
1647  " with "<<bytes<<" bytes");
1648  }
1649  if ( m_Chunks[range_id].m_PileupChunkCount > 1 ) {
1650  // include annots crossing sub-range borders
1651  chunk->x_AddAnnotType(name,
1653  GetRefSeq_id(),
1654  pileup_range);
1655  if ( GetDebugLevel() >= 2 ) {
1656  LOG_POST_X(13, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1657  "Pileup Chunk id="<<chunk->GetChunkId()<<": aligns: "<<pileup_range);
1658  }
1659  }
1660  }}
1661  split_info.AddChunk(*chunk);
1662  if ( m_Chunks[range_id].m_PileupChunkCount > 1 ) {
1663  // add separate chunk for high index level overlapping aligns
1664  if ( Uint8 bytes = CBamFileRangeSet(raw_db->GetIndex(), refseq_index, pileup_range,
1668  chunk->x_SetLoadBytes(Uint4(min<size_t>(bytes, kMax_UI4)));
1669  //chunk->x_SetLoadSeconds(bytes*align_seconds);
1670  chunk->x_AddAnnotType(name,
1672  GetRefSeq_id(),
1673  wide_range);
1674  if ( GetDebugLevel() >= 2 ) {
1675  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1676  "Align Chunk id="<<chunk->GetChunkId()<<": "<<wide_range<<
1677  " with "<<bytes<<" bytes");
1678  }
1679  split_info.AddChunk(*chunk);
1680  }
1681  }
1682  }
1683  if ( align_count ) {
1684  CRange<TSeqPos> start_range = m_Chunks[range_id].GetAlignStartRange();
// Three layouts, chosen in this order:
//  1. m_PileupChunkCount != 1: a single chunk over start_range;
//  2. raw index in use and data_size >= kSplitLevelsChunkDataSize: up to two
//     chunks, over start_range and over wide_range;
//  3. otherwise: one chunk over wide_range.
1685  if ( m_Chunks[range_id].m_PileupChunkCount != 1 ) {
1686  // add single sub-page chunk for in-range aligns
1687  _ASSERT(raw_db);
1689  chunk->x_SetLoadBytes(Uint4(min<size_t>(data_size, kMax_UI4)));
1690  //chunk->x_SetLoadSeconds(bytes*align_seconds);
1691  chunk->x_AddAnnotType(name,
1693  GetRefSeq_id(),
1694  start_range);
1695  if ( GetDebugLevel() >= 2 ) {
1696  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1697  "Align sub-page Chunk id="<<chunk->GetChunkId()<<": "<<start_range<<
1698  " with "<<data_size<<" bytes");
1699  }
1700  split_info.AddChunk(*chunk);
1701  }
1702  else if ( raw_db && data_size >= kSplitLevelsChunkDataSize ) {
1703  // add two separate chunks for in-range and overlapping aligns
1704  if ( Uint8 bytes = CBamFileRangeSet(raw_db->GetIndex(), refseq_index, start_range,
1708  chunk->x_SetLoadBytes(Uint4(min<size_t>(bytes, kMax_UI4)));
1709  //chunk->x_SetLoadSeconds(bytes*align_seconds);
1710  chunk->x_AddAnnotType(name,
1712  GetRefSeq_id(),
1713  start_range);
1714  split_info.AddChunk(*chunk);
1715  if ( GetDebugLevel() >= 2 ) {
1716  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1717  "Align Chunk id="<<chunk->GetChunkId()<<": "<<start_range<<
1718  " with "<<bytes<<" bytes");
1719  }
1720  }
1721  if ( Uint8 bytes = CBamFileRangeSet(raw_db->GetIndex(), refseq_index, start_range,
1725  chunk->x_SetLoadBytes(Uint4(min<size_t>(bytes, kMax_UI4)));
1726  //chunk->x_SetLoadSeconds(bytes*align_seconds);
1727  chunk->x_AddAnnotType(name,
1729  GetRefSeq_id(),
1730  wide_range);
1731  split_info.AddChunk(*chunk);
1732  if ( GetDebugLevel() >= 2 ) {
1733  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1734  "Align Chunk id="<<chunk->GetChunkId()<<": "<<wide_range<<
1735  " with "<<bytes<<" bytes");
1736  }
1737  }
1738  }
1739  else {
1740  // add single chunk for in-range and overlapping aligns
1742  if ( raw_db ) {
// With a raw index the data size is recomputed from the index for start_range.
1743  data_size = CBamFileRangeSet(raw_db->GetIndex(), refseq_index, start_range,
1746  if ( GetDebugLevel() >= 2 ) {
1747  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1748  "Align Chunk id="<<chunk->GetChunkId()<<": "<<wide_range<<
1749  " with "<<data_size<<" bytes");
1750  }
1751  }
1752  else {
1753  if ( GetDebugLevel() >= 2 ) {
1754  LOG_POST_X(12, Info << "CBAMDataLoader: "<<GetRefSeq_id()<<": "
1755  "Align Chunk id="<<chunk->GetChunkId()<<": "<<wide_range<<
1756  " with "<<align_count<<" aligns");
1757  }
1758  }
1759  chunk->x_SetLoadBytes(Uint4(min<size_t>(data_size, kMax_UI4)));
1760  //chunk->x_SetLoadSeconds(bytes*align_seconds);
1761  chunk->x_AddAnnotType(name,
1763  GetRefSeq_id(),
1764  wide_range);
1765  split_info.AddChunk(*chunk);
1766  }
1767  }
1768  }
// Forget any short-read id to chunk assignments from an earlier layout.
// NOTE(review): one line inside this scope is not present in the visible text.
1769  {{
1771  m_Seq2Chunk.clear();
1772  }}
1773 }
1774 
1775 
1777  Uint4 bytes) const
1778 {
1779  CBamRawDb* raw_db = 0;
1780  if ( m_File->GetBamDb().UsesRawIndex() ) {
1781  raw_db = &m_File->GetBamDb().GetRawDb();
1782  }
1783  double get_data_seconds = raw_db? raw_db->GetEstimatedSecondsPerByte(): 0;
1784  return bytes*(get_data_seconds + k_make_align_seconds);
1785 }
1786 
1787 
1789  Uint4 bytes) const
1790 {
1791  CBamRawDb* raw_db = 0;
1792  if ( m_File->GetBamDb().UsesRawIndex() ) {
1793  raw_db = &m_File->GetBamDb().GetRawDb();
1794  }
1795  double get_data_seconds = raw_db? raw_db->GetEstimatedSecondsPerByte(): 0;
1796  return bytes*(get_data_seconds + k_make_graph_seconds);
1797 }
1798 
1799 
1801  Uint4 bytes) const
1802 {
1803  CBamRawDb* raw_db = 0;
1804  if ( m_File->GetBamDb().UsesRawIndex() ) {
1805  raw_db = &m_File->GetBamDb().GetRawDb();
1806  }
1807  double get_data_seconds = raw_db? raw_db->GetEstimatedSecondsPerByte(): 0;
1808  return bytes*(get_data_seconds + k_make_read_seconds);
1809 }
1810 
1811 
1813  Uint4 bytes) const
1814 {
// Dispatches on the chunk kind, taken as the chunk id modulo kChunkIdMul, to
// the matching estimator. Returns 0 for any kind not listed in the switch.
// NOTE(review): a line between the align and short-seq cases is not present
// in the visible text.
1815  switch ( chunk_info.GetChunkId() % kChunkIdMul ) {
1816  case eChunk_pileup_graph:
1817  return EstimatePileupLoadSeconds(chunk_info, bytes);
1818  case eChunk_align:
1819  case eChunk_align1:
1820  case eChunk_align2:
1821  return EstimateAlignLoadSeconds(chunk_info, bytes);
1823  case eChunk_short_seq:
1824  case eChunk_short_seq1:
1825  case eChunk_short_seq2:
1826  return EstimateSeqLoadSeconds(chunk_info, bytes);
1827  }
1828  return 0;
1829 }
1830 
1831 
1833 {
1834  if ( chunk_info.GetChunkId() == kMainChunkId ) {
1835  LoadMainChunk(chunk_info);
1836  return;
1837  }
1838  switch ( chunk_info.GetChunkId() % kChunkIdMul ) {
1839  case eChunk_pileup_graph:
1840  LoadPileupChunk(chunk_info);
1841  break;
1842  case eChunk_align:
1843  case eChunk_align1:
1844  case eChunk_align2:
1845  LoadAlignChunk(chunk_info);
1846  break;
1848  case eChunk_short_seq:
1849  case eChunk_short_seq1:
1850  case eChunk_short_seq2:
1851  LoadSeqChunk(chunk_info);
1852  break;
1853  }
1854 }
1855 
1856 
1858  TSeqPos& max_end_pos,
1859  CTSE_Chunk_Info& chunk_info,
1860  int base_id)
1861 {
// Prepares the alignment iterator and the end-position limit for the chunk.
// The range index is the chunk id divided by kChunkIdMul. The chunk kind is
// rebased from base_id onto the eChunk_align numbering, so the same code
// serves callers that pass eChunk_align or eChunk_short_seq as base_id.
// max_end_pos is an output: alignments ending beyond it are to be skipped by
// the caller; kInvalidSeqPos means no limit.
// NOTE(review): the statements that construct the iterator in each branch,
// and the body of the m_SpotIdDetector branch, are not present in the visible text.
1862  int range_id = chunk_info.GetChunkId() / kChunkIdMul;
1863  int sub_chunk = (chunk_info.GetChunkId() % kChunkIdMul) - base_id + eChunk_align;
1864  _ASSERT(sub_chunk >= eChunk_pileup_graph && sub_chunk <= eChunk_align2);
1865 #ifdef SKIP_TOO_LONG_ALIGNMENTS
1866  max_end_pos = m_File->GetRefSeqLength(GetRefSeqId());
1867 #else
1868  max_end_pos = kInvalidSeqPos;
1869 #endif
1870  if ( sub_chunk == eChunk_align1 ) {
1871  auto start_range = m_Chunks[range_id].GetAlignStartRange();
1873  start_range.GetFrom(), start_range.GetLength(),
1876  &m_Chunks[range_id].m_FilePosFirstStarting);
1877  // we also skip alignments that end beyond current region
1878  max_end_pos = min(max_end_pos, start_range.GetToOpen());
1879  }
1880  else if ( sub_chunk == eChunk_align2 ) {
// This kind iterates over the graph range of the chunk rather than over its
// alignment start range.
1881  auto start_range = GetChunkGraphRange(range_id);
1883  start_range.GetFrom(), start_range.GetLength(),
1886  }
1887  else {
1888  _ASSERT(sub_chunk == eChunk_align);
1889  auto start_range = m_Chunks[range_id].GetAlignStartRange();
1891  start_range.GetFrom(), start_range.GetLength(),
1893  }
1894  if ( m_SpotIdDetector ) {
1896  }
1897 }
1898 
1899 
1901  const vector<CSeq_id_Handle>& short_ids)
1902 {
// Creates the short-read chunk that accompanies an alignment chunk.
// The new chunk id keeps the range index of chunk_info and maps its kind from
// the eChunk_align numbering to the eChunk_short_seq numbering. Only ids whose
// m_Seq2Chunk entry refers to this new chunk are added: an id is entered into
// the map if absent, and ids already assigned to another chunk are left out.
// Nothing is created when no id remains.
1903  int range_id = chunk_info.GetChunkId() / kChunkIdMul;
1904  int sub_chunk = chunk_info.GetChunkId() % kChunkIdMul;
1905  _ASSERT(sub_chunk >= eChunk_pileup_graph && sub_chunk <= eChunk_align2);
1906  int seq_chunk_id = range_id*kChunkIdMul+eChunk_short_seq+(sub_chunk-eChunk_align);
1907  vector<CSeq_id_Handle> new_short_ids;
// NOTE(review): one line at the start of this scope is not present in the
// visible text.
1908  {{
1910  for ( auto& id : short_ids ) {
1911  if ( m_Seq2Chunk.insert(make_pair(id, seq_chunk_id)).first->second == seq_chunk_id ) {
1912  new_short_ids.push_back(id);
1913  }
1914  }
1915  }}
1916  if ( new_short_ids.empty() ) {
1917  return;
1918  }
1919  CRef<CTSE_Chunk_Info> chunk(new CTSE_Chunk_Info(seq_chunk_id));
1920  for ( auto& id : new_short_ids ) {
1921  chunk->x_AddBioseqId(id);
1922  }
1923  CStopWatch sw_attach;
1924  if ( GetDebugLevel() >= 4 ) {
1925  sw_attach.Start();
1926  }
// NOTE(review): the load "bytes" value is set from the alignment count of the
// range, not from a byte size - confirm this is intended.
1927  if ( m_File->GetBamDb().UsesRawIndex() ) {
1928  chunk->x_SetLoadBytes(Uint4(m_Chunks[range_id].GetAlignCount()));
1929  }
1930  chunk->x_AddBioseqPlace(kTSEId);
1931  CTSE_Split_Info& split_info =
1932  const_cast<CTSE_Split_Info&>(chunk_info.GetSplitInfo());
1933  split_info.AddChunk(*chunk);
1934  if ( GetDebugLevel() >= 4 ) {
1935  double time = sw_attach.Elapsed();
1936  s_AttachTime += time;
1937  LOG_POST_X(20, Info<<"CBAMDataLoader: "
1938  "Created short reads chunk "<<
1939  range_id<<"/"<<(seq_chunk_id-range_id*kChunkIdMul)<<" "<<
1940  GetRefSeqId()<<" @ "<<
1941  m_Chunks[range_id].GetRefSeqRange()<<" in "<<time);
1942  }
1943 }
1944 
1945 
1947 {
1948  CStopWatch sw;
1949  CStopWatch sw_create;
1950  if ( GetDebugLevel() >= 3 ) {
1951  sw.Start();
1952  }
1953  int range_id = chunk_info.GetChunkId() / kChunkIdMul;
1955  int min_quality = m_MinMapQuality;
1956  _TRACE("Loading aligns "<<GetRefSeqId()<<" @ "<<m_Chunks[range_id].GetRefSeqRange());
1957  size_t skipped = 0, count = 0, repl_count = 0, fail_count = 0;
1958  vector<CSeq_id_Handle> short_ids;
1959 
1960  CRef<CSeq_annot> annot;
1961  CSeq_annot::TData::TAlign* align_list = 0;
1962 
1963  CBamAlignIterator ait;
1964  TSeqPos max_end_pos;
1965  x_InitAlignIterator(ait, max_end_pos, chunk_info, eChunk_align);
1966  for( ; ait; ++ait ) {
1967  TSeqPos align_pos = ait.GetRefSeqPos();
1968  // should be filtered by CBamAlignIterator
1969  _ASSERT(align_pos >= m_Chunks[range_id].GetRefSeqRange().GetFrom());
1970  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
1971  ++skipped;
1972  continue;
1973  }
1974  if ( max_end_pos != kInvalidSeqPos ) {
1975  TSeqPos align_end = align_pos + ait.GetCIGARRefSize();
1976  if ( align_end > max_end_pos ) {
1977  ++skipped;
1978  continue;
1979  }
1980  }
1981  ++count;
1982 
1983  if ( !align_list ) {
1984  annot = ait.GetSeq_annot(m_File->GetAnnotName());
1985  align_list = &annot->SetData().SetAlign();
1986  }
1987  if ( GetDebugLevel() >= 4 ) {
1988  sw_create.Start();
1989  }
1990  align_list->push_back(ait.GetMatchAlign());
1991  if ( ait.GetShortSequenceLength() != 0 ) {
1992  short_ids.push_back(CSeq_id_Handle::GetHandle(*ait.GetShortSeq_id()));
1993  }
1994  if ( GetDebugLevel() >= 4 ) {
1995  sw_create.Stop();
1996  }
1997  }
1998  if ( GetDebugLevel() >= 4 ) {
1999  double time = sw_create.Elapsed();
2000  LOG_POST_X(19, Info<<"CBAMDataLoader: "
2001  "Created alignments "<<GetRefSeqId()<<
2002  " id="<<chunk_info.GetChunkId()<<
2003  " @ "<<m_Chunks[range_id].GetRefSeqRange()<<": "<<
2004  count<<" repl: "<<repl_count<<" fail: "<<fail_count<<
2005  " skipped: "<<skipped<<" in "<<time);
2006  s_CreateTime += time;
2007  }
2008  if ( annot ) {
2009  CStopWatch sw_attach(CStopWatch::eStart);
2010  chunk_info.x_LoadAnnot(place, *annot);
2011  chunk_info.x_AddUsedMemory(count*2500+10000);
2012  if ( GetDebugLevel() >= 4 ) {
2013  double time = sw_attach.Elapsed();
2014  LOG_POST_X(19, Info<<"CBAMDataLoader: "
2015  "Attached alignments "<<GetRefSeqId()<<
2016  " id="<<chunk_info.GetChunkId()<<
2017  " @ "<<m_Chunks[range_id].GetRefSeqRange()<<" in "<<time);
2018  s_AttachTime += time;
2019  }
2020  }
2021  x_AddSeqChunk(chunk_info, short_ids);
2022  if ( GetDebugLevel() >= 3 ) {
2023  LOG_POST_X(7, Info<<"CBAMDataLoader: "
2024  "Loaded "<<GetRefSeqId()<<
2025  " id="<<chunk_info.GetChunkId()<<
2026  " @ "<<m_Chunks[range_id].GetRefSeqRange()<<": "<<
2027  count<<" repl: "<<repl_count<<" fail: "<<fail_count<<
2028  " skipped: "<<skipped<<" in "<<sw.Elapsed());
2029  }
2030  chunk_info.SetLoaded();
2031 }
2032 
2033 
2035 {
2036  CStopWatch sw;
2037  if ( GetDebugLevel() >= 3 ) {
2038  sw.Start();
2039  }
2040  int chunk_id = chunk_info.GetChunkId();
2041  int range_id = chunk_id/kChunkIdMul;
2042  int sub_chunk = chunk_id%kChunkIdMul;
2043  const CBamRefSeqChunkInfo& chunk = m_Chunks[range_id];
2044  auto start_range = chunk.GetAlignStartRange();
2046  int min_quality = m_MinMapQuality;
2047  _TRACE("Loading seqs "<<GetRefSeqId()<<" @ "<<chunk.GetRefSeqRange());
2048  size_t count = 0, skipped = 0, dups = 0, far_refs = 0;
2049  set<CSeq_id_Handle> loaded;
2050 
2051  list< CRef<CBioseq> > bioseqs;
2052 
2053  if ( sub_chunk == eChunk_short_seq_pileup ) {
2054 #ifdef SEPARATE_PILEUP_READS
2055  int sub_chunk_count = chunk.m_PileupChunkCount;
2056  for ( int i = 1; i < sub_chunk_count; ++i ) {
2057  TSeqPos split_pos = m_Chunks[range_id+i].GetRefSeqRange().GetFrom();
2059  start_range.GetFrom(), split_pos-start_range.GetFrom(),
2062  &m_Chunks[range_id].m_FilePosFirstCrossing);
2063  if ( m_SpotIdDetector ) {
2065  }
2067  for ( ; ait; ++ait ) {
2068  TSeqPos align_pos = ait.GetRefSeqPos();
2069  _ASSERT(align_pos<split_pos);
2070  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
2071  ++skipped;
2072  continue;
2073  }
2074  TSeqPos align_end = align_pos + ait.GetCIGARRefSize();
2075  if ( align_end <= split_pos ) {
2076  ++skipped;
2077  continue;
2078  }
2079 
2080  if ( ait.GetShortSequenceLength() == 0 ) {
2081  // far reference
2082  ++far_refs;
2083  continue;
2084  }
2085 
2086  CSeq_id_Handle seq_id =
2088  auto iter = m_Seq2Chunk.find(seq_id);
2089  if ( iter == m_Seq2Chunk.end() || iter->second != chunk_id ) {
2090  ++skipped;
2091  continue;
2092  }
2093 
2094  if ( !loaded.insert(seq_id).second ) {
2095  ++dups;
2096  continue;
2097  }
2098  bioseqs.push_back(ait.GetShortBioseq());
2099  ++count;
2100  }
2101  }
2102 #endif
2103  }
2104  else {
2105  TSeqPos max_end_pos = kInvalidSeqPos;
2106  CBamAlignIterator ait;
2107  x_InitAlignIterator(ait, max_end_pos, chunk_info, eChunk_short_seq);
2109  for( ; ait; ++ait ){
2110  TSeqPos align_pos = ait.GetRefSeqPos();
2111  if ( align_pos < start_range.GetFrom() ) {
2112  // the alignments starts before current chunk range
2113  ++skipped;
2114  continue;
2115  }
2116  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
2117  ++skipped;
2118  continue;
2119  }
2120  if ( max_end_pos != kInvalidSeqPos ) {
2121  TSeqPos align_end = align_pos + ait.GetCIGARRefSize();
2122  if ( align_end > max_end_pos ) {
2123  ++skipped;
2124  continue;
2125  }
2126  }
2127 
2128  if ( ait.GetShortSequenceLength() == 0 ) {
2129  // far reference
2130  ++far_refs;
2131  continue;
2132  }
2133 
2134  CSeq_id_Handle seq_id =
2136  auto iter = m_Seq2Chunk.find(seq_id);
2137  if ( iter == m_Seq2Chunk.end() || iter->second != chunk_id ) {
2138  ++skipped;
2139  continue;
2140  }
2141 
2142  if ( !loaded.insert(seq_id).second ) {
2143  ++dups;
2144  continue;
2145  }
2146  bioseqs.push_back(ait.GetShortBioseq());
2147  ++count;
2148  }
2149  }
2150  chunk_info.x_LoadBioseqs(place, bioseqs);
2151 
2152  if ( GetDebugLevel() >= 3 ) {
2153  LOG_POST_X(10, Info<<"CBAMDataLoader: "
2154  "Loaded seqs "<<GetRefSeqId()<<
2155  " id="<<chunk_info.GetChunkId()<<
2156  " @ "<<chunk.GetRefSeqRange()<<": "<<
2157  count<<" skipped: "<<skipped<<" dups: "<<dups<<" far: "<<far_refs<<" in "<<sw.Elapsed());
2158  }
2159 
2160  chunk_info.SetLoaded();
2161 }
2162 
2163 #define USE_NEW_PILEUP_COLLECTOR
2164 
2165 #if defined USE_NEW_PILEUP_COLLECTOR && !defined HAVE_NEW_PILEUP_COLLECTOR
2166 # undef USE_NEW_PILEUP_COLLECTOR
2167 #endif
2168 
2169 #ifdef USE_NEW_PILEUP_COLLECTOR
2170 
// File-scope pileup totals. They are zero-initialised as statics and are
// logged by ~STimePrinter when total_pileup_range is non-zero.
// NOTE(review): the code that updates them is outside this section.
2171 static Uint8 total_pileup_range;
2172 static Uint8 total_pileup_aligns;
2173 static double total_pileup_time_collect;
2174 static double total_pileup_time_max;
2175 static double total_pileup_time_make;
2176 
2177 static struct STimePrinter {
2178  ~STimePrinter() {
2179  if ( total_pileup_range ) {
2180  LOG_POST_X(18, Info<<"CBAMDataLoader: "
2181  "Total pileup bases: "<<total_pileup_range<<
2182  " aligns: "<<total_pileup_aligns<<
2183  " collect time: "<<total_pileup_time_collect<<
2184  " max: "<<total_pileup_time_max<<
2185  " make: "<<total_pileup_time_make);
2186  }
2187  }
2188 } s_TimePrinter;
2189 
2190 struct SPileupGraphCreator : public CBamDb::ICollectPileupCallback
2191 {
2193  static const int kStat_Gap = CBamDb::SPileupValues::kStat_Gap;
2194  static const int kStat_Intron = CBamDb::SPileupValues::kStat_Intron;
2195  static const int kNumStat_ACGT = CBamDb::SPileupValues::kNumStat_ACGT;
2196  static const int kNumStat = CBamDb::SPileupValues::kNumStat;
2197  typedef CBamDb::SPileupValues::TCount TCount;
2198 
2199  string annot_name;
2200  CRef<CSeq_id> ref_id;
2201  CRange<TSeqPos> ref_range;
2202  int min_quality;
2203  bool make_intron;
2204 
2205  struct SGraph {
2206  SGraph()
2207  : bytes(0),
2208  ints(0),
2209  max_value(0)
2210  {
2211  }
2212 
2213  CRef<CSeq_graph> graph;
2214  CByte_graph::TValues* bytes;
2215  CInt_graph::TValues* ints;
2216  TCount max_value;
2217  };
2218  SGraph graphs[kNumStat];
2219  struct SSplit {
2220  SSplit(TSeqPos seq_pos)
2221  : seq_pos(seq_pos),
2222  file_pos_first_crossing(),
2223  file_pos_first_starting()
2224  {
2225  }
2226  TSeqPos seq_pos;
2227  CBGZFPos file_pos_first_crossing;
2228  CBGZFPos file_pos_first_starting;
2229  };
2230 
2231  typedef vector<SSplit> TSplits;
2232  TSplits splits;
2233  TSplits::iterator cur_split;
2234  CRef<CSeq_annot> annot;
2235  CSeq_annot::TData::TAlign* align_list = 0;
2236  typedef map<CSeq_id_Handle, int> TSeq2Chunk;
2237 #ifdef SEPARATE_PILEUP_READS
2238  vector<CSeq_id_Handle> short_ids;
2239 #else
2240  TSeq2Chunk* seq2chunk = 0;
2241  int seq_chunk_id = 0;
2242  size_t seq_skipped = 0;
2243  size_t seq_dups = 0;
2244  size_t seq_count = 0;
2245  set<CSeq_id_Handle> loaded;
2246  list<CRef<CBioseq>> bioseqs;
2247 #endif
2248 
2249  SPileupGraphCreator(const string& annot_name,
2250  const CSeq_id_Handle& ref_id,
2251  CRange<TSeqPos> ref_range,
2252  int min_quality)
2253  : annot_name(annot_name),
2254  ref_id(SerialClone(*ref_id.GetSeqId())),
2255  ref_range(ref_range),
2256  min_quality(min_quality),
2257  make_intron(s_GetMakeIntronGraph()),
2258  cur_split(splits.begin())
2259  {
2260  }
2261 
// Callback that decides whether an alignment takes part in the pileup and,
// as a side effect, collects the alignments that cross a split point.
// Returns false only when min_quality is positive and the alignment's mapping
// quality is below it; every other alignment is accepted (returns true).
// Side effects: advances cur_split, records file positions in the current
// split, and may append to annot/align_list and the short-read bookkeeping.
2262  bool AcceptAlign(const CBamAlignIterator& ait) override
2263  {
2264  if ( min_quality > 0 && ait.GetMapQuality() < min_quality ) {
2265  return false;
2266  }
2267  while ( cur_split != splits.end() ) {
2268  // skip split points that are before this alignment
      // The first alignment seen while a split is current supplies its
      // 'first starting' file position.
2269  if ( !cur_split->file_pos_first_starting ) {
2270  cur_split->file_pos_first_starting = ait.GetRawIndexIteratorPtr()->GetFilePos();
2271  }
2272  TSeqPos pos = ait.GetRefSeqPos();
2273  if ( pos >= cur_split->seq_pos ) {
      // alignment starts at or after this split point: try the next one
2274  ++cur_split;
2275  continue;
2276  }
2277  if ( !ait.GetRawIndexIteratorPtr()->IsOnMinBinIndexLevel() ) {
2278  // ignore non level 0 aligns - they will be collected by eChunk_align2
2279  break;
2280  }
2281  TSeqPos len = ait.GetCIGARRefSize();
2282  TSeqPos end = pos + len;
2283  if ( end > cur_split->seq_pos ) {
2284  // alignment crosses the split point
2285  if ( !cur_split->file_pos_first_crossing ) {
2286  cur_split->file_pos_first_crossing = ait.GetRawIndexIteratorPtr()->GetFilePos();
2287  }
      // the alignment annot is created lazily on the first crossing alignment
2288  if ( !align_list ) {
2289  annot = ait.GetSeq_annot(annot_name);
2290  align_list = &annot->SetData().SetAlign();
2291  }
2292  align_list->push_back(ait.GetMatchAlign());
2293  if ( ait.GetShortSequenceLength() != 0 ) {
      // NOTE(review): listing line 2294 is absent from this extract; `seq_id`
      // used below is presumably declared there - confirm against the repository.
2295 #ifdef SEPARATE_PILEUP_READS
2296  short_ids.push_back(seq_id);
2297 #else
      // Register the read for this chunk unless the shared map already
      // assigns it to a different chunk (insert() keeps an existing entry).
2298  auto iter = seq2chunk->insert(make_pair(seq_id, seq_chunk_id)).first;
2299  if ( iter->second != seq_chunk_id ) {
2300  ++seq_skipped;
2301  }
2302  else if ( !loaded.insert(seq_id).second ) {
      // same read already loaded by this collector
2303  ++seq_dups;
2304  }
2305  else {
2306  bioseqs.push_back(ait.GetShortBioseq());
2307  ++seq_count;
2308  }
2309 #endif
2310  }
2311  }
2312  break;
2313  }
2314  return true;
2315  }
2316 
2317  void x_CreateGraph(SGraph& g)
2318  {
2319  _ASSERT(!g.graph);
2320  CRef<CSeq_graph> graph(new CSeq_graph);
2321  static const char* const titles[kNumStat] = {
2322  "Number of A bases",
2323  "Number of C bases",
2324  "Number of G bases",
2325  "Number of T bases",
2326  "Number of inserts",
2327  "Number of matches",
2328  "Number of introns"
2329  };
2330  graph->SetTitle(titles[&g-graphs]);
2331  CSeq_interval& loc = graph->SetLoc().SetInt();
2332  loc.SetId(*ref_id);
2333  loc.SetFrom(ref_range.GetFrom());
2334  loc.SetTo(ref_range.GetTo());
2335  TSeqPos length = ref_range.GetLength();
2336  graph->SetNumval(length);
2337  g.graph = graph;
2338  }
2339  void x_FinalizeGraph(SGraph& g)
2340  {
2341  if ( !g.graph ) {
2342  return;
2343  }
2344  if ( g.max_value < 256 ) {
2345  _ASSERT(g.bytes);
2346  _ASSERT(g.graph->GetGraph().IsByte());
2347  _ASSERT(g.graph->GetGraph().GetByte().GetValues().size() == ref_range.GetLength());
2348  CByte_graph& data = g.graph->SetGraph().SetByte();
2349  data.SetMin(0);
2350  data.SetMax(g.max_value);
2351  data.SetAxis(0);
2352  }
2353  else {
2354  _ASSERT(g.ints);
2355  _ASSERT(g.graph->GetGraph().IsInt());
2356  _ASSERT(g.graph->GetGraph().GetInt().GetValues().size() == ref_range.GetLength());
2357  CInt_graph& data = g.graph->SetGraph().SetInt();
2358  data.SetMin(0);
2359  data.SetMax(g.max_value);
2360  data.SetAxis(0);
2361  }
2362  }
2363 
// Make the A/C/G/T graphs consistent: as soon as one of them exists, create
// the remaining ones as byte graphs and zero-fill them via
// NFast::AppendZerosAligned16(..., ref_offset).
// ref_offset is the offset of the incoming block from the start of ref_range
// (see the callers AddValuesBy16()/AddValuesTail()).
2364  void x_AdjustACGT(TSeqPos ref_offset)
2365  {
      // NOTE(review): listing line 2366 is absent from this extract; it
      // presumably opens the condition that guards the early return below
      // (the comment suggests the "skip empty graphs" setting) - confirm.
2367  // empty graphs can be skipped -> no adjustment
2368  return;
2369  }
      // does any of the four base graphs exist already?
2370  bool have_acgt = false;
2371  for ( int k = 0; k < kNumStat_ACGT; ++k ) {
2372  if ( graphs[k].graph ) {
2373  have_acgt = true;
2374  break;
2375  }
2376  }
2377  if ( have_acgt ) {
2378  for ( int k = 0; k < kNumStat_ACGT; ++k ) {
2379  SGraph& g = graphs[k];
2380  if ( g.graph ) {
2381  // graph already created
2382  continue;
2383  }
      // a graph created here has seen no counts so far, so byte storage is used
2384  x_CreateGraph(g);
2385  g.bytes = &g.graph->SetGraph().SetByte().SetValues();
2386  g.bytes->reserve(ref_range.GetLength());
2387  NFast::AppendZerosAligned16(*g.bytes, ref_offset);
2388  }
2389  }
2390  }
// Finish all graphs after collection: unless empty graphs are to be skipped
// (GetSkipEmptyPileupGraphsParam()), create the still-missing ones as
// all-zero byte graphs covering ref_range, then set min/max/axis on every
// existing graph via x_FinalizeGraph().
2391  void Finalize()
2392  {
2393  if ( !GetSkipEmptyPileupGraphsParam() ) {
2394  // make missing empty graphs
2395  TSeqPos len = ref_range.GetLength();
2396  for ( int k = 0; k < kNumStat; ++k ) {
2397  SGraph& g = graphs[k];
2398  if ( g.graph ) {
2399  // graph already created
2400  continue;
2401  }
      // NOTE(review): listing lines 2402-2403 are absent from this extract;
      // they presumably hold the condition selecting which statistics never
      // get an empty graph (see the comment below) - confirm.
2404  // do not generate empty 'matches' graph
2405  continue;
2406  }
2407  x_CreateGraph(g);
2408  g.bytes = &g.graph->SetGraph().SetByte().SetValues();
2409  g.bytes->reserve(len);
2410  NFast::AppendZeros(*g.bytes, len);
2411  }
2412  }
2413  for ( int k = 0; k < kNumStat; ++k ) {
2414  x_FinalizeGraph(graphs[k]);
2415  }
2416  }
2417 
// Callback override: visits every graph that already exists and branches on
// its storage kind (int or byte).
// NOTE(review): the statements inside both branches (listing lines 2424 and
// 2427) are absent from this extract; judging by the method name they
// presumably append 'len' zero values to g.ints / g.bytes - confirm.
2418  virtual void AddZerosBy16(TSeqPos len) override
2419  {
2420  for ( int k = 0; k < kNumStat; ++k ) {
2421  SGraph& g = graphs[k];
2422  if ( g.graph ) {
2423  if ( g.ints ) {
2425  }
2426  else {
2428  }
2429  }
2430  }
2431  }
2432 
// Account for a new block whose largest count is 'max_added' and make sure
// the graph's storage can hold it.
//  - No graph yet: create it, choosing int storage when max_added >= 256 and
//    byte storage otherwise, and zero-fill via AppendZerosAligned16(...,
//    ref_offset).
//  - Byte graph receiving a value >= 256: switch it to int storage, carrying
//    the existing values over through NFast::ConvertBuffer().
// Returns true when the graph uses int storage after the call, false when it
// uses byte storage.
2433  bool x_UpdateMaxIsInt(SGraph& g, TCount max_added, TSeqPos ref_offset)
2434  {
2435  if ( !g.graph ) {
2436  _ASSERT(!g.max_value);
2437  g.max_value = max_added;
2438  x_CreateGraph(g);
2439  if ( max_added >= 256 ) {
2440  g.ints = &g.graph->SetGraph().SetInt().SetValues();
2441  g.ints->reserve(ref_range.GetLength());
2442  NFast::AppendZerosAligned16(*g.ints, ref_offset);
2443  return true;
2444  }
2445  else {
2446  g.bytes = &g.graph->SetGraph().SetByte().SetValues();
2447  g.bytes->reserve(ref_range.GetLength());
2448  NFast::AppendZerosAligned16(*g.bytes, ref_offset);
2449  return false;
2450  }
2451  }
2452  else if ( max_added >= 256 ) {
2453  g.max_value = max(g.max_value, max_added);
2454  if ( g.bytes ) {
      // byte storage is too narrow now: build an int graph with the same values
2455  CRef<CInt_graph> int_graph(new CInt_graph);
2456  g.ints = &int_graph->SetValues();
2457  g.ints->reserve(ref_range.GetLength());
2458  size_t size = g.bytes->size();
2459  NFast::ConvertBuffer(g.bytes->data(), size,
      // NOTE(review): listing line 2460 (the remaining argument(s) of the
      // ConvertBuffer call, presumably the destination in *g.ints) is absent
      // from this extract - confirm.
      // g.bytes is cleared before SetInt() replaces the byte graph it
      // pointed into.
2461  g.bytes = 0;
2462  g.graph->SetGraph().SetInt(*int_graph);
2463  }
2464  return true;
2465  }
2466  else if ( g.ints ) {
      // already int storage and the new maximum is below 256: nothing to update
2467  return true;
2468  }
2469  else {
2470  g.max_value = max(g.max_value, max_added);
2471  return false;
2472  }
2473  }
2474 
// Append 'len' counts from 'src' to graph 'g', branching on its storage kind;
// a slot with neither byte nor int storage is left untouched.
// NOTE(review): the statements inside both branches (listing lines 2478 and
// 2481) are absent from this extract; they presumably copy/convert 'src'
// into g.bytes / g.ints - confirm.
2475  void x_AddValuesBy16(SGraph& g, TSeqPos len, const TCount* src)
2476  {
2477  if ( g.bytes ) {
2479  }
2480  else if ( g.ints ) {
2482  }
2483  }
2484  void x_AddValues(SGraph& g, TSeqPos len, const TCount* src)
2485  {
2486  if ( g.bytes ) {
2487  copy_n(src, len, NFast::AppendUninitialized(*g.bytes, len));
2488  }
2489  else if ( g.ints ) {
2490  copy_n(src, len, NFast::AppendUninitialized(*g.ints, len));
2491  }
2492  }
// Append 'len' positions from 'values' to all graphs.
// Match, gap and (when make_intron is set) intron counts go through the
// per-graph x_AddValuesBy16(). The four base graphs are filled with one
// NFast::SplitBufferInto4() call when all of them share the same storage
// kind; otherwise each is filled from its own split array.
2493  void x_AddValuesBy16(TSeqPos len, const CBamDb::SPileupValues& values)
2494  {
2495  x_AddValuesBy16(graphs[kStat_Match], len, values.cc_match.data());
2496  x_AddValuesBy16(graphs[kStat_Gap], len, values.get_gap_counts());
2497  if ( make_intron ) {
2498  x_AddValuesBy16(graphs[kStat_Intron], len, values.get_intron_counts());
2499  }
      // count how many of the A/C/G/T graphs use byte and how many int storage
2500  int dst_byte = 0, dst_int = 0;
2501  for ( int k = 0; k < kNumStat_ACGT; ++k ) {
2502  SGraph& g = graphs[k];
2503  if ( g.bytes ) {
2504  ++dst_byte;
2505  }
2506  else if ( g.ints ) {
2507  ++dst_int;
2508  }
2509  }
2510  if ( dst_byte == kNumStat_ACGT ) {
      // NOTE(review): listing line 2511 is absent from this extract; it
      // presumably starts the same NFast::SplitBufferInto4(...) call as the
      // int branch below - confirm.
2512  NFast::AppendUninitialized(*graphs[0].bytes, len),
2513  NFast::AppendUninitialized(*graphs[1].bytes, len),
2514  NFast::AppendUninitialized(*graphs[2].bytes, len),
2515  NFast::AppendUninitialized(*graphs[3].bytes, len));
2516  }
2517  else if ( dst_int == kNumStat_ACGT ) {
2518  NFast::SplitBufferInto4(values.cc_acgt[0].cc, len,
2519  NFast::AppendUninitialized(*graphs[0].ints, len),
2520  NFast::AppendUninitialized(*graphs[1].ints, len),
2521  NFast::AppendUninitialized(*graphs[2].ints, len),
2522  NFast::AppendUninitialized(*graphs[3].ints, len));
2523  }
2524  else {
2525  // use split ACGT arrays
2526  for ( int k = 0; k < kNumStat_ACGT; ++k ) {
2527  SGraph& g = graphs[k];
2528  x_AddValuesBy16(g, len, values.get_split_acgt_counts(k, len));
2529  }
2530  }
2531  }
2532  void x_AddValues(TSeqPos len, const CBamDb::SPileupValues& values)
2533  {
2534  x_AddValues(graphs[kStat_Match], len, values.cc_match.data());
2535  x_AddValues(graphs[kStat_Gap], len, values.cc_gap.data());
2536  if ( make_intron ) {
2537  x_AddValues(graphs[kStat_Intron], len, values.cc_intron.data());
2538  }
2539  // use split ACGT into separate arrays
2540  for ( int k = 0; k < kNumStat_ACGT; ++k ) {
2541  SGraph& g = graphs[k];
2542  x_AddValues(g, len, values.get_split_acgt_counts(k, len));
2543  }
2544  }
2545  virtual void AddValuesBy16(TSeqPos len, const CBamDb::SPileupValues& values) override
2546  {
2547  _ASSERT(len > 0);
2548  _ASSERT(values.m_RefFrom >= ref_range.GetFrom());
2549  _ASSERT(values.m_RefFrom+len <= ref_range.GetToOpen());
2550  _ASSERT((values.m_RefFrom - ref_range.GetFrom())%16 == 0);
2551  _ASSERT(len%16 == 0);
2552  TSeqPos ref_offset = values.m_RefFrom-ref_range.GetFrom();
2553  for ( int k = 0; k < kNumStat; ++k ) {
2554  SGraph& g = graphs[k];
2555  TCount max_added = values.get_max_count(k);
2556  if ( max_added != 0 || g.graph ) {
2557  x_UpdateMaxIsInt(g, max_added, ref_offset);
2558  }
2559  }
2560  x_AdjustACGT(ref_offset);
2561  x_AddValuesBy16(len, values);
2562  }
2563  virtual void AddValuesTail(TSeqPos len, const CBamDb::SPileupValues& values) override
2564  {
2565  _ASSERT(len > 0);
2566  _ASSERT(values.m_RefFrom >= ref_range.GetFrom());
2567  _ASSERT(values.m_RefFrom+len <= ref_range.GetToOpen());
2568  _ASSERT((values.m_RefFrom - ref_range.GetFrom())%16 == 0);
2569  _ASSERT(len%16 == 0 || values.m_RefFrom+len == ref_range.GetToOpen());
2570  TSeqPos ref_offset = values.m_RefFrom-ref_range.GetFrom();
2571  for ( int k = 0; k < kNumStat; ++k ) {
2572  SGraph& g = graphs[k];
2573  TCount max_added = values.get_max_count(k);
2574  if ( max_added != 0 || g.graph ) {
2575  x_UpdateMaxIsInt(g, max_added, ref_offset);
2576  }
2577  }
2578  x_AdjustACGT(ref_offset);
2579  x_AddValues(len, values);
2580  }
2581 };
2582 
// Load one pileup chunk (new pileup implementation): collect pileup counts
// for the chunk's graph range, turn them into Seq-graphs, and attach them -
// together with the alignments that cross sub-range borders and their reads -
// to chunk_info.
// NOTE(review): the signature (listing line 2583) is absent from this
// extract; the symbol index at the end of the listing suggests
// CBamRefSeqInfo::LoadPileupChunk(CTSE_Chunk_Info& chunk_info) - confirm.
2584 {
2585  CStopWatch sw;
2586  if ( GetDebugLevel() >= 2 ) {
2587  sw.Start();
2588  }
2589  size_t range_id = chunk_info.GetChunkId()/kChunkIdMul;
2590  const CBamRefSeqChunkInfo& chunk = m_Chunks[range_id];
2591  _ASSERT(chunk.m_PileupChunkCount);
2592  auto graph_range = GetChunkGraphRange(range_id);
      // NOTE(review): listing line 2593 is absent from this extract; `place`
      // used further down is presumably declared there - confirm.
2594  int min_quality = m_MinMapQuality;
2595  _TRACE("Loading pileup "<<GetRefSeqId()<<" @ "<<chunk.GetRefSeqRange());
2596 
2597  SPileupGraphCreator gg(m_File->GetAnnotName(), GetRefSeq_id(), graph_range, min_quality);
2598 
      // The guard starts empty; it locks m_Seq2ChunkMutex only while the
      // shared read->chunk map is handed to the collector.
2599  CMutexGuard seq2chunk_guard(eEmptyGuard);
2600  if ( chunk.m_PileupChunkCount ) {
      // one split point at the start of every following sub-range chunk
2601  for ( int i = 1; i < chunk.m_PileupChunkCount; ++i ) {
2602  gg.splits.push_back(m_Chunks[range_id+i].GetRefSeqRange().GetFrom());
2603  }
      // push_back may have reallocated the vector: re-seat the cursor
2604  gg.cur_split = gg.splits.begin();
2605 #ifndef SEPARATE_PILEUP_READS
2606  seq2chunk_guard.Guard(m_Seq2ChunkMutex);
2607  gg.seq_chunk_id = chunk_info.GetChunkId();
2608  gg.seq2chunk = &m_Seq2Chunk;
2609 #endif
2610  }
2611 
      // NOTE(review): listing lines 2612-2616 are absent from this extract;
      // `ss` and `intron_mode` used below are presumably set up there - confirm.
2617  TSeqPos gap_to_intron_threshold = s_GetGapToIntronThreshold();
2618  size_t count = m_File->GetBamDb().CollectPileup(ss, GetRefSeqId(), graph_range, &gg,
2619  intron_mode, gap_to_intron_threshold);
2620 
2621 #ifndef SEPARATE_PILEUP_READS
      // collection is done: detach the shared map and release the mutex
2622  gg.seq2chunk = 0;
2623  seq2chunk_guard.Release();
2624 #endif
2625 
      // hand the file positions recorded for each split to the chunk that
      // starts at that split (splits[i-1] belongs to chunk range_id+i)
2626  for ( int i = 1; i < chunk.m_PileupChunkCount; ++i ) {
2627  m_Chunks[range_id+i].m_FilePosFirstCrossing = gg.splits[i-1].file_pos_first_crossing;
2628  m_Chunks[range_id+i].m_FilePosFirstStarting = gg.splits[i-1].file_pos_first_starting;
2629  }
2630 
      // NOTE(review): the stopwatch is started at debug level >= 2 but its
      // value is only reported at level >= 3.
2631  if ( GetDebugLevel() >= 3 ) {
2632  LOG_POST_X(11, Info<<"CBAMDataLoader: "
2633  "Collected pileup counts "<<GetRefSeqId()<<
2634  " id="<<chunk_info.GetChunkId()<<
2635  " @ "<<chunk.GetRefSeqRange()<<": "<<
2636  count<<" in "<<sw.Elapsed());
2637  }
2638  if ( count == 0 ) {
2639  // zero pileup graphs
2640  chunk_info.SetLoaded();
2641  return;
2642  }
2643 
2644  gg.Finalize();
      // graph annot named "<annot name> <PILEUP_NAME_SUFFIX>"
2645  CRef<CSeq_annot> annot(new CSeq_annot);
2646  {
2647  string name = m_File->GetAnnotName();
2648  name += ' ';
2649  name += PILEUP_NAME_SUFFIX;
2650  CRef<CAnnotdesc> desc(new CAnnotdesc);
2651  desc->SetName(name);
2652  annot->SetDesc().Set().push_back(desc);
2653  }
      // memory estimate: size of the value arrays plus a flat 10000 per graph
2654  size_t total_bytes = 0;
2655  for ( int k = 0; k < ss.kNumStat; ++k ) {
2656  SPileupGraphCreator::SGraph& g = gg.graphs[k];
2657  if ( g.graph ) {
2658  annot->SetData().SetGraph().push_back(g.graph);
2659  if ( g.bytes ) {
2660  total_bytes += g.bytes->size()*sizeof(g.bytes->front())+10000;
2661  }
2662  else {
2663  total_bytes += g.ints->size()*sizeof(g.ints->front())+10000;
2664  }
2665  }
2666  }
2667  chunk_info.x_LoadAnnot(place, *annot);
2668  size_t align_count = 0;
2669  if ( gg.annot ) {
2670  // add alignments that cross sub-ranges borders
2671  align_count = gg.annot->GetData().GetAlign().size();
2672  //LOG_POST("Pileup align annot: "<<MSerial_AsnText<<*gg.annot);
2673  chunk_info.x_LoadAnnot(place, *gg.annot);
2674 #ifdef SEPARATE_PILEUP_READS
2675  x_AddSeqChunk(chunk_info, gg.short_ids);
2676 #else
2677  chunk_info.x_LoadBioseqs(place, gg.bioseqs);
2678 #endif
2679  }
2680  chunk_info.x_AddUsedMemory(total_bytes);
2681 
2682  if ( GetDebugLevel() >= 3 ) {
2683  LOG_POST_X(11, Info<<"CBAMDataLoader: "
2684  "Loaded pileup "<<GetRefSeqId()<<
2685  " id="<<chunk_info.GetChunkId()<<
2686  " @ "<<chunk.GetRefSeqRange()<<": "<<
2687  count<<" ("<<align_count<<" aligns) in "<<sw.Elapsed());
2688  }
2689 
2690  chunk_info.SetLoaded();
2691 }
2692 
2693 #else // !USE_NEW_PILEUP
2694 
2696 
2698 {
// Counter type, statistic indices and per-statistic count arrays of the
// old-style pileup accumulator.
2699  typedef unsigned TCount;
2700 
      // Index of each statistic inside cc[].
2701  enum EStat {
2702  kStat_A = 0,
2703  kStat_C = 1,
2704  kStat_G = 2,
2705  kStat_T = 3,
      // NOTE(review): listing lines 2706-2708 are absent from this extract;
      // they presumably define kStat_Gap, kStat_Match and kStat_Intron (all
      // used by the methods of this struct) with values 4..6 - confirm.
2709  kNumStat = 7
2710  };
2711 
      // NOTE(review): listing lines 2712-2714 are absent from this extract;
      // the members `len`, `make_intron_graph` and `gap_to_intron_threshold`
      // used by the methods are presumably declared there - confirm.
      // One array of counts per statistic (sized in the constructor).
2715  vector<TCount> cc[kNumStat];
2716 
// Constructor: stores the graph length and sizes every count array to 'len'
// zero-initialised entries; the intron array stays empty when intron graphs
// are disabled.
// NOTE(review): listing lines 2718 (signature) and 2720-2721 (remaining
// member initialisers) are absent from this extract - confirm against the
// repository.
2717  explicit
2719  : len(len),
2722  {
2723  for ( int k = 0; k < kNumStat; ++k ) {
2724  if ( k == kStat_Intron && !make_intron_graph ) {
2725  continue;
2726  }
2727  cc[k].resize(len);
2728  }
2729  }
2731  void add_match(TSeqPos pos)
2732  {
2733  if ( pos < len ) {
2734  cc[kStat_Match][pos] += 1;
2735  }
2736  }
2737  void add_base(TSeqPos pos, char b)
2738  {
2739  if ( pos < len ) {
2740  switch ( b ) {
2741  case 'A': cc[kStat_A][pos] += 1; break;
2742  case 'C': cc[kStat_C][pos] += 1; break;
2743  case 'G': cc[kStat_G][pos] += 1; break;
2744  case 'T': cc[kStat_T][pos] += 1; break;
2745  case '=': cc[kStat_Match][pos] += 1; break;
2746  // others including N are unknown mismatch, no pileup information
2747  }
2748  }
2749  }
// Count one read base, given as a 4-bit raw base code 'b', at graph position
// 'pos' (ignored when outside the graph segment). Codes 1/2/4/8 count as
// A/C/G/T, code 0 counts as a match; every other code is not counted.
// NOTE(review): the signature (listing line 2750) is absent from this
// extract; the call site passes (position, sx_GetBaseRaw(...)) - confirm.
2751  {
2752  if ( pos < len ) {
2753  switch ( b ) {
2754  case 1: /* A */ cc[kStat_A][pos] += 1; break;
2755  case 2: /* C */ cc[kStat_C][pos] += 1; break;
2756  case 4: /* G */ cc[kStat_G][pos] += 1; break;
2757  case 8: /* T */ cc[kStat_T][pos] += 1; break;
2758  case 0: /* = */ cc[kStat_Match][pos] += 1; break;
2759  // others including N (=15) are unknown mismatch, no pileup information
2760  }
2761  }
2762  }
2763  void x_add_gap_or_intron(TSignedSeqPos gap_pos, TSeqPos gap_len, EStat stat)
2764  {
2765  _ASSERT(stat == kStat_Gap || stat == kStat_Intron);
2766  if ( gap_pos < 0 ) {
2767  if ( TSignedSeqPos(gap_len + gap_pos) <= 0 ) {
2768  // gap is fully before graph segment
2769  return;
2770  }
2771  gap_len += gap_pos;
2772  gap_pos = 0;
2773  }
2774  else if ( TSeqPos(gap_pos) >= len ) {
2775  // gap is fully after graph segment
2776  return;
2777  }
2778  TSeqPos gap_end = gap_pos + gap_len;
2779  if ( gap_end > len ) {
2780  // gap goes beyond end of graph segment
2781  gap_end = len;
2782  }
2783  cc[stat][gap_pos] += 1;
2784  if ( gap_end < len ) {
2785  cc[stat][gap_end] -= 1;
2786  }
2787  }
2788  void add_intron(TSignedSeqPos gap_pos, TSeqPos gap_len)
2789  {
2790  if ( make_intron_graph ) {
2791  x_add_gap_or_intron(gap_pos, gap_len, kStat_Intron);
2792  }
2793  }
2794  void add_gap(TSignedSeqPos gap_pos, TSeqPos gap_len)
2795  {
2796  if ( gap_len > gap_to_intron_threshold ) {
2797  add_intron(gap_pos, gap_len);
2798  }
2799  else {
2800  x_add_gap_or_intron(gap_pos, gap_len, kStat_Gap);
2801  }
2802  }
2803  void x_finish_add(EStat stat)
2804  {
2805  _ASSERT(stat == kStat_Gap || stat == kStat_Intron);
2806  TCount g = 0;
2807  for ( TSeqPos i = 0; i < len; ++i ) {
2808  g += cc[stat][i];
2809  cc[stat][i] = g;
2810  }
2811  }
// Final step after all alignments were added.
// NOTE(review): listing lines 2814 and 2816 are absent from this extract;
// they presumably call x_finish_add() for the gap statistic and, inside the
// condition below, for the intron statistic - confirm.
2812  void finish_add()
2813  {
2815  if ( make_intron_graph ) {
2817  }
2818  }
2819 
// Largest value in the count array selected by 'type', or 0 when that array
// is empty.
// NOTE(review): the signature (listing line 2820) is absent from this
// extract; get_maxs() calls it with an int statistic index - confirm.
2821  {
2822  return cc[type].empty()? 0: *max_element(cc[type].begin(), cc[type].end());
2823  }
2824  void get_maxs(TCount (&c_max)[kNumStat]) const
2825  {
2826  for ( int k = 0; k < kNumStat; ++k ) {
2827  c_max[k] = get_max_count(k);
2828  }
2829  }
2830 };
2831 
2832 
// Extract the 4-bit base code at read position 'pos' from a packed read in
// which every byte holds two codes: even positions are taken from the high
// nibble, odd positions from the low nibble.
// NOTE(review): the signature (listing line 2834) is absent from this
// extract; the symbol index at the end of the listing gives
// `Uint1 sx_GetBaseRaw(CTempString read_raw, TSeqPos pos)` - confirm.
2833 static inline
2835 {
2836  Uint1 b2 = read_raw[pos/2];
2837  return pos%2? b2&0xf: b2>>4;
2838 }
2839 
2840 
2842 
2843 
// Load one pileup chunk (old pileup implementation, compiled when
// USE_NEW_PILEUP is not defined): walk all alignments overlapping the chunk's
// graph range, accumulate per-position counts in an SBaseStats object by
// interpreting each alignment's CIGAR, then publish the counts as byte or int
// Seq-graphs attached to chunk_info.
// NOTE(review): the signature (listing line 2844) is absent from this
// extract; the symbol index at the end of the listing suggests
// CBamRefSeqInfo::LoadPileupChunk(CTSE_Chunk_Info& chunk_info) - confirm.
2845 {
2846  CStopWatch sw;
2847  if ( GetDebugLevel() >= 3 ) {
2848  sw.Start();
2849  }
2850  size_t range_id = chunk_info.GetChunkId()/kChunkIdMul;
2851  const CBamRefSeqChunkInfo& chunk = m_Chunks[range_id];
2852  auto graph_range = GetChunkGraphRange(range_id);
      // NOTE(review): listing line 2853 is absent from this extract; `place`
      // used further down is presumably declared there - confirm.
2854  int min_quality = m_MinMapQuality;
2855  _TRACE("Loading pileup "<<GetRefSeqId()<<" @ "<<chunk.GetRefSeqRange());
2856  size_t count = 0, skipped = 0;
2857 
2858  SBaseStats ss(graph_range.GetLength());
2859 
2860 #ifdef SKIP_TOO_LONG_ALIGNMENTS
2861  TSeqPos ref_len = m_File->GetRefSeqLength(GetRefSeqId());
2862 #endif
2863  CBamAlignIterator ait(*m_File, GetRefSeqId(), graph_range.GetFrom(), graph_range.GetLength());
2864  if ( m_SpotIdDetector ) {
      // NOTE(review): listing line 2865 is absent from this extract; it
      // presumably installs m_SpotIdDetector on the iterator - confirm.
2866  }
      // Two code paths: with a raw index iterator the CIGAR is read as binary
      // ops and the read as packed 4-bit codes; otherwise both are parsed
      // from their text form.
2867  if ( CBamRawAlignIterator* rit = ait.GetRawIndexIteratorPtr() ) {
2868  for( ; ait; ++ait ) {
2869  if ( !ss.AcceptAlign(ait) ) {
2870  ++skipped;
2871  continue;
2872  }
2873  TSeqPos align_pos = rit->GetRefSeqPos();
2874 #ifdef SKIP_TOO_LONG_ALIGNMENTS
2875  TSeqPos align_end = align_pos + ait.GetCIGARRefSize();
2876  if ( align_end > ref_len ) {
2877  ++skipped;
2878  continue;
2879  }
2880 #endif
2881  ++count;
2882 
      // NOTE(review): ss_pos is unsigned here but signed in the text branch
      // below; an alignment starting before the graph range wraps around and
      // relies on the `pos < len` checks in SBaseStats - confirm intent.
2883  TSeqPos ss_pos = align_pos - graph_range.GetFrom();
2884  TSeqPos read_pos = 0;
2885  CTempString read_raw = rit->GetShortSequenceRaw();
      // NOTE(review): this loop's `count` shadows the alignment counter
      // declared above, and the inner loops below shadow `i`.
2886  for ( Uint2 i = 0, count = rit->GetCIGAROpsCount(); i < count; ++i ) {
      // low 4 bits: operation code, remaining bits: segment length
2887  Uint4 op = rit->GetCIGAROp(i);
2888  Uint4 seglen = op >> 4;
2889  switch ( op & 0xf ) {
2890  case SBamAlignInfo::kCIGAR_eq: // =
2891  // match
2892  for ( TSeqPos i = 0; i < seglen; ++i ) {
2893  ss.add_match(ss_pos);
2894  ++ss_pos;
2895  }
2896  read_pos += seglen;
2897  break;
2898  case SBamAlignInfo::kCIGAR_M: // M
2899  case SBamAlignInfo::kCIGAR_X: // X
2900  // mismatch ('X') or
2901  // unspecified 'alignment match' ('M') that can be a mismatch too
2902  for ( TSeqPos i = 0; i < seglen; ++i ) {
2903  ss.add_base_raw(ss_pos, sx_GetBaseRaw(read_raw, read_pos));
2904  ++ss_pos;
2905  ++read_pos;
2906  }
2907  break;
2908  case SBamAlignInfo::kCIGAR_I: // I
2909  case SBamAlignInfo::kCIGAR_S: // S
      // insertion / soft clip: consumes read bases only
2910  read_pos += seglen;
2911  break;
2912  case SBamAlignInfo::kCIGAR_N: // N
2913  // intron
2914  ss.add_intron(ss_pos, seglen);
2915  ss_pos += seglen;
2916  break;
2917  case SBamAlignInfo::kCIGAR_D: // D
2918  // gap or intron
2919  ss.add_gap(ss_pos, seglen);
2920  ss_pos += seglen;
2921  break;
2922  default: // P
2923  break;
2924  }
2925  }
2926  }
2927  }
2928  else {
2929  for( ; ait; ++ait ) {
2930  if ( !ss.AcceptAlign(ait) ) {
2931  ++skipped;
2932  continue;
2933  }
2934  TSeqPos align_pos = ait.GetRefSeqPos();
2935 #ifdef SKIP_TOO_LONG_ALIGNMENTS
2936  TSeqPos align_end = align_pos + ait.GetCIGARRefSize();
2937  if ( align_end > ref_len ) {
2938  ++skipped;
2939  continue;
2940  }
2941 #endif
2942  ++count;
2943 
2944  TSignedSeqPos ss_pos = align_pos - graph_range.GetFrom();
2945  TSeqPos read_pos = ait.GetCIGARPos();
2946  CTempString read = ait.GetShortSequence();
2947  CTempString cigar = ait.GetCIGAR();
2948  const char* ptr = cigar.data();
2949  const char* end = ptr + cigar.size();
      // The text CIGAR is parsed as <type char><decimal length> pairs: the
      // type character comes first, followed by its digits.
2950  while ( ptr != end ) {
2951  char type = *ptr;
2952  TSeqPos seglen = 0;
2953  for ( ; ++ptr != end; ) {
2954  char c = *ptr;
2955  if ( c >= '0' && c <= '9' ) {
2956  seglen = seglen*10+(c-'0');
2957  }
2958  else {
2959  break;
2960  }
2961  }
2962  if ( seglen == 0 ) {
      // a zero or missing length stops processing of this alignment
2963  ERR_POST_X(4, "CBAMDataLoader: Bad CIGAR length: "<<type<<"0 in "<<cigar);
2964  break;
2965  }
2966  if ( type == '=' ) {
2967  // match
2968  for ( TSeqPos i = 0; i < seglen; ++i ) {
2969  ss.add_match(ss_pos);
2970  ++ss_pos;
2971  }
2972  read_pos += seglen;
2973  }
2974  else if ( type == 'M' || type == 'X' ) {
2975  // mismatch ('X') or
2976  // unspecified 'alignment match' ('M') that can be a mismatch too
2977  for ( TSeqPos i = 0; i < seglen; ++i ) {
2978  ss.add_base(ss_pos, read[read_pos]);
2979  ++ss_pos;
2980  ++read_pos;
2981  }
2982  }
2983  else if ( type == 'I' || type == 'S' ) {
2984  if ( type == 'S' ) {
2985  // soft clipping already accounted in seqpos
2986  continue;
2987  }
2988  read_pos += seglen;
2989  }
2990  else if ( type == 'N' ) {
2991  // intron
2992  ss.add_intron(ss_pos, seglen);
2993  ss_pos += seglen;
2994  }
2995  else if ( type == 'D' ) {
2996  // gap or intron
2997  ss.add_gap(ss_pos, seglen);
2998  ss_pos += seglen;
2999  }
3000  else if ( type != 'P' ) {
      // unknown operation: stop processing this alignment ('P' is ignored)
3001  ERR_POST_X(14, "CBAMDataLoader: Bad CIGAR char: "<<type<<" in "<<cigar);
3002  break;
3003  }
3004  }
3005  }
3006  }
3007  if ( GetDebugLevel() >= 3 ) {
3008  LOG_POST_X(11, Info<<"CBAMDataLoader: "
3009  "Collected pileup counts "<<GetRefSeqId()<<
3010  " id="<<chunk_info.GetChunkId()<<
3011  " @ "<<chunk.GetRefSeqRange()<<": "<<
3012  count<<" skipped: "<<skipped<<" in "<<sw.Elapsed());
3013  }
3014  if ( count == 0 ) {
3015  // zero pileup graphs
3016  chunk_info.SetLoaded();
3017  return;
3018  }
3019 
3020  ss.finish_add();
3021 
      // graph annot named "<annot name> <PILEUP_NAME_SUFFIX>"
3022  CRef<CSeq_annot> annot(new CSeq_annot);
3023  {
3024  string name = m_File->GetAnnotName();
3025  name += ' ';
3026  name += PILEUP_NAME_SUFFIX;
3027  CRef<CAnnotdesc> desc(new CAnnotdesc);
3028  desc->SetName(name);
3029  annot->SetDesc().Set().push_back(desc);
3030  }
3031  size_t total_bytes = 0;
      // NOTE(review): listing lines 3032, 3034, 3036 and 3040 are absent from
      // this extract; `ref_id` and `max` used below are presumably declared
      // there, along with the conditions guarding the two `continue`
      // statements - confirm.
3033  for ( int k = 0; k < SBaseStats::kNumStat; ++k ) {
3035  if ( max == 0 ) {
3037  // do not generate empty 'matches' or 'introns' graph
3038  continue;
3039  }
3041  // do not generate empty graph (configurable)
3042  continue;
3043  }
3044  }
3045  CRef<CSeq_graph> graph(new CSeq_graph);
3046  static const char* const titles[SBaseStats::kNumStat] = {
3047  "Number of A bases",
3048  "Number of C bases",
3049  "Number of G bases",
3050  "Number of T bases",
3051  "Number of inserts",
3052  "Number of matches",
3053  "Number of introns"
3054  };
3055  graph->SetTitle(titles[k]);
3056  CSeq_interval& loc = graph->SetLoc().SetInt();
3057  loc.SetId(*ref_id);
3058  loc.SetFrom(graph_range.GetFrom());
3059  loc.SetTo(graph_range.GetTo());
3060  graph->SetNumval(graph_range.GetLength());
3061 
      // counts below 256 are stored as a byte graph, larger ones as an int
      // graph; the memory estimate adds a flat 10000 per graph
3062  if ( max < 256 ) {
3063  CByte_graph& data = graph->SetGraph().SetByte();
3064  data.SetValues().assign(ss.cc[k].begin(), ss.cc[k].end());
3065  data.SetMin(0);
3066  data.SetMax(max);
3067  data.SetAxis(0);
3068  total_bytes += graph_range.GetLength()*sizeof(data.GetValues()[0])+10000;
3069  }
3070  else {
3071  CInt_graph& data = graph->SetGraph().SetInt();
3072  data.SetValues().assign(ss.cc[k].begin(), ss.cc[k].end());
3073  data.SetMin(0);
3074  data.SetMax(max);
3075  data.SetAxis(0);
3076  total_bytes += graph_range.GetLength()*sizeof(data.GetValues()[0])+10000;
3077  }
3078  annot->SetData().SetGraph().push_back(graph);
3079  }
3080  chunk_info.x_LoadAnnot(place, *annot);
3081  chunk_info.x_AddUsedMemory(total_bytes);
3082 
      // NOTE(review): the fragments below print "<ref> @  id=<id><range>";
      // the " @ " separator looks misplaced compared with the "Collected"
      // message above (" id=<id> @ <range>") - confirm before changing.
3083  if ( GetDebugLevel() >= 3 ) {
3084  LOG_POST_X(11, Info<<"CBAMDataLoader: "
3085  "Loaded pileup "<<GetRefSeqId()<<" @ "<<
3086  " id="<<chunk_info.GetChunkId()<<
3087  chunk.GetRefSeqRange()<<": "<<
3088  count<<" skipped: "<<skipped<<" in "<<sw.Elapsed());
3089  }
3090 
3091  chunk_info.SetLoaded();
3092 }
3093 #endif // USE_NEW_PILEUP
3094 
3095 
3096 /////////////////////////////////////////////////////////////////////////////
3097 // CBamRefSeqChunkInfo
3098 /////////////////////////////////////////////////////////////////////////////
3099 
3100 
// Account for one more alignment covering 'range': bump the alignment
// counter, merge 'range' into m_RefSeqRange via operator+=, and keep
// m_MaxRefSeqFrom at the largest start position seen so far.
// NOTE(review): the signature (listing line 3101) is absent from this
// extract; the symbol index at the end of the listing gives
// `void AddRefSeqRange(const TRange &range)` - confirm.
3102 {
3103  ++m_AlignCount;
3104  m_RefSeqRange += range;
3105  m_MaxRefSeqFrom = max(m_MaxRefSeqFrom, range.GetFrom());
3106 }
3107 
3108 
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
static const size_t kChunkSize
static bool GetMinMapQualityParam(void)
static const Uint8 kDefaultSplitBinDataSize
NCBI_DEFINE_ERR_SUBCODE_X(30)
#define RETRY(expr)
static double s_CreateTime
BEGIN_LOCAL_NAMESPACE
static string GetMapperContext(void)
static const size_t kChunkDataSize
static const double k_make_align_seconds
static const TSeqPos kDefaultSplitBinMinLength
static string GetIncludeAlignTagsParam(void)
std::invoke_result< Call >::type CallWithRetry(Call &&call, const char *name, int retry_count=0)
END_LOCAL_NAMESPACE
static const CUser_field & GetIdField(const CUser_field &field, int id)
static TSeqPos s_GetGapToIntronThreshold(void)
static int GetDebugLevel(void)
NCBI_PARAM_DEF(bool, BAM_LOADER, PILEUP_GRAPHS, true)
static const double k_make_graph_seconds
static const Uint8 kSingleAlignBytes
static const double k_make_read_seconds
static Uint8 GetSplitBinDataSize(void)
static bool GetPreferRawIndexOverCoverageGraphParam(void)
static string GetMapperFileName(void)
static bool GetSkipEmptyPileupGraphsParam(void)
static TSeqPos s_GetEnd(const vector< TSeqPos > &over_ends, TSeqPos i, TSeqPos bin_size)
static bool s_GetMakeIntronGraph(void)
static const size_t kSplitLevelsChunkDataSize
static bool GetEstimatedCoverageGraphParam(void)
static bool GetPileupGraphsParam(void)
NCBI_PARAM_DEF_EX(int, BAM_LOADER, DEBUG, 0, eParam_NoThread, BAM_LOADER_DEBUG)
static TSeqPos GetSplitBinMinLength(void)
static const int kMainChunkId
static double s_AttachTime
static Uint1 sx_GetBaseRaw(CTempString read_raw, TSeqPos pos)
NCBI_PARAM_DECL(int, BAM_LOADER, DEBUG)
#define PILEUP_NAME_SUFFIX
EChunkIdType
@ eChunk_align1
@ eChunk_align
@ eChunk_align2
@ eChunk_short_seq1
@ eChunk_short_seq2
@ eChunk_pileup_graph
@ eChunk_short_seq
@ eChunk_short_seq_pileup
@ kChunkIdMul
static const int kTSEId
#define SRZ_CONFIG_NAME
Definition: bamread.hpp:108
AutoPtr –.
Definition: ncbimisc.hpp:401
CAnnotdesc –.
Definition: Annotdesc.hpp:66
string m_BamName
bool operator==(const CBlobId &id) const
CSeq_id_Handle m_SeqId
bool operator<(const CBlobId &id) const
CBAMBlobId(const CTempString &str)
string ToString(void) const
Get string representation of blob id.
CDataSource::SGiFound GetGi(const CSeq_id_Handle &idh)
CRef< CBAMBlobId > GetRefSeqBlobId(const CSeq_id_Handle &idh)
CBamRefSeqInfo * GetRefSeqInfo(const CBAMBlobId &blob_id)
CBAMDataLoader::TAnnotNames GetPossibleAnnotNames(void) const
bool IsShortSeq(const CSeq_id_Handle &idh)
void LoadChunk(const CBAMBlobId &blob_id, CTSE_Chunk_Info &chunk)
double EstimateLoadSeconds(const CBAMBlobId &blob_id, const CTSE_Chunk_Info &chunk, Uint4 bytes)
CDataSource::SAccVerFound GetAccVer(const CSeq_id_Handle &idh)
string GetLabel(const CSeq_id_Handle &idh)
CBAMDataLoader_Impl(const CBAMDataLoader::SLoaderParams &params)
void GetIds(const CSeq_id_Handle &idh, TIds &ids)
TTaxId GetTaxId(const CSeq_id_Handle &idh)
vector< CSeq_id_Handle > TIds
bool BAMFilesOpened() const
friend class CBamFileInfo
void AddBamFile(const CBAMDataLoader::SBamFileName &bam)
void LoadBAMEntry(const CBAMBlobId &blob_id, CTSE_LoadLock &load_lock)
AutoPtr< IIdMapper > m_IdMapper
CRef< CBAMBlobId > GetShortSeqBlobId(const CSeq_id_Handle &idh)
vector< SDirSeqInfo > TSeqInfos
static void SetMinMapQualityParamDefault(int param)
static void SetIncludeAlignTagsParamDefault(const string &param)
static void SetPileupGraphsParamDefault(bool param)
static string GetIncludeAlignTagsParamDefault(void)
static bool GetPreOpenParam(void)
static void SetEstimatedCoverageGraphParamDefault(bool param)
static void SetSkipEmptyPileupGraphsParamDefault(bool param)
static bool GetPileupGraphsParamDefault(void)
static bool GetEstimatedCoverageGraphParamDefault(void)
vector< CAnnotName > TAnnotNames
Definition: bamloader.hpp:122
static int GetMinMapQualityParamDefault(void)
static void SetPreOpenParam(bool param)
static bool GetSkipEmptyPileupGraphsParamDefault(void)
CBam2Seq_graph.
Definition: bamgraph.hpp:73
void SetEstimated(bool estimated=true)
Definition: bamgraph.cpp:171
void SetAnnotName(const string &name)
Definition: bamgraph.cpp:97
CRef< CSeq_annot > MakeSeq_annot(CBamMgr &mgr, const string &bam_file, const string &bam_index)
Generate Seq-annot for BAM file using BAM file index.
Definition: bamgraph.cpp:708
void SetRefLabel(const string &ref_label)
Definition: bamgraph.cpp:79
void SetRefId(const CSeq_id &ref_id)
Definition: bamgraph.cpp:85
void SetMinMapQuality(int qual)
Definition: bamgraph.cpp:127
ISpotIdDetector interface is used to detect spot id in case of incorrect flag combination.
Definition: bamread.hpp:720
CRef< CSeq_align > GetMatchAlign(void) const
Definition: bamread.cpp:2940
CRef< CBioseq > GetShortBioseq(void) const
Definition: bamread.cpp:2860
CTempString GetShortSequence(void) const
Definition: bamread.cpp:2287
TSeqPos GetShortSequenceLength(void) const
Definition: bamread.cpp:2305
CRef< CSeq_id > GetShortSeq_id(void) const
Definition: bamread.cpp:2561
CRef< CSeq_annot > GetSeq_annot(void) const
Definition: bamread.hpp:929
TSeqPos GetRefSeqPos(void) const
Definition: bamread.cpp:2207
CTempString GetCIGAR(void) const
Definition: bamread.cpp:2347
TSeqPos GetCIGARRefSize(void) const
Definition: bamread.cpp:2394
CBamRawAlignIterator * GetRawIndexIteratorPtr() const
Definition: bamread.hpp:748
Uint1 GetMapQuality(void) const
Definition: bamread.cpp:2671
void SetSpotIdDetector(ISpotIdDetector *spot_id_detector)
Definition: bamread.hpp:729
TSeqPos GetCIGARPos(void) const
Definition: bamread.cpp:2335
bool IsSetStrand(void) const
Definition: bamread.cpp:2644
ENa_strand GetStrand(void) const
Definition: bamread.cpp:2656
size_t CollectPileup(SPileupValues &values, const string &ref_id, CRange< TSeqPos > graph_range, ICollectPileupCallback *callback=0, SPileupValues::EIntronMode intron_mode=SPileupValues::eNoCountIntron, TSeqPos gap_to_intron_threshold=kInvalidSeqPos) const
bool UsesRawIndex() const
Definition: bamread.hpp:216
bool IncludeAlignTag(CTempString tag)
Definition: bamread.cpp:1622
void SetIdMapper(IIdMapper *idmapper, EOwnership ownership)
Definition: bamread.hpp:236
TSeqPos GetRefSeqLength(const string &str) const
Definition: bamread.cpp:1023
CBamRawDb & GetRawDb()
Definition: bamread.hpp:220
CBamDb & GetBamDb(void)
const string & GetBamName(void) const
void GetRefSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
const string & GetAnnotName(void) const
TRefSeqs m_RefSeqs
void GetShortSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
void x_Initialize(const CBAMDataLoader_Impl &impl, const CBAMDataLoader::SBamFileName &bam)
CBamRefSeqInfo * GetRefSeqInfo(const CSeq_id_Handle &seq_id) const
void AddRefSeq(const string &refseq_label, const CSeq_id_Handle &refseq_id)
CBamFileInfo(const CBAMDataLoader_Impl &impl, const CBAMDataLoader::SBamFileName &bam, const string &refseq_label=kEmptyStr, const CSeq_id_Handle &seq_id=CSeq_id_Handle())
TSeqPos GetRefSeqLength(const string &id) const
static Uint8 GetFileSize(CBGZFRange range)
Definition: bamindex.cpp:1940
const SBamIndexRefIndex & GetRef(size_t ref_index) const
Definition: bamindex.cpp:1363
CBGZFPos GetFilePos() const
Definition: bamindex.hpp:1510
bool IsOnMinBinIndexLevel() const
Definition: bamindex.hpp:1614
size_t GetRefIndex(const string &ref_label) const
Definition: bamindex.hpp:1026
double GetEstimatedSecondsPerByte() const
Definition: bamindex.cpp:2032
const CBamIndex & GetIndex() const
Definition: bamindex.hpp:1014
TSeqPos GetRefSeqLength(size_t ref_index) const
Definition: bamindex.hpp:1034
void AddRefSeqRange(const TRange &range)
TRange GetAlignStartRange() const
const TRange & GetRefSeqRange(void) const
void LoadMainChunk(CTSE_Chunk_Info &chunk_info)
void SetBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
void x_LoadRangesScan(void)
CBamRefSeqInfo(CBamFileInfo *bam_file, const string &refseqid, const CSeq_id_Handle &seq_id)
void x_AddSeqChunk(CTSE_Chunk_Info &chunk_info, const vector< CSeq_id_Handle > &short_ids)
void x_LoadRangesStat(void)
void LoadSeqChunk(CTSE_Chunk_Info &chunk_info)
void LoadPileupChunk(CTSE_Chunk_Info &chunk_info)
const string & GetRefSeqId(void) const
void GetShortSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
CRange< TSeqPos > TRange
CRef< CSeq_entry > m_CovEntry
double EstimatePileupLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
void LoadRanges(void)
void LoadMainSplit(CTSE_LoadLock &load_lock)
void LoadAlignChunk(CTSE_Chunk_Info &chunk_info)
double EstimateLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
const CSeq_id_Handle & GetRefSeq_id(void) const
double EstimateSeqLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
void CreateChunks(CTSE_Split_Info &split_info)
CRange< TSeqPos > GetChunkGraphRange(size_t range_id)
void LoadMainEntry(CTSE_LoadLock &load_lock)
CBamFileInfo * m_File
bool x_LoadRangesEstimated(void)
TSeq2Chunk m_Seq2Chunk
double EstimateAlignLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
void LoadChunk(CTSE_Chunk_Info &chunk_info)
CIRef< CBamAlignIterator::ISpotIdDetector > m_SpotIdDetector
bool x_LoadRangesCov(void)
void x_InitAlignIterator(CBamAlignIterator &ait, TSeqPos &max_end_pos, CTSE_Chunk_Info &chunk_info, int base_id)
Blob state exceptions, used by GenBank loader.
CByte_graph –.
Definition: Byte_graph.hpp:66
void AddSpotId(string &short_id, const CBamAlignIterator *iter)
map< string, SShortSeqInfo > m_ShortSeqs
CDirEntry –.
Definition: ncbifile.hpp:262
CFastMutex –.
Definition: ncbimtx.hpp:667
CFile –.
Definition: ncbifile.hpp:1604
IdMapper implementation using an external configuration file.
Definition: idmapper.hpp:189
IdMapper base class implementation.
Definition: idmapper.hpp:56
CInt_graph –.
Definition: Int_graph.hpp:66
Data loader exceptions, used by GenBank loader.
CObject –.
Definition: ncbiobj.hpp:180
CSafeStatic<>::
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
@ e_Iupacna
Definition: sequtil.hpp:47
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
TAnnot & SetAnnot(void)
Definition: Seq_entry.cpp:195
const value_type * data() const
string FindAccPathNoThrow(const string &acc)
Definition: bamread.hpp:131
CStopWatch –.
Definition: ncbitime.hpp:1938
pair< TBioseqId, TBioseq_setId > TPlace
void x_LoadAnnot(const TPlace &place, const CSeq_annot &annot)
void x_LoadBioseqs(const TPlace &place, const list< CRef< CBioseq > > &bioseqs)
void x_AddUsedMemory(size_t size)
void SetLoaded(CObject *obj=0)
void x_AddBioseqPlace(TBioseq_setId id)
TChunkId GetChunkId(void) const
void x_AddAnnotType(const CAnnotName &annot_name, const SAnnotTypeSelector &annot_type, const TLocationId &location_id)
void x_SetLoadBytes(Uint4 bytes)
const CTSE_Split_Info & GetSplitInfo(void) const
void x_AddBioseqId(const TBioseqId &id)
CTSE_Split_Info & GetSplitInfo(void)
Definition: tse_info.cpp:1395
void SetSeq_entry(CSeq_entry &entry, CTSE_SetObjectInfo *set_info=0)
Definition: tse_info.cpp:351
void AddChunk(CTSE_Chunk_Info &chunk_info)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CConstRef< CUser_field > GetFieldRef(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Definition: User_object.cpp:84
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Definition: User_object.cpp:71
static void CopyBuffer(const int *src, size_t count, int *dest)
Copy memory buffer (only when source and destination do not overlap!).
Definition: ncbi_fast.hpp:409
static void ConvertBuffer(const char *src, size_t count, int *dest)
Convert memory buffer elements from one type to another.
Definition: ncbi_fast.hpp:449
static void AppendZeros(vector< V, A > &dest, size_t count)
Append count zeros to dest vector; the vector must have enough memory reserved.
Definition: ncbi_fast.hpp:347
static void SplitBufferInto4(const int *src, size_t count, int *dest0, int *dest1, int *dest2, int *dest3)
Split source memory buffer into 4 buffers. Source buffer contains 4*count elements. Each destination bu...
Definition: ncbi_fast.hpp:478
static V * AppendUninitialized(vector< V, A > &dest, size_t count)
Append count uninitialized elements to dest vector; return pointer to appended elements for proper initi...
Definition: ncbi_fast.hpp:330
static void AppendZerosAligned16(vector< V, A > &dest, size_t count)
Append count zeros to dest vector; the vector must have enough memory reserved; dst.end() pointer and count...
Definition: ncbi_fast.hpp:352
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
Include a standard set of the NCBI C++ Toolkit most basic headers.
@ kStat_Match
@ kNumStat
constexpr auto begin(const ct_const_array< T, N > &in) noexcept
constexpr auto end(const ct_const_array< T, N > &in) noexcept
const char * file_name[]
static const struct name_t names[]
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
char data[12]
Definition: iconv.c:80
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
element_type * release(void)
Release will release ownership of pointer to caller.
Definition: ncbimisc.hpp:472
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
string
Definition: cgiapp.hpp:687
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define LOG_POST_X(err_subcode, message)
Definition: ncbidiag.hpp:553
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW_FMT(exception_class, err_code, message)
The same as NCBI_THROW but with message processed as output to ostream.
Definition: ncbiexpt.hpp:719
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
string GetBase(void) const
Get the base entry name without extension.
Definition: ncbifile.hpp:3924
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
CConstRef< CSeq_id > GetSeqId(void) const
bool IsGi(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
string GetLabel(const CSeq_id &id)
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
TGi GetGi(void) const
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
TObjectType * GetNCPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1174
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
Definition: ncbi_param.hpp:149
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
#define kMax_UI8
Definition: ncbi_limits.h:222
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define kMax_UI4
Definition: ncbi_limits.h:219
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & SetToOpen(position_type toOpen)
Definition: range.hpp:175
position_type GetToOpen(void) const
Definition: range.hpp:138
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Stop(void)
Suspend the timer.
Definition: ncbitime.hpp:2793
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1942
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
const TFields & GetFields(void) const
Get the variant data.
vector< CRef< CUser_field > > TFields
bool IsSetLabel(void) const
Field label. Check if a value has been assigned to Label data member.
TInt GetInt(void) const
Get the variant data.
TReal GetReal(void) const
Get the variant data.
const TLabel & GetLabel(void) const
Get the Label member data.
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
@ e_Local
local use
Definition: Seq_id_.hpp:95
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
vector< char > TValues
Definition: Byte_graph_.hpp:89
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_graph_.cpp:131
void SetNumval(TNumval value)
Assign a value to Numval data member.
const TGraph & GetGraph(void) const
Get the Graph member data.
TB GetB(void) const
Get the B member data.
const TByte & GetByte(void) const
Get the variant data.
Definition: Seq_graph_.cpp:153
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
vector< int > TValues
Definition: Int_graph_.hpp:88
bool IsByte(void) const
Check if variant Byte is selected.
Definition: Seq_graph_.hpp:757
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
TNumval GetNumval(void) const
Get the Numval member data.
TA GetA(void) const
Get the A member data.
TComp GetComp(void) const
Get the Comp member data.
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
void SetId(TId &value)
Assign a value to Id data member.
Definition: Bioseq_set_.cpp:93
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
const Tdata & Get(void) const
Get the member data.
const TGraph & GetGraph(void) const
Get the variant data.
Definition: Seq_annot_.hpp:661
list< CRef< CSeq_graph > > TGraph
Definition: Seq_annot_.hpp:195
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
TName & SetName(void)
Select the variant.
Definition: Annotdesc_.hpp:508
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Seq_annot_.hpp:852
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
list< CRef< CAnnotdesc > > Tdata
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
@ eEmptyGuard
Definition: guard.hpp:94
static CStopWatch sw
#define DEBUG
Definition: config.h:32
Definition of all error codes used in SRA C++ support libraries.
int i
int len
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
string s_Value(TValue value)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
void SleepMilliSec(unsigned long ml_sec, EInterruptOnSignal onsignal=eRestartOnSignal)
T max(T x_, T y_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
static unsigned cnt[256]
Helper classes and templates to implement plugins.
const unsigned int kDefaultRetryCount
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
vector< SBamFileName > m_BamFiles
Definition: bamloader.hpp:71
TCount get_max_count(int type) const
Definition: bamread.hpp:488
const TCount * get_intron_counts() const
Definition: bamread.hpp:363
CSimpleBufferT< TCount > cc_gap
Definition: bamread.hpp:330
CSimpleBufferT< TCount > cc_intron
Definition: bamread.hpp:332
CSimpleBufferT< TCount > cc_match
Definition: bamread.hpp:331
CSimpleBufferT< SCountACGT > cc_acgt
Definition: bamread.hpp:328
const TCount * get_acgt_counts() const
Definition: bamread.hpp:343
const TCount * get_split_acgt_counts(int k, TSeqPos len) const
Definition: bamread.hpp:348
const TCount * get_gap_counts() const
Definition: bamread.hpp:359
Better replacement of GetAccVer(), this method should be defined in data loaders, GetAccVer() is left...
Better replacement of GetGi(), this method should be defined in data loaders, GetGi() is left for com...
constexpr TSeqPos GetMinBinSize() const
Definition: bamindex.hpp:208
vector< Uint8 > EstimateDataSizeByAlnStartPos(TSeqPos seqlen=kInvalidSeqPos) const
Definition: bamindex.cpp:1055
TSeqPos gap_to_intron_threshold
SBaseStats(TSeqPos len)
void add_match(TSeqPos pos)
TCount get_max_count(int type) const
void add_gap(TSignedSeqPos gap_pos, TSeqPos gap_len)
void x_finish_add(EStat stat)
void add_base_raw(TSeqPos pos, Uint1 b)
void x_add_gap_or_intron(TSignedSeqPos gap_pos, TSeqPos gap_len, EStat stat)
void add_base(TSeqPos pos, char b)
void get_maxs(TCount(&c_max)[kNumStat]) const
vector< TCount > cc[kNumStat]
void add_intron(TSignedSeqPos gap_pos, TSeqPos gap_len)
Definition: type.c:6
#define _ASSERT
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
Modified on Mon Apr 22 04:02:57 2024 by modify_doxy.py rev. 669887