NCBI C++ ToolKit
bamloader_impl.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_DATA_LOADERS_BAM___BAMLOADER_IMPL__HPP
2 #define OBJTOOLS_DATA_LOADERS_BAM___BAMLOADER_IMPL__HPP
3 
4 /* $Id: bamloader_impl.hpp 101440 2023-12-13 17:55:55Z vasilche $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Eugene Vasilchenko
30  *
31  * File Description: BAM file data loader
32  *
33  * ===========================================================================
34  */
35 
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbimtx.hpp>
41 
44 
45 class CDataLoader;
48 class CBamRefSeqInfo;
49 class CBamFileInfo;
50 
51 class CBAMBlobId : public CBlobId // Blob id type used by the BAM loader code in this header; derives from CBlobId.
52 {
53 public:
54  explicit CBAMBlobId(const CTempString& str); // Build the id from a string; presumably the inverse of ToString() - TODO confirm in the implementation.
55  CBAMBlobId(const string& bam_name, const CSeq_id_Handle& seq_id); // Build the id from a BAM name and a sequence id handle.
56  ~CBAMBlobId(void);
57 
58  string m_BamName; // BAM name component of the id (public data member).
60 // NOTE(review): listing line 59 is absent from this extraction; the symbol index of this page lists "CSeq_id_Handle m_SeqId".
61  string ToString(void) const; // Get string representation of blob id.
62  bool operator<(const CBlobId& id) const; // Ordering comparison against another CBlobId.
63  bool operator==(const CBlobId& id) const; // Equality comparison against another CBlobId.
64 };
65 
66 
68 {
69 public:
72  {
73  }
74 
75  // Estimated number of alignments in this chunk, read from m_AlignCount.
76  Uint8 GetAlignCount(void) const
77  {
78  return m_AlignCount; // the member declaration annotates the value 0 as "graph only"
79  }
80 
82 
83  void AddRefSeqRange(const TRange& range);
84 
85  // Full reference sequence range covered by alignments in this chunk.
86  const TRange& GetRefSeqRange(void) const
87  {
88  return m_RefSeqRange; // returned by const reference to the stored member, no copy is made
89  }
90  // max start position of an alignment in this chunk
92  {
93  return m_MaxRefSeqFrom;
94  }
95  // All alignments of the chunk have their start position in the range
96  // GetRefSeqRange().GetFrom()...GetMaxRefSeqFrom().
97  // The chunk is registered as having alignments in range GetRefSeqRange(),
98  // and as having pileup graphs over range GetRefSeqRange() excluding the start of the next chunk.
99  const TRange& GetAlignRange() const
100  {
101  return GetRefSeqRange(); // the alignment range is exactly the full reference range of the chunk
102  }
104  {
106  range.SetTo(GetMaxRefSeqFrom());
107  return range;
108  }
109  /*
110  tuple<TRange, int> GetGraphRangeUnfiltered() const
111  {
112  TRange range = GetRefSeqRange();
113  return make_tuple(GetRefSeqRange(), m_PileupChunkCount);
114  }
115  */
116 
117 protected:
118  friend class CBamRefSeqInfo;
119 
120  Uint8 m_AlignCount; // 0 - graph only
127 };
128 
129 
130 class CBamRefSeqInfo : public CObject // Information kept for one reference sequence of a BAM file; derives from CObject.
131 {
132 public:
133  CBamRefSeqInfo(CBamFileInfo* bam_file,
134  const string& refseqid,
135  const CSeq_id_Handle& seq_id); // bam_file: presumably the CBamFileInfo this reference belongs to (ownership not visible here - TODO confirm); refseqid / seq_id: the reference as a string and as a Seq-id handle.
136 
137  const string& GetRefSeqId(void) const // Reference sequence id in string form.
138  {
139  return m_RefSeqId;
140  }
141  const CSeq_id_Handle& GetRefSeq_id(void) const // Reference sequence id as a CSeq_id_Handle.
142  {
143  return m_RefSeq_id;
144  }
145 
146  void SetCovFileName(const string& name) // Stores name in m_CovFileName ("Cov" presumably means coverage - TODO confirm).
147  {
148  m_CovFileName = name;
149  }
150 
151  void LoadRanges(void); // NOTE(review): presumably drives the x_LoadRanges*() helpers declared in the protected section - confirm in the implementation.
152  // Return the sequence range covered by the pileup chunk range_id.
153  // NOTE: the "second value" (number of covered alignment chunks,
154  // or 0 if range_id should not be a pileup chunk) describes the disabled tuple-returning variant kept below; the active declaration returns only the range.
155  //tuple<CRange<TSeqPos>, size_t> GetChunkGraphRange(size_t range_id);
156  CRange<TSeqPos> GetChunkGraphRange(size_t range_id);
157 
158  void LoadMainSplit(CTSE_LoadLock& load_lock); // Entry points taking the TSE load lock.
159  void LoadMainEntry(CTSE_LoadLock& load_lock);
160  void CreateChunks(CTSE_Split_Info& split_info); // Operates on the split info passed by non-const reference.
161  double EstimateLoadSeconds(const CTSE_Chunk_Info& chunk,
162  Uint4 bytes) const; // Load-time estimate in seconds for chunk, given a byte count.
163  double EstimatePileupLoadSeconds(const CTSE_Chunk_Info& chunk,
164  Uint4 bytes) const; // Pileup/Align/Seq variants: presumably per-chunk-type estimates behind EstimateLoadSeconds() - TODO confirm.
165  double EstimateAlignLoadSeconds(const CTSE_Chunk_Info& chunk,
166  Uint4 bytes) const;
167  double EstimateSeqLoadSeconds(const CTSE_Chunk_Info& chunk,
168  Uint4 bytes) const;
169  void LoadChunk(CTSE_Chunk_Info& chunk_info); // Chunk loaders; the Main/Align/Seq/Pileup variants presumably are selected by LoadChunk() - TODO confirm.
170  void LoadMainChunk(CTSE_Chunk_Info& chunk_info);
171  void LoadAlignChunk(CTSE_Chunk_Info& chunk_info);
172  void LoadSeqChunk(CTSE_Chunk_Info& chunk_info);
173  void LoadPileupChunk(CTSE_Chunk_Info& chunk_info);
174 
176  const CSeq_id_Handle& idh) const; // NOTE(review): listing line 175 (start of this declaration) is absent from this extraction; the page's symbol index suggests GetShortSeqBlobId(CRef<CBAMBlobId>& ret, ...).
177  void SetBlobId(CRef<CBAMBlobId>& ret,
178  const CSeq_id_Handle& idh) const; // Result is delivered through the ret output parameter.
179 
180 protected:
182  typedef vector<CBamRefSeqChunkInfo> TChunks; // Sequence of per-chunk descriptors.
184 
185  void x_LoadRangesScan(void); // Range-loading helpers; the two bool-returning ones presumably report whether they succeeded - TODO confirm.
186  void x_LoadRangesStat(void);
187  bool x_LoadRangesCov(void);
188  bool x_LoadRangesEstimated(void);
189  void x_InitAlignIterator(CBamAlignIterator& ait, TSeqPos& max_end_pos,
190  CTSE_Chunk_Info& chunk_info, int base_id); // ait and max_end_pos are non-const references, i.e. in/out parameters.
191  void x_AddSeqChunk(CTSE_Chunk_Info& chunk_info,
192  const vector<CSeq_id_Handle>& short_ids);
193 
195  string m_RefSeqId; // Backing store of GetRefSeqId(); other data members are absent from this extraction.
205 };
206 
207 
208 class CBamFileInfo : public CObject // Information kept for one BAM file (name, annotation name, BAM database handle, mutex); derives from CObject.
209 {
210 public:
212  const CBAMDataLoader::SBamFileName& bam,
213  const string& refseq_label = kEmptyStr,
214  const CSeq_id_Handle& seq_id = CSeq_id_Handle()); // NOTE(review): listing line 211 (start of this constructor declaration) is absent from this extraction; refseq_label and seq_id default to empty values.
215 
216  const string& GetBamName(void) const // BAM name, as stored in m_BamName.
217  {
218  return m_BamName;
219  }
220  const string& GetAnnotName(void) const // Annotation name, as stored in m_AnnotName.
221  {
222  return m_AnnotName;
223  }
224 
226  const CSeq_id_Handle& idh) const; // NOTE(review): first line of this declaration (listing line 225) is absent from this extraction.
228  const CSeq_id_Handle& idh) const; // NOTE(review): first line of this declaration (listing line 227) is absent from this extraction.
229 
230  CBamRefSeqInfo* GetRefSeqInfo(const CSeq_id_Handle& seq_id) const; // Look up the reference info for seq_id; presumably returns null when unknown - TODO confirm.
231 
232  TSeqPos GetRefSeqLength(const string& id) const // Length of reference id, delegated to the BAM database object.
233  {
234  return m_BamDb.GetRefSeqLength(id);
235  }
236 
237  CMutex& GetMutex(void) const // Hands out a non-const mutex from a const method, so m_BamMutex is presumably declared mutable (its declaration is not visible here).
238  {
239  return m_BamMutex;
240  }
241 
243  { // NOTE(review): signature line (listing line 242) is absent from this extraction; the symbol index lists "CBamDb & GetBamDb(void)".
244  return m_BamDb;
245  }
246  operator CBamDb&(void) // Implicit conversion to CBamDb&; forwards to GetBamDb().
247  {
248  return GetBamDb();
249  }
250 
251  void AddRefSeq(const string& refseq_label,
252  const CSeq_id_Handle& refseq_id); // Register a reference sequence by label and Seq-id handle.
253 
254 protected:
256 
258  const CBAMDataLoader::SBamFileName& bam); // NOTE(review): first line of this declaration (listing line 257) is absent; the symbol index lists x_Initialize(const CBAMDataLoader_Impl&, const CBAMDataLoader::SBamFileName&).
259 
260  string m_BamName; // Backing store of GetBamName().
261  string m_AnnotName; // Backing store of GetAnnotName().
265 };
266 
267 
269 {
270 public:
271  explicit CBAMDataLoader_Impl(const CBAMDataLoader::SLoaderParams& params);
272  ~CBAMDataLoader_Impl(void);
273 
274  void AddSrzDef(void);
275  void AddBamFile(const CBAMDataLoader::SBamFileName& bam);
276  void OpenBAMFiles();
277  void OpenBAMFilesOnce();
278  bool BAMFilesOpened() const;
279 
282 
283  typedef pair<CBamFileInfo*, const CBamRefSeqInfo*> TRefSeqInfo;
284  CBamRefSeqInfo* GetRefSeqInfo(const CBAMBlobId& blob_id);
285  void LoadBAMEntry(const CBAMBlobId& blob_id,
286  CTSE_LoadLock& load_lock);
287  void LoadChunk(const CBAMBlobId& blob_id,
288  CTSE_Chunk_Info& chunk);
289  double EstimateLoadSeconds(const CBAMBlobId& blob_id,
290  const CTSE_Chunk_Info& chunk,
291  Uint4 bytes);
292 
294 
295  bool IsShortSeq(const CSeq_id_Handle& idh);
296  typedef vector<CSeq_id_Handle> TIds;
297  void GetIds(const CSeq_id_Handle& idh, TIds& ids);
300  string GetLabel(const CSeq_id_Handle& idh);
301  TTaxId GetTaxId(const CSeq_id_Handle& idh);
302 
303 protected:
304  friend class CBamFileInfo;
305  struct SDirSeqInfo { // Plain record type; it is the element type of the TSeqInfos vector typedef in the private section.
309  string m_Label; // Label string of the entry.
311  string m_AnnotName; // Annotation name string of the entry.
312  }; // NOTE(review): listing lines 306-308 and 310 (further members) are absent from this extraction.
313 
314 private:
316  typedef vector<SDirSeqInfo> TSeqInfos;
317 
318  // mutex guarding input into the map
319  mutable CMutex m_Mutex;
321  string m_DirPath;
325 };
326 
329 
330 #endif // OBJTOOLS_DATA_LOADERS_BAM___BAMLOADER_IMPL__HPP
string m_BamName
bool operator==(const CBlobId &id) const
CSeq_id_Handle m_SeqId
bool operator<(const CBlobId &id) const
CBAMBlobId(const CTempString &str)
string ToString(void) const
Get string representation of blob id.
CDataSource::SGiFound GetGi(const CSeq_id_Handle &idh)
CRef< CBAMBlobId > GetRefSeqBlobId(const CSeq_id_Handle &idh)
CBamRefSeqInfo * GetRefSeqInfo(const CBAMBlobId &blob_id)
CBAMDataLoader::TAnnotNames GetPossibleAnnotNames(void) const
bool IsShortSeq(const CSeq_id_Handle &idh)
void LoadChunk(const CBAMBlobId &blob_id, CTSE_Chunk_Info &chunk)
double EstimateLoadSeconds(const CBAMBlobId &blob_id, const CTSE_Chunk_Info &chunk, Uint4 bytes)
CDataSource::SAccVerFound GetAccVer(const CSeq_id_Handle &idh)
string GetLabel(const CSeq_id_Handle &idh)
CBAMDataLoader_Impl(const CBAMDataLoader::SLoaderParams &params)
map< string, CRef< CBamFileInfo > > TBamFiles
void GetIds(const CSeq_id_Handle &idh, TIds &ids)
TTaxId GetTaxId(const CSeq_id_Handle &idh)
vector< CSeq_id_Handle > TIds
bool BAMFilesOpened() const
pair< CBamFileInfo *, const CBamRefSeqInfo * > TRefSeqInfo
void AddBamFile(const CBAMDataLoader::SBamFileName &bam)
void LoadBAMEntry(const CBAMBlobId &blob_id, CTSE_LoadLock &load_lock)
AutoPtr< IIdMapper > m_IdMapper
CRef< CBAMBlobId > GetShortSeqBlobId(const CSeq_id_Handle &idh)
vector< SDirSeqInfo > TSeqInfos
vector< CAnnotName > TAnnotNames
Definition: bamloader.hpp:122
TSeqPos GetRefSeqLength(const string &str) const
Definition: bamread.cpp:1023
CBamDb & GetBamDb(void)
const string & GetBamName(void) const
void GetRefSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
const string & GetAnnotName(void) const
TRefSeqs m_RefSeqs
void GetShortSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
void x_Initialize(const CBAMDataLoader_Impl &impl, const CBAMDataLoader::SBamFileName &bam)
CBamRefSeqInfo * GetRefSeqInfo(const CSeq_id_Handle &seq_id) const
void AddRefSeq(const string &refseq_label, const CSeq_id_Handle &refseq_id)
CBamFileInfo(const CBAMDataLoader_Impl &impl, const CBAMDataLoader::SBamFileName &bam, const string &refseq_label=kEmptyStr, const CSeq_id_Handle &seq_id=CSeq_id_Handle())
CMutex & GetMutex(void) const
TSeqPos GetRefSeqLength(const string &id) const
map< CSeq_id_Handle, CRef< CBamRefSeqInfo > > TRefSeqs
void AddRefSeqRange(const TRange &range)
CRange< TSeqPos > TRange
TSeqPos GetMaxRefSeqFrom(void) const
TRange GetAlignStartRange() const
const TRange & GetAlignRange() const
const TRange & GetRefSeqRange(void) const
Uint8 GetAlignCount(void) const
void LoadMainChunk(CTSE_Chunk_Info &chunk_info)
void SetBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
void x_LoadRangesScan(void)
CBamRefSeqInfo(CBamFileInfo *bam_file, const string &refseqid, const CSeq_id_Handle &seq_id)
void x_AddSeqChunk(CTSE_Chunk_Info &chunk_info, const vector< CSeq_id_Handle > &short_ids)
void x_LoadRangesStat(void)
void LoadSeqChunk(CTSE_Chunk_Info &chunk_info)
void LoadPileupChunk(CTSE_Chunk_Info &chunk_info)
const string & GetRefSeqId(void) const
void GetShortSeqBlobId(CRef< CBAMBlobId > &ret, const CSeq_id_Handle &idh) const
CRange< TSeqPos > TRange
CRef< CSeq_entry > m_CovEntry
double EstimatePileupLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
void LoadRanges(void)
void LoadMainSplit(CTSE_LoadLock &load_lock)
void SetCovFileName(const string &name)
map< CSeq_id_Handle, int > TSeq2Chunk
void LoadAlignChunk(CTSE_Chunk_Info &chunk_info)
double EstimateLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
const CSeq_id_Handle & GetRefSeq_id(void) const
double EstimateSeqLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
vector< CBamRefSeqChunkInfo > TChunks
void CreateChunks(CTSE_Split_Info &split_info)
CRange< TSeqPos > GetChunkGraphRange(size_t range_id)
void LoadMainEntry(CTSE_LoadLock &load_lock)
CBamFileInfo * m_File
bool x_LoadRangesEstimated(void)
TSeq2Chunk m_Seq2Chunk
double EstimateAlignLoadSeconds(const CTSE_Chunk_Info &chunk, Uint4 bytes) const
CSeq_id_Handle m_RefSeq_id
void LoadChunk(CTSE_Chunk_Info &chunk_info)
CIRef< CBamAlignIterator::ISpotIdDetector > m_SpotIdDetector
bool x_LoadRangesCov(void)
void x_InitAlignIterator(CBamAlignIterator &ait, TSeqPos &max_end_pos, CTSE_Chunk_Info &chunk_info, int base_id)
CMutex –.
Definition: ncbimtx.hpp:749
CObject –.
Definition: ncbiobj.hpp:180
CRef –.
Definition: ncbiobj.hpp:618
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const char * str(char *buf, int n)
Definition: stats.c:84
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
range(_Ty, _Ty) -> range< _Ty >
Multi-threading – mutexes; rw-locks; semaphore.
CBAMDataLoader::SBamFileName m_BamFileName
Better replacement of GetAccVer(), this method should be defined in data loaders, GetAccVer() is left...
Better replacement of GetGi(), this method should be defined in data loaders, GetGi() is left for com...
Modified on Fri Sep 20 14:58:18 2024 by modify_doxy.py rev. 669887