NCBI C++ ToolKit
remote_blastdb_adapter.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_DATA_LOADERS_BLASTDB___REMOTE_BLASTDB_ADAPTER__HPP
2 #define OBJTOOLS_DATA_LOADERS_BLASTDB___REMOTE_BLASTDB_ADAPTER__HPP
3 
4 /* $Id: remote_blastdb_adapter.hpp 100101 2023-06-15 14:10:29Z merezhuk $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Christiam Camacho
30  *
31  * ===========================================================================
32  */
33 
34 /** @file remote_blastdb_adapter.hpp
35  * Declaration of the CRemoteBlastDbAdapter class.
36  */
37 
39 #include <cmath>
40 
43 
44 /// This class defines a "bundle" of elements to cache which are then returned
45 /// by CRemoteBlastDbAdapter. The actual data for this object comes from the
46 /// remote BLAST databases accessed by the blast4 server.
48 public:
49  /// Default constructor, needed to insert objects in std::map
50  CCachedSeqDataForRemote() : m_Length(0), m_UseFixedSizeSlices(0) {}
51 
52  /// Sets the length of the sequence data for a given Bioseq
53  void SetLength(TSeqPos length, bool use_fixed_size_slices) {
54  _ASSERT(m_UseFixedSizeSlices == 0);
55  m_UseFixedSizeSlices = use_fixed_size_slices;
56  _ASSERT(m_SeqDataVector.size() == 0);
57  m_Length = length;
58  m_SeqDataVector.resize(x_CalculateNumberOfSlices());
59  _ASSERT(m_SeqDataVector.size() != 0);
60  }
61 
62  /// Retrieve the sequence length
63  TSeqPos GetLength() const { return m_Length; }
64 
65  /// Sets the Seq-id's associated with a given sequence
66  /// param idlist IDs to assign to this object [in]
67  void SetIdList(const IBlastDbAdapter::TSeqIdList& idlist) {
68  m_IdList.clear();
69  copy(idlist.begin(), idlist.end(), back_inserter(m_IdList));
70  }
71 
72  /// Retrieve the Seq-id's associated with a given sequence
73  IBlastDbAdapter::TSeqIdList GetIdList() const { return m_IdList; }
74 
75  /// Set the Bioseq associated with a given sequence
76  /// @param bioseq Bioseq to assign to this object [in]
77  void SetBioseq(CRef<CBioseq> bioseq) {
78  m_Bioseq = bioseq;
79  }
80 
81  /// Retrieve the Bioseq associated with a given sequence
82  CRef<CBioseq> GetBioseq() const { return m_Bioseq; }
83 
84  /// Returns true if this object has been properly initialized and it's
85  /// ready to be used
86  bool IsValid() {
87  return m_Bioseq.NotEmpty() && GetLength() != 0 && !m_IdList.empty();
88  }
89 
90  /// Returns true if the requested range has sequence data already
91  /// @param begin starting offset in the sequence [in]
92  /// @param end ending offset in the sequence [in]
93  bool HasSequenceData(int begin, int end) {
94  return GetSeqDataChunk(begin, end).NotEmpty();
95  }
96 
97  /// Access the sequence data chunk for a given starting and ending offset
98  /// @param begin starting offset in sequence of interest [in]
99  /// @param end ending offset in sequence of interest [in]
100  CRef<CSeq_data>& GetSeqDataChunk(int begin, int end) {
101  _ASSERT(m_Length);
102  _ASSERT(m_SeqDataVector.size());
103  _ASSERT((begin % kRmtSequenceSliceSize) == 0);
104 
105  TSeqPos idx = 0;
106  if (m_UseFixedSizeSlices) {
107  idx = begin / kRmtSequenceSliceSize;
108  _ASSERT((end == (begin + (int)kRmtSequenceSliceSize)) ||
109  (idx+1 == m_SeqDataVector.size()));
110  } else {
111  if (((end-begin) % kRmtSequenceSliceSize) == 0) {
112  idx = ilog2( (end-begin)/kRmtSequenceSliceSize );
113  } else {
114  idx = static_cast<unsigned int>(m_SeqDataVector.size() - 1);
115  }
116  _ASSERT((end == (begin + (int)(0x1<<idx)*kRmtSequenceSliceSize)) ||
117  ((idx+1) == m_SeqDataVector.size()));
118  }
119  _ASSERT(m_SeqDataVector.size() > idx);
120  CRef<CSeq_data> & retval = m_SeqDataVector[idx];
121  return retval;
122  }
123 
124 private:
125  /// length of the sequence data
127  /// each element in this vector represents a "chunk" of the sequence data
128  vector< CRef<CSeq_data> > m_SeqDataVector;
129  /// List of Seq-id's associated with this sequence
131  /// the bioseq object for this object
133  /// Determines whether sequences should be fetched in fixed size slices or
134  /// in incrementally larger sizes.
136 
137  /// Calculates the number of slices in the same manner as the
138  /// CCachedSequence class in its SplitSeqData method.
139  /// FIXME: these methods should be kept in sync, refactoring is necessary
141  {
142  _ASSERT(m_Length);
143  TSeqPos retval = 0;
144  if (m_UseFixedSizeSlices) {
145  retval = (m_Length + kRmtSequenceSliceSize - 1) /
147  } else {
148  TSeqPos slice_size = kRmtSequenceSliceSize;
149  for (TSeqPos pos = 0; pos < m_Length; retval++) {
150  TSeqPos end = m_Length;
151  if ((end - pos) > slice_size) {
152  end = pos + slice_size;
153  }
154  pos += slice_size;
155  slice_size *= kSliceGrowthFactor;
156  }
157  }
158  return retval;
159  }
160 
162  {
163  Int4 lg = 0;
164 
165  if (x == 0)
166  return 0;
167 
168  while ((x = x >> 1))
169  lg++;
170 
171  return lg;
172  }
173 
174 };
175 
176 /** This class allows retrieval of sequence data from BLAST databases at NCBI.
177  */
179 {
180 public:
181  /// Constructor
182  CRemoteBlastDbAdapter(const string& db_name, CSeqDB::ESeqType db_type,
183  bool use_fixed_size_slices);
184 
185  /** @inheritDoc */
187  /** @inheritDoc */
188  virtual int GetSeqLength(int oid);
189  /** @inheritDoc */
190  virtual TSeqIdList GetSeqIDs(int oid);
191  /** @inheritDoc */
192  virtual CRef<CBioseq> GetBioseqNoData(int oid, TGi target_gi = ZERO_GI, const CSeq_id * target_id = NULL);
193  /** @inheritDoc */
194  virtual CRef<CSeq_data> GetSequence(int oid, int begin = 0, int end = 0);
195  /// Batch-version of GetSequence
196  /// @param oids OIDs of the sequences to fetch, must be of same size as
197  /// ranges [in]
198  /// @param ranges sequence ranges for the OIDs above, must be of same size as
199  /// oids. If any of the ranges is TSeqRange::GetEmpty, the whole sequence
200  /// will be fetched (assuming no splitting of the sequence occurred),
201  /// otherwise the ranges are expected to be spanning a given sequence chunk
202  /// @sa x_CalculateNumberOfSlices [in]
203  /// @param sequence_data output parameter for the sequence data to fetch
204  /// [out]
205  void GetSequenceBatch(const vector<int>& oids,
206  const vector<TSeqRange>& ranges,
207  vector< CRef<CSeq_data> >& sequence_data);
208  /** @inheritDoc */
209  virtual bool SeqidToOid(const CSeq_id & id, int & oid);
210  /// Batch-version of SeqidToOid
211  /// @param ids Seq-IDs to fetch [in]
212  /// @param oids the OIDs to which the IDs correspond [out]
213  bool SeqidToOidBatch(const vector< CRef<CSeq_id> >& ids,
214  vector<int>& oids);
215 
216 private:
217  /// BLAST database name
218  string m_DbName;
219  /// Sequence type of the BLAST database
221  /// Internal cache, maps OIDs to CCachedSeqDataForRemote
223  /// Our local "OID generator"
225  /// Determines whether sequences should be fetched in fixed size slices or
226  /// in incrementally larger sizes.
228 
229  /// This method actually retrieves the sequence data.
230  /// @param oid OID for the sequence of interest [in]
231  /// @param begin starting offset of the sequence of interest [in]
232  /// @param end ending offset of the sequence of interest [in]
233  void x_FetchData(int oid, int begin, int end);
234 
235  void x_FetchDataByBatch(const vector<int>& oids,
236  const vector<TSeqRange>& ranges);
237 };
238 
241 
242 #endif /* OBJTOOLS_DATA_LOADERS_BLASTDB___REMOTE_BLASTDB_ADAPTER__HPP */
Interface definition of IBlastDbAdapter.
#define kSliceGrowthFactor
When fixed size slices are not used, each subsequent slice grows its size by this factor.
@ kRmtSequenceSliceSize
Same as above, but used for fetching sequences from remote BLAST databases.
This class defines a "bundle" of elements to cache which are then returned by CRemoteBlastDbAdapter.
CRef< CBioseq > m_Bioseq
the bioseq object for this object
CRef< CBioseq > GetBioseq() const
Retrieve the Bioseq associated with a given sequence.
void SetBioseq(CRef< CBioseq > bioseq)
Set the Bioseq associated with a given sequence.
bool m_UseFixedSizeSlices
Determines whether sequences should be fetched in fixed size slices or in incrementally larger sizes.
IBlastDbAdapter::TSeqIdList GetIdList() const
Retrieve the Seq-id's associated with a given sequence.
CCachedSeqDataForRemote()
Default constructor, needed to insert objects in std::map.
TSeqPos x_CalculateNumberOfSlices()
Calculates the number of slices in the same manner as the CCachedSequence class in its SplitSeqData method.
IBlastDbAdapter::TSeqIdList m_IdList
List of Seq-id's associated with this sequence.
TSeqPos GetLength() const
Retrieve the sequence length.
bool IsValid()
Returns true if this object has been properly initialized and it's ready to be used.
vector< CRef< CSeq_data > > m_SeqDataVector
each element in this vector represents a "chunk" of the sequence data
bool HasSequenceData(int begin, int end)
Returns true if the requested range has sequence data already.
void SetLength(TSeqPos length, bool use_fixed_size_slices)
Sets the length of the sequence data for a given Bioseq.
CRef< CSeq_data > & GetSeqDataChunk(int begin, int end)
Access the sequence data chunk for a given starting and ending offset.
TSeqPos m_Length
length of the sequence data
void SetIdList(const IBlastDbAdapter::TSeqIdList &idlist)
Sets the Seq-id's associated with a given sequence; idlist holds the IDs to assign to this object [in].
CObject –.
Definition: ncbiobj.hpp:180
This class allows retrieval of sequence data from BLAST databases at NCBI.
virtual bool SeqidToOid(const CSeq_id &id, int &oid)
@inheritDoc
bool SeqidToOidBatch(const vector< CRef< CSeq_id > > &ids, vector< int > &oids)
Batch-version of SeqidToOid.
virtual CSeqDB::ESeqType GetSequenceType()
@inheritDoc
virtual TSeqIdList GetSeqIDs(int oid)
@inheritDoc
void GetSequenceBatch(const vector< int > &oids, const vector< TSeqRange > &ranges, vector< CRef< CSeq_data > > &sequence_data)
Batch-version of GetSequence.
CRemoteBlastDbAdapter(const string &db_name, CSeqDB::ESeqType db_type, bool use_fixed_size_slices)
Constructor.
int m_NextLocalId
Our local "OID generator".
virtual int GetSeqLength(int oid)
@inheritDoc
map< int, CCachedSeqDataForRemote > m_Cache
Internal cache, maps OIDs to CCachedSeqDataForRemote.
virtual CRef< CSeq_data > GetSequence(int oid, int begin=0, int end=0)
@inheritDoc
virtual CRef< CBioseq > GetBioseqNoData(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_id=NULL)
@inheritDoc
string m_DbName
BLAST database name.
void x_FetchData(int oid, int begin, int end)
This method actually retrieves the sequence data.
bool m_UseFixedSizeSlices
Determines whether sequences should be fetched in fixed size slices or in incrementally larger sizes.
CSeqDB::ESeqType m_DbType
Sequence type of the BLAST database.
void x_FetchDataByBatch(const vector< int > &oids, const vector< TSeqRange > &ranges)
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
Interface that provides a common interface to retrieve sequence data from local vs.
list< CRef< CSeq_id > > TSeqIdList
Convenience typedef for a list of CSeq_id-s.
std::list< CRef< objects::CSeq_id > > TSeqIdList
Definition: ftablock.h:58
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define NULL
Definition: ncbistd.hpp:225
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
Int4 ilog2(Int8 x)
Integer base two logarithm.
Definition: lookup_util.c:71
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
#define _ASSERT
Modified on Wed Jul 17 13:23:19 2024 by modify_doxy.py rev. 669887