NCBI C++ ToolKit
bdbloader_rmt.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bdbloader_rmt.cpp 91383 2020-10-21 16:07:12Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 * File Description:
29 * Data loader implementation that uses the blast databases at NCBI
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
42 
43 //=======================================================================
44 // BlastDbDataLoader Public interface
45 //
46 
49 
53  const string& dbname,
54  const EDbType dbtype,
55  bool use_fixed_size_slices,
56  CObjectManager::EIsDefault is_default,
58 {
    // Bundle the database name, type and slicing flag into the parameter
    // object from which the loader maker is built.
59  SBlastDbParam param(dbname, dbtype, use_fixed_size_slices);
60  TMaker maker(param);
    // Per the CDataLoader::RegisterInObjectManager documentation, the loader
    // is registered only if its name is not yet registered in the object
    // manager.
61  CDataLoader::RegisterInObjectManager(om, maker, is_default, priority);
    // Hand the maker's registration info back to the caller.
62  return maker.GetRegisterInfo();
63 }
64 
/// Convert a database type to the text label that GetLoaderNameFromArgs
/// appends to the loader name.
/// @param dbtype The database type to convert. [in]
/// @return "Nucleotide" or "Protein" for the matching enumerators, and
/// "Unknown" for any other value.
65 inline
67 {
68  switch (dbtype) {
69  case CRemoteBlastDbDataLoader::eNucleotide: return "Nucleotide";
70  case CRemoteBlastDbDataLoader::eProtein: return "Protein";
71  default: return "Unknown";
72  }
73 }
74 
75 
/// Translate the data loader's database type into the corresponding
/// CSeqDB sequence type.
/// Any value not handled by an explicit case maps to CSeqDB::eUnknown.
76 inline
78 {
79  switch (dbtype) {
82  default: return CSeqDB::eUnknown;
83  }
84 }
85 
/// Translate a CSeqDB sequence type into the data loader's database type
/// (the inverse direction of DbTypeToSeqType).
/// Any value not handled by an explicit case maps to
/// CRemoteBlastDbDataLoader::eUnknown.
86 inline
88 {
89  switch (seq_type) {
92  default: return CRemoteBlastDbDataLoader::eUnknown;
93  }
94 }
95 
/// Prefix shared by the names of all remote BLAST DB data loader instances.
96 const string CRemoteBlastDbDataLoader::kNamePrefix("REMOTE_BLASTDB_");
98 {
    // Loader name = prefix + database name + type label, e.g.
    // "REMOTE_BLASTDB_" + "nr" + "Protein".
99  return kNamePrefix + param.m_DbName + DbTypeToStr(param.m_DbType);
100 }
101 
103  const SBlastDbParam & param)
104 {
    // Copy the database name and type out of the parameter object.
105  m_DBName = param.m_DbName;
106  m_DBType = param.m_DbType;
108  SetName(loader_name);
    // The parameter object is expected to carry no BLAST DB handle here
    // (checked by assertion only); any adapter already held is dropped.
109  _ASSERT(param.m_BlastDbHandle.Empty());
110  m_BlastDb.Reset();
    // An empty database name is rejected with CSeqDBException (eArgErr).
111  if (m_DBName.empty()) {
112  NCBI_THROW(CSeqDBException, eArgErr, "Empty BLAST database name");
113  }
    // Translate the loader's database type into CSeqDB's sequence type.
    // NOTE(review): presumably used to construct the remote adapter stored in
    // m_BlastDb -- confirm against the complete source.
114  const CSeqDB::ESeqType dbtype = DbTypeToSeqType(m_DBType);
118  _TRACE("Using " << GetLoaderNameFromArgs(param) << " data loader");
119 }
120 
121 /// A BLAST DB (blob) ID
122 /// The first field represents an OID in the BLAST database
123 typedef pair<int, CSeq_id_Handle> TBlastDbId;
124 
125 /// Template specialization to convert BLAST DB (blob) IDs to human readable
126 /// strings.
127 template<>
129 {
130  /// Convert TBlastDbId (blob IDs) to human readable strings.
131  /// @param v The value to convert. [in]
132  /// @return A string version of the value passed in.
133  string operator()(const TBlastDbId& v) const
134  {
135  return NStr::IntToString(v.first) + ':' + v.second.AsString();
136  }
137 };
138 
139 /// Type definition consistent with those defined in objmgr/blob_id.hpp
141 
142 // Note: this method cannot simply be removed, even though it is identical to
143 // its parent class' implementation except for the last argument passed to
144 // x_LoadData; some refactoring is needed first.
147 {
148  CTSE_LoadLock lock = GetDataSource()->GetTSE_LoadLock(blob_id);
    // Load the data only if the TSE behind this lock is not loaded yet.
149  if ( !lock.IsLoaded() ) {
    // The blob ID wraps a TBlastDbId pair: first = OID, second = Seq-id
    // handle. The reference dynamic_cast throws std::bad_cast if blob_id is
    // not a CBlobIdBlastDb.
150  const TBlastDbId& id =
151  dynamic_cast<const CBlobIdBlastDb&>(*blob_id).GetValue();
    // kRmtSequenceSliceSize is the slice size used for fetching sequences
    // from remote BLAST databases.
152  x_LoadData(id.second, id.first, lock, kRmtSequenceSliceSize);
153  }
154  return lock;
155 }
156 
/// Batch retrieval of blobs: resolves every Seq-id in tse_sets to an OID with
/// a single SeqidToOidBatch call, then obtains a TSE lock for each entry via
/// GetBlobById and inserts it into that entry's lock set.
/// @param tse_sets Entries keyed by Seq-id handle; the lock obtained for each
/// key is inserted into the entry's mapped lock set. [in|out]
157 void
159 {
    // Nothing to fetch.
160  if (tse_sets.empty()) {
161  return;
162  }
163 
164  // Collect the Seq-ids for batch retrieval
165  vector< CRef<CSeq_id> > ids2fetch;
166  ids2fetch.reserve(tse_sets.size());
167  NON_CONST_ITERATE(TTSE_LockSets, tse_set, tse_sets) {
168  const CSeq_id_Handle& idh = tse_set->first;
169  CConstRef<CSeq_id> const_id = idh.GetSeqId();
    // SeqidToOidBatch takes CRef<CSeq_id> (non-const) elements, so constness
    // is cast away from the Seq-id obtained from the handle.
170  CRef<CSeq_id> id(const_cast<CSeq_id*>(const_id.GetPointer()));
171  ids2fetch.push_back(id);
172  }
173 
    // Batch operations are provided by the remote adapter, so the generic
    // adapter held in m_BlastDb is downcast to it.
    // NOTE(review): a failed cast is only caught by _ASSERT, which is
    // presumably compiled out of release builds -- confirm.
174  CRemoteBlastDbAdapter* rmt_blastdb_svc =
175  dynamic_cast<CRemoteBlastDbAdapter*>(&*m_BlastDb);
176  _ASSERT( rmt_blastdb_svc != NULL );
177 
178  vector<int> oids;
    // On failure an error is posted and the function returns without adding
    // any lock to tse_sets.
179  if ( !rmt_blastdb_svc->SeqidToOidBatch(ids2fetch, oids) ) {
180  ERR_POST(Error << "Failed to fetch sequences in batch mode");
181  return;
182  }
    // NOTE(review): the one-OID-per-entry expectation is enforced by assertion
    // only; oids[i] below relies on it.
183  _ASSERT(oids.size() == tse_sets.size());
184 
    // Second pass over tse_sets in the same order as the first pass, so that
    // oids[i] belongs to the i-th Seq-id handle.
185  vector<int>::size_type i = 0;
186  NON_CONST_ITERATE(TTSE_LockSets, tse_set, tse_sets) {
187  const CSeq_id_Handle& idh = tse_set->first;
188  TBlobId blob_id = new CBlobIdBlastDb(TBlastDbId(oids[i], idh));
189  i++;
190  TTSE_Lock lock = GetBlobById(blob_id);
191  tse_set->second.insert(lock);
192  }
193  _ASSERT(tse_sets.size() == i);
194 }
195 
/// Batch loading of sequence chunks: gathers the OID and the sequence
/// range(s) of every chunk, fetches all sequence data with one
/// GetSequenceBatch call, attaches each returned CSeq_data to its chunk via
/// x_LoadSequence and marks the chunk as loaded.
196 void
198 {
    // Bioseq-set id half of the TPlace passed to x_LoadSequence; always 0.
199  static const CTSE_Chunk_Info::TBioseq_setId kIgnored = 0;
200 
    // Constness is cast away so that the second loop can iterate with
    // NON_CONST_ITERATE.
201  TChunkSet& chunks = const_cast<TChunkSet&>(chunks_orig);
202  if (chunks.empty()) {
203  return;
204  }
205 
206  vector<int> oids;
207  vector<TSeqRange> ranges;
208  vector< CRef<CSeq_data> > sequence_data;
209 
    // Pass 1: collect the inputs for the batch request.
210  ITERATE(TChunkSet, chunk_itr, chunks) {
211  const TChunk& chunk = *chunk_itr;
212  _ASSERT(!chunk->IsLoaded());
213  int oid = x_GetOid(chunk->GetBlobId());
214  oids.push_back(oid);
215 
217  chunk->GetSeq_dataInfos() ) {
218  ranges.push_back(it->second);
219  }
220  }
    // NOTE(review): oids receives one entry per chunk while ranges receives
    // one entry per Seq_data info, so this assertion appears to hold only if
    // every chunk carries exactly one Seq_data info -- confirm.
221  _ASSERT(oids.size() == ranges.size());
222 
    // NOTE(review): a failed downcast is only caught by _ASSERT, which is
    // presumably compiled out of release builds -- confirm.
223  CRemoteBlastDbAdapter* rmt_blastdb_svc =
224  dynamic_cast<CRemoteBlastDbAdapter*>(&*m_BlastDb);
225  _ASSERT( rmt_blastdb_svc != NULL );
226  rmt_blastdb_svc->GetSequenceBatch(oids, ranges,
227  sequence_data);
228  _ASSERT(sequence_data.size() == oids.size());
229 
    // Pass 2: walk the chunks in the same order as pass 1; seq_data_idx
    // advances once per Seq_data info, so sequence_data is consumed in the
    // order in which the ranges were collected.
230  unsigned int seq_data_idx = 0;
231  NON_CONST_ITERATE(TChunkSet, chunk_itr, chunks) {
232  TChunk chunk = *chunk_itr;
233  _ASSERT(!chunk->IsLoaded());
235  chunk->GetSeq_dataInfos() ) {
236  const CSeq_id_Handle& sih = it->first;
237  TSeqPos start = it->second.GetFrom();
238 
240  _ASSERT(it->second.GetLength() == (it->second.GetToOpen() - start));
241  lit->SetLength(it->second.GetLength());
242  lit->SetSeq_data(*sequence_data[seq_data_idx]);
243  seq_data_idx++;
244 
246  seq.push_back(lit);
247  chunk->x_LoadSequence(TPlace(sih, kIgnored), start, seq);
248  }
249  // Mark chunk as loaded
250  chunk->SetLoaded();
251  }
    // Every fetched CSeq_data must have been consumed (assertion only).
252  _ASSERT(seq_data_idx == sequence_data.size());
253 }
254 
/// Dump this loader's configuration for debugging: sets the frame name to
/// "CRemoteBlastDbDataLoader" and reports m_DBName, m_DBType and
/// m_UseFixedSizeSlices.
/// @param ddc Debug dump context that receives the frame name and values.
/// The second parameter (depth) is unnamed and unused in this implementation.
255 void
256 CRemoteBlastDbDataLoader::DebugDump(CDebugDumpContext ddc, unsigned int /*depth*/) const
257 {
258  ddc.SetFrame("CRemoteBlastDbDataLoader");
259  DebugDumpValue(ddc,"m_DBName", m_DBName);
260  DebugDumpValue(ddc,"m_DBType", m_DBType);
261  DebugDumpValue(ddc,"m_UseFixedSizeSlices", m_UseFixedSizeSlices);
262 
263 }
264 
266 
267 // ===========================================================================
268 
270 
272 {
273  // Typedef to silence compiler warning. A better solution to this
274  // problem is probably possible.
275 
    // TArgFuncType is the function-pointer type (driver-info list reference
    // plus entry-point request) that the entry point is cast to below.
276  typedef void(*TArgFuncType)(list<CPluginManager<CDataLoader>
277  ::SDriverInfo> &,
279  ::EEntryPointRequest);
280 
    // Register the (cast) entry point for CDataLoader plugins.
281  RegisterEntryPoint<CDataLoader>((TArgFuncType)
283 }
284 
/// Driver name of the remote BLAST DB data loader ("rmt_blastdb").
/// NOTE(review): presumably the name the class factory below is constructed
/// with -- confirm against the complete source.
285 const string kDataLoader_RmtBlastDb_DriverName("rmt_blastdb");
286 
287 /// Data loader factory (class factory) for the BLAST DB data loader
288 ///
289 /// This class provides an interface which builds an instance of the
290 /// BLAST DB data loader and registers it with the object manager.
291 
293 {
294 public:
295  /// Constructor
298 
299  /// Destructor
300  virtual ~CRmtBlastDb_DataLoaderCF(void) {}
301 
302 protected:
303  /// Create and register a data loader
304  /// @param om
305  /// A reference to the object manager
306  /// @param params
307  /// Arguments for the data loader constructor (plugin manager
    /// parameter tree); the implementation reads the database name and
    /// database type from it
    /// @return
    /// Pointer to the data loader obtained from the registration
310  const TPluginManagerParamTree* params) const;
311 };
312 
313 
316  const TPluginManagerParamTree* params) const
317 {
318  if ( !ValidParams(params) ) {
319  // Use constructor without arguments
321  }
322  // Parse params, select constructor
    // The final 'false' is GetParam's 'mandatory' argument: both parameters
    // are optional.
323  const string& dbname =
324  GetParam(GetDriverName(), params,
325  kCFParam_BlastDb_DbName, false);
326  const string& dbtype_str =
327  GetParam(GetDriverName(), params,
328  kCFParam_BlastDb_DbType, false);
329  if ( !dbname.empty() ) {
330  // Use database name
    // The type string is matched case-insensitively against "Nucleotide" and
    // "Protein".
    // NOTE(review): an empty or unrecognised type string presumably leaves
    // dbtype at its initial value -- confirm against the complete source.
332  if ( !dbtype_str.empty() ) {
333  if (NStr::CompareNocase(dbtype_str, "Nucleotide") == 0) {
335  }
336  else if (NStr::CompareNocase(dbtype_str, "Protein") == 0) {
338  }
339  }
341  om,
342  dbname,
343  dbtype,
344  true, // use_fixed_size_slices
345  GetIsDefault(params),
346  GetPriority(params)).GetLoader();
347  }
348  // IsDefault and Priority arguments may be specified
350 }
351 
352 
356 {
    // Delegate to NCBI_EntryPointImpl, passing through the caller's
    // driver-info list and entry-point request.
358  NCBI_EntryPointImpl(info_list, method);
359 }
360 
361 
365 {
    // Thin forwarder: same arguments, handled by
    // NCBI_EntryPoint_DataLoader_RmtBlastDb.
366  NCBI_EntryPoint_DataLoader_RmtBlastDb(info_list, method);
367 }
368 
369 
User-defined methods of the data storage class.
pair< int, CSeq_id_Handle > TBlastDbId
A BLAST DB (blob) ID. The first field represents an OID in the BLAST database.
Definition: bdbloader.cpp:209
const string kDataLoader_BlastDb_DriverName
const string kCFParam_BlastDb_DbName
Definition: bdbloader.hpp:49
const string kCFParam_BlastDb_DbType
Definition: bdbloader.hpp:50
const string kDataLoader_RmtBlastDb_DriverName("rmt_blastdb")
USING_SCOPE(objects)
void DataLoaders_Register_RmtBlastDb(void)
CSeqDB::ESeqType DbTypeToSeqType(CRemoteBlastDbDataLoader::EDbType dbtype)
CRemoteBlastDbDataLoader::EDbType SeqTypeToDbType(CSeqDB::ESeqType seq_type)
void NCBI_EntryPoint_DataLoader_RmtBlastDb(CPluginManager< CDataLoader >::TDriverInfoList &info_list, CPluginManager< CDataLoader >::EEntryPointRequest method)
pair< int, CSeq_id_Handle > TBlastDbId
A BLAST DB (blob) ID. The first field represents an OID in the BLAST database.
CBlobIdFor< TBlastDbId > CBlobIdBlastDb
Type definition consistent with those defined in objmgr/blob_id.hpp.
void NCBI_EntryPoint_xloader_blastdb_rmt(CPluginManager< objects::CDataLoader >::TDriverInfoList &info_list, CPluginManager< objects::CDataLoader >::EEntryPointRequest method)
string DbTypeToStr(CRemoteBlastDbDataLoader::EDbType dbtype)
Data loader implementation that uses the blast databases remotely.
@ kRmtSequenceSliceSize
Same as above, but used for fetching sequences from remote BLAST databases.
void x_LoadData(const CSeq_id_Handle &idh, int oid, CTSE_LoadLock &lock, int slice_size)
Load sequence data from cache or from the database.
Definition: bdbloader.cpp:257
CRef< IBlastDbAdapter > m_BlastDb
The sequence database.
Definition: bdbloader.hpp:220
string m_DBName
Blast database name.
Definition: bdbloader.hpp:218
EDbType m_DBType
Is this database protein or nucleotide?
Definition: bdbloader.hpp:219
int x_GetOid(const CSeq_id_Handle &idh)
Gets the OID from m_Ids cache or the BLAST databases.
Definition: bdbloader.cpp:379
EDbType
Describes the type of blast database to use.
Definition: bdbloader.hpp:57
@ eNucleotide
nucleotide database
Definition: bdbloader.hpp:58
@ eProtein
protein database
Definition: bdbloader.hpp:59
@ eUnknown
protein is attempted first, then nucleotide
Definition: bdbloader.hpp:60
pair< TBioseqId, TBioseq_setId > TPlace
Definition: bdbloader.hpp:178
bool m_UseFixedSizeSlices
Configuration value specified to the CCachedSequence.
Definition: bdbloader.hpp:225
CObjectManager::TPriority GetPriority(const TPluginManagerParamTree *params) const
const string & GetDriverName(void) const
CObjectManager::EIsDefault GetIsDefault(const TPluginManagerParamTree *params) const
bool ValidParams(const TPluginManagerParamTree *params) const
CTSE_LoadLock GetTSE_LoadLock(const TBlobId &blob_id)
void SetFrame(const string &frame)
Definition: ddumpable.cpp:137
CObjectManager –.
CPluginManager<> –.
This class allows retrieval of sequence data from BLAST databases at NCBI.
bool SeqidToOidBatch(const vector< CRef< CSeq_id > > &ids, vector< int > &oids)
Batch-version of SeqidToOid.
void GetSequenceBatch(const vector< int > &oids, const vector< TSeqRange > &ranges, vector< CRef< CSeq_data > > &sequence_data)
Batch-version of GetSequence.
virtual void GetBlobs(TTSE_LockSets &tse_sets)
Support for fetching the sequence length by batch.
virtual TTSE_Lock GetBlobById(const TBlobId &blob_id)
For a given TBlobId, get the TTSE_Lock.
virtual void GetChunks(const TChunkSet &chunks)
Support for fetching the sequence chunks by batch.
virtual void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Define method for dumping debug information.
static const string kNamePrefix
All BLAST DB data loader instances have a name that starts with kNamePrefix.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &dbname="nr", const EDbType dbtype=eUnknown, bool use_fixed_size_slices=true, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
static string GetLoaderNameFromArgs(const SBlastDbParam &param)
CRemoteBlastDbDataLoader(const string &loader_name, const SBlastDbParam &param)
Parametrized constructor.
Data Loader Factory for BlastDbDataLoader.
virtual CDataLoader * CreateAndRegister(CObjectManager &om, const TPluginManagerParamTree *params) const
Create and register a data loader.
CRmtBlastDb_DataLoaderCF(void)
Constructor.
virtual ~CRmtBlastDb_DataLoaderCF(void)
Destructor.
CSeqDBException.
Definition: seqdbcommon.hpp:73
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eUnknown
Definition: seqdb.hpp:176
@ eProtein
Definition: seqdb.hpp:174
TBlobId GetBlobId(void) const
void x_LoadSequence(const TPlace &place, TSeqPos pos, const TSequence &seq)
void SetLoaded(CObject *obj=0)
vector< TLocation > TLocationSet
const TLocationSet & GetSeq_dataInfos(void) const
list< CRef< CSeq_literal > > TSequence
bool IsLoaded(void) const
bool IsLoaded(void) const
definition of a Culling tree
Definition: ncbi_tree.hpp:100
size_type size() const
Definition: map.hpp:148
bool empty() const
Definition: map.hpp:149
Definition: map.hpp:338
void DebugDumpValue(CDebugDumpContext &_this, const string &name, const T &value, const string &comment=kEmptyStr)
Definition: ddumpable.hpp:206
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
CConstRef< CSeq_id > GetSeqId(void) const
const value_type & GetValue(void) const
Definition: blob_id.hpp:122
void SetName(const string &loader_name)
Definition: data_loader.cpp:98
TLoader * GetLoader(void) const
Get pointer to the loader.
CDataSource * GetDataSource(void) const
Definition: data_loader.cpp:92
EIsDefault
Flag defining if the data loader is included in the "default" group.
TRegisterInfo GetRegisterInfo(void)
static void RegisterInObjectManager(CObjectManager &om, CLoaderMaker_Base &loader_maker, CObjectManager::EIsDefault is_default, CObjectManager::TPriority priority)
Register the loader only if the name is not yet registered in the object manager.
Definition: data_loader.cpp:53
vector< TChunk > TChunkSet
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
static void NCBI_EntryPointImpl(TDriverInfoList &info_list, EEntryPointRequest method)
Entry point implementation.
string GetParam(const string &driver_name, const TPluginManagerParamTree *params, const string &param_name, bool mandatory, const string &default_value) const
Utility function to get an element of the parameter tree. Throws an exception when a mandatory parameter is ...
list< SDriverInfo > TDriverInfoList
List of driver information.
EEntryPointRequest
Actions performed by the entry point.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
void SetLength(TLength value)
Assign a value to Length data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
Helper classes and templates to implement plugins.
Declaration of the CRemoteBlastDbAdapter class.
CRef< objects::CObjectManager > om
string operator()(const TBlastDbId &v) const
Convert TBlastDbId (blob IDs) to human readable strings.
SRegisterLoaderInfo –.
#define _ASSERT
Modified on Sun Apr 14 05:27:13 2024 by modify_doxy.py rev. 669887