NCBI C++ ToolKit
remote_blastdb_adapter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: remote_blastdb_adapter.cpp 99543 2023-04-14 16:50:25Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  * ===========================================================================
29  */
30 
31 /** @file remote_blastdb_adapter.cpp
32  * Defines the CRemoteBlastDbAdapter class
33  */
34 #include <ncbi_pch.hpp>
37 #include <objects/seq/Seq_inst.hpp>
40 #include <objects/seq/Seq_ext.hpp>
43 
46 
48  CSeqDB::ESeqType db_type,
49  bool use_fixed_size_slices)
50 : m_DbName(db_name), m_DbType(db_type), m_NextLocalId(1),
51  m_UseFixedSizeSlices(use_fixed_size_slices)
52 {
53  CBlastServices rmt_svc;
54  const bool kIsProtein = (db_type == CSeqDB::eProtein) ? true : false;
55  if ( !rmt_svc.IsValidBlastDb(db_name, kIsProtein) ) {
57  out << (kIsProtein ? "Protein" : "Nucleotide") << " BLAST database "
58  << "'" << db_name << "' does not exist in the NCBI servers";
60  }
61 }
62 
63 int
65 {
66  _ASSERT(m_Cache[oid].IsValid());
67  return m_Cache[oid].GetLength();
68 }
69 
72 {
73  _ASSERT(m_Cache[oid].IsValid());
74  return m_Cache[oid].GetIdList();
75 }
76 
78 CRemoteBlastDbAdapter::GetBioseqNoData(int oid, TGi /* target_gi = 0 */, const CSeq_id * target_id /* = NULL */)
79 {
 // Returns a freshly allocated CBioseq that is a copy (via Assign) of the
 // Bioseq stored in the cache entry for `oid`.
 // NOTE(review): the return-type line of this definition is not visible in
 // this listing; the body returns a CRef<CBioseq>.
 // Neither the unnamed TGi argument nor `target_id` is referenced in the body.
80  /// @todo FIXME we should do something with the target_gi
 // Precondition: the cache entry for `oid` must report IsValid().
81  _ASSERT(m_Cache[oid].IsValid());
82  // N.B.: the assignment to a newly created bioseq is deliberate to avoid an
83  // exception when loading data in the object manager
84  CRef<CBioseq> retval(new CBioseq);
85  retval->Assign(*m_Cache[oid].GetBioseq());
86  return retval;
87 }
88 
89 /// Returns false always. Logs an error message with severity warning for all
90 /// errors but sequence not found.
91 /// @param errors errors reported by server [in]
92 /// @param warnings warnings reported by server [in]
93 static bool
94 RemoteBlastDbLoader_ErrorHandler(const string& errors, const string& warnings)
95 {
96  const bool retval = false;
97  /// FIXME ideally this would come from some error code rather than string
98  /// parsing
99  if (NStr::Find(errors, "Failed to fetch sequence: [") != NPOS) {
100  return retval;
101  }
102 
103  string msg;
104  if ( !errors.empty() ) {
105  msg = errors;
106  }
107  if ( !warnings.empty() ) {
108  msg += (msg.empty() ? warnings : " " + warnings);
109  }
110  if (msg.empty()) {
111  msg = "Failed to retrieve sequence data via remote BLAST database ";
112  msg += "data loader";
113  }
114  ERR_POST(Warning << msg);
115  return retval;
116 }
117 
/// Retrieves sequence data for a single cached sequence from the remote BLAST
/// service (CBlastServices::GetSequenceParts) and stores the result in that
/// sequence's cache entry.
/// @param oid key of the entry in m_Cache to populate [in]
/// @param begin starting offset, forwarded to CSeq_interval and
/// GetSeqDataChunk [in]
/// @param end ending offset, forwarded to CSeq_interval and
/// GetSeqDataChunk [in]
118 void CRemoteBlastDbAdapter::x_FetchData(int oid, int begin, int end)
119 {
120  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oid];
 // Preconditions: the requested range has not been fetched yet, and the
 // cache entry already carries a length and at least one Seq-id.
121  _ASSERT( !cached_seqdata.HasSequenceData(begin, end) );
122  _ASSERT( cached_seqdata.GetLength() != 0 );
123  _ASSERT( !cached_seqdata.GetIdList().empty() );
124  _ASSERT( cached_seqdata.IsValid() );
 // Molecule type code passed to the service: 'p' for protein, 'n' otherwise
125  const char seqtype = (GetSequenceType() == CSeqDB::eProtein) ? 'p' : 'n';
126 
 // The request is a one-element vector built from the first cached Seq-id
127  CRef<CSeq_interval> seq_int
128  (new CSeq_interval(*cached_seqdata.GetIdList().front(), begin, end));
129  CBlastServices::TSeqIntervalVector seqids(1, seq_int);
 // NOTE(review): the declarations of `ids` and `seq_data` are not visible in
 // this listing; they are the output arguments of GetSequenceParts below.
132  string errors, warnings;
 // Verbose mode is enabled whenever the VERBOSE environment variable is set
133  const bool kVerbose = (getenv("VERBOSE") ? true : false);
134 
135  CBlastServices::GetSequenceParts(seqids, m_DbName, seqtype, ids, seq_data,
136  errors, warnings, kVerbose);
137  if (seq_data.empty() || !errors.empty() || !warnings.empty() ||
138  ids.empty() ) {
139  RemoteBlastDbLoader_ErrorHandler(errors, warnings);
140  }
 // NOTE(review): the handler's return value is discarded and execution
 // continues, so seq_data.front() below is reached even when seq_data is
 // empty; the assertion only compares sizes -- confirm whether an early
 // return or exception is intended here.
141  _ASSERT(ids.size() == seq_data.size());
142  cached_seqdata.GetSeqDataChunk(begin, end) = seq_data.front();
143  _ASSERT(cached_seqdata.HasSequenceData(begin, end));
144 }
145 
/// Batch counterpart of x_FetchData: issues one GetSequenceParts request for
/// several cached sequences and stores each returned chunk in the matching
/// cache entry. Does nothing when `oids` is empty.
/// @param oids keys of the entries in m_Cache to populate [in]
/// @param ranges range to fetch for each entry; indexed in parallel with
/// `oids` (no size check is performed in this function) [in]
146 void CRemoteBlastDbAdapter::x_FetchDataByBatch(const vector<int>& oids,
147  const vector<TSeqRange>& ranges)
148 {
 // Molecule type code passed to the service: 'p' for protein, 'n' otherwise
149  const char seqtype = (GetSequenceType() == CSeqDB::eProtein) ? 'p' : 'n';
150  if (oids.empty()) {
151  return;
152  }
153 
 // NOTE(review): the declaration of `seqids` is not visible in this listing.
 // Build one CSeq_interval per requested sequence, using the first Seq-id
 // stored in its cache entry.
155  seqids.reserve(oids.size());
156  for (vector<int>::size_type i = 0; i < oids.size(); i++) {
157  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]];
158  _ASSERT( !cached_seqdata.HasSequenceData(ranges[i].GetFrom(),
159  ranges[i].GetToOpen()) );
160  _ASSERT( cached_seqdata.GetLength() != 0 );
161  _ASSERT( !cached_seqdata.GetIdList().empty() );
162  _ASSERT( cached_seqdata.IsValid() );
163 
164  CRef<CSeq_interval> seq_int
165  (new CSeq_interval(*cached_seqdata.GetIdList().front(),
166  ranges[i].GetFrom(), ranges[i].GetToOpen()));
167  seqids.push_back(seq_int);
168  }
169 
 // NOTE(review): the declarations of `ids` and `seq_data` are not visible in
 // this listing; they are the output arguments of GetSequenceParts below.
172  string errors, warnings;
 // Verbose mode is enabled whenever the VERBOSE environment variable is set
173  const bool kVerbose = (getenv("VERBOSE") ? true : false);
174 
175  CBlastServices::GetSequenceParts(seqids, m_DbName, seqtype, ids, seq_data,
176  errors, warnings, kVerbose);
177  if (seq_data.empty() || !errors.empty() || !warnings.empty() ||
178  ids.empty() ) {
179  RemoteBlastDbLoader_ErrorHandler(errors, warnings);
180  }
 // NOTE(review): the handler's return value is discarded and execution
 // continues; the loop below indexes seq_data[i] for every requested oid, so
 // it relies on the size equalities that are only asserted here -- confirm
 // whether a short or empty reply needs explicit handling.
181  _ASSERT(seqids.size() == ids.size());
182  _ASSERT(ids.size() == seq_data.size());
183 
 // Store each returned chunk in the cache entry it was requested for
184  for (vector<int>::size_type i = 0; i < oids.size(); i++) {
185  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]];
186  cached_seqdata.GetSeqDataChunk(ranges[i].GetFrom(),
187  ranges[i].GetToOpen()) = seq_data[i];
188  _ASSERT(cached_seqdata.HasSequenceData(ranges[i].GetFrom(),
189  ranges[i].GetToOpen()));
190  }
191 }
192 
195  int begin /* = 0 */,
196  int end /* = 0*/)
197 {
198  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oid];
199  _ASSERT(cached_seqdata.IsValid());
200  if ( !cached_seqdata.HasSequenceData(begin, end) ) {
201  x_FetchData(oid, begin, end);
202  }
203  return cached_seqdata.GetSeqDataChunk(begin, end);
204 }
205 
206 void
208  const vector<TSeqRange>& ranges,
209  vector< CRef<CSeq_data> >& sequence_data)
210 {
211  _ASSERT( !ranges.empty() );
212  _ASSERT(oids.size() == ranges.size());
213  sequence_data.clear();
214 
215  vector<int> oids2fetch;
216  vector<TSeqRange> ranges2fetch;
217  for (vector<int>::size_type i = 0; i < oids.size(); i++) {
218  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]];
219  _ASSERT(cached_seqdata.IsValid());
220  // default is to fetch the entire sequence
221  int begin = 0, end = cached_seqdata.GetLength();
222  if (ranges[i] != TSeqRange::GetEmpty()) { // get partial sequence
223  begin = ranges[i].GetFrom();
224  end = ranges[i].GetToOpen();
225  }
226  if ( !cached_seqdata.HasSequenceData(begin, end) ) {
227  oids2fetch.push_back(oids[i]);
228  ranges2fetch.push_back(TSeqRange(begin, end-1));
229  if (ranges[i] != TSeqRange::GetEmpty()) { // get partial sequence
230  _ASSERT(ranges[i] == ranges2fetch.back());
231  }
232  }
233  }
234 
235  x_FetchDataByBatch(oids2fetch, ranges2fetch);
236 
237  // Populate the return value
238  sequence_data.reserve(oids.size());
239  for (vector<int>::size_type i = 0; i < oids.size(); i++) {
240  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]];
241  _ASSERT(cached_seqdata.IsValid());
242  int begin = 0, end = cached_seqdata.GetLength();
243  if (ranges[i] != TSeqRange::GetEmpty()) {
244  begin = ranges[i].GetFrom();
245  end = ranges[i].GetToOpen();
246  }
247  _ASSERT(cached_seqdata.HasSequenceData(begin, end));
248  sequence_data.push_back(cached_seqdata.GetSeqDataChunk(begin, end));
249  }
250  _ASSERT(sequence_data.size() == oids.size());
251 
252 #if _DEBUG
253  for (vector<int>::size_type i = 0; i < sequence_data.size(); i++) {
254  _ASSERT(sequence_data[i] != NULL);
255  }
256 #endif
257 }
258 
259 // N.B.: this method should be called when the BLAST database data loader
260 // hasn't been able to find the id in its cache
261 bool
263 {
264  // N.B.: This method doesn't get any sequence data from the server side.
265  const char seqtype = (GetSequenceType() == CSeqDB::eProtein) ? 'p' : 'n';
266 
267  oid = m_NextLocalId++;
268 
269  // Return types
272  const bool kVerbose = (getenv("VERBOSE") ? true : false);
273  string errors, warnings;
274  seqids.push_back(CRef<CSeq_id>(const_cast<CSeq_id*>(&id)));
275 
276  CBlastServices::GetSequencesInfo(seqids, m_DbName, seqtype, bioseqs, errors,
277  warnings, kVerbose, true);
278  if ( !errors.empty() || !warnings.empty() || bioseqs.empty() ) {
279  return RemoteBlastDbLoader_ErrorHandler(errors, warnings);
280  }
281  _ASSERT(bioseqs.size() == seqids.size());
282 
283  // cache the retrieved information
284  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oid];
285  cached_seqdata.SetLength(bioseqs.front()->GetLength(),
287  cached_seqdata.SetIdList(bioseqs.front()->SetId());
288  cached_seqdata.SetBioseq(bioseqs.front());
289  return cached_seqdata.IsValid();
290 }
291 
292 bool
294  vector<int>& oids)
295 {
296  // N.B.: This method doesn't get any sequence data from the server side.
297  const char seqtype = (GetSequenceType() == CSeqDB::eProtein) ? 'p' : 'n';
298 
299  if (ids.empty()) {
300  return true;
301  }
302 
303  oids.clear();
304  oids.reserve(ids.size());
305  for (vector<int>::size_type i = 0; i < ids.size(); i++) {
306  oids.push_back(m_NextLocalId++);
307  }
308 
309  // Return types
311  const bool kVerbose = (getenv("VERBOSE") ? true : false);
312  string errors, warnings;
313 
315  (const_cast< vector< CRef<CSeq_id> >& >(*&ids), m_DbName, seqtype,
316  bioseqs, errors, warnings, kVerbose, true);
317  if ( !errors.empty() || !warnings.empty() || bioseqs.empty() ) {
318  return RemoteBlastDbLoader_ErrorHandler(errors, warnings);
319  }
320  _ASSERT(bioseqs.size() == ids.size());
321 
322  // cache the retrieved information
323  for (vector<int>::size_type i = 0; i < oids.size(); i++) {
324  CCachedSeqDataForRemote& cached_seqdata = m_Cache[oids[i]];
325  cached_seqdata.SetLength(bioseqs[i]->GetLength(),
327  cached_seqdata.SetIdList(bioseqs[i]->SetId());
328  cached_seqdata.SetBioseq(bioseqs[i]);
329  _ASSERT(cached_seqdata.IsValid());
330  }
331  return true;
332 }
333 
336 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares the CBlastServices class.
#define true
Definition: bool.h:35
API for Remote Blast Services.
This class defines a "bundle" of elements to cache which are then returned by CRemoteBlastDbAdapter.
void SetBioseq(CRef< CBioseq > bioseq)
Set the Bioseq associated with a given sequence.
IBlastDbAdapter::TSeqIdList GetIdList() const
Retrieve the Seq-id's associated with a given sequence.
TSeqPos GetLength() const
Retrieve the sequence length.
bool IsValid()
Returns true if this object has been properly initialized and it's ready to be used.
bool HasSequenceData(int begin, int end)
Returns true if the requested range has sequence data already.
void SetLength(TSeqPos length, bool use_fixed_size_slices)
Sets the length of the sequence data for a given Bioseq.
CRef< CSeq_data > & GetSeqDataChunk(int begin, int end)
Access the sequence data chunk for a given starting and ending offset.
void SetIdList(const IBlastDbAdapter::TSeqIdList &idlist)
Sets the Seq-id's associated with a given sequence. @param idlist IDs to assign to this object [in].
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
This class allows retrieval of sequence data from BLAST databases at NCBI.
virtual bool SeqidToOid(const CSeq_id &id, int &oid)
@inheritDoc
bool SeqidToOidBatch(const vector< CRef< CSeq_id > > &ids, vector< int > &oids)
Batch-version of SeqidToOid.
virtual CSeqDB::ESeqType GetSequenceType()
@inheritDoc
virtual TSeqIdList GetSeqIDs(int oid)
@inheritDoc
void GetSequenceBatch(const vector< int > &oids, const vector< TSeqRange > &ranges, vector< CRef< CSeq_data > > &sequence_data)
Batch-version of GetSequence.
int m_NextLocalId
Our local "OID generator".
virtual int GetSeqLength(int oid)
@inheritDoc
map< int, CCachedSeqDataForRemote > m_Cache
Internal cache, maps OIDs to CCachedSeqDataForRemote.
virtual CRef< CSeq_data > GetSequence(int oid, int begin=0, int end=0)
@inheritDoc
virtual CRef< CBioseq > GetBioseqNoData(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_id=NULL)
@inheritDoc
string m_DbName
BLAST database name.
void x_FetchData(int oid, int begin, int end)
This method actually retrieves the sequence data.
bool m_UseFixedSizeSlices
Determines whether sequences should be fetched in fixed size slices or in incrementally larger sizes.
void x_FetchDataByBatch(const vector< int > &oids, const vector< TSeqRange > &ranges)
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDB.
Definition: seqdb.hpp:161
@ eProtein
Definition: seqdb.hpp:174
list< CRef< CSeq_id > > TSeqIdList
Convenience typedef for a list of CSeq_id-s.
std::ofstream out("events_result.xml")
main entry point for tests
static void GetSequenceParts(const TSeqIntervalVector &seqids, const string &database, char seqtype, TSeqIdVector &ids, TSeqDataVector &seq_data, string &errors, string &warnings, bool verbose=false)
This retrieves (partial) sequence data from the remote BLAST server.
static void GetSequencesInfo(TSeqIdVector &seqids, const string &database, char seqtype, TBioseqVector &bioseqs, string &errors, string &warnings, bool verbose=false, bool target_only=false)
Get a set of Bioseqs without their sequence data given an input set of Seq-ids.
bool IsValidBlastDb(const string &dbname, bool is_protein)
Returns true if the BLAST database specified exists in the NCBI servers.
vector< CRef< objects::CSeq_data > > TSeqDataVector
Defines a std::vector of CRef<CSeq_data>
vector< CRef< objects::CBioseq > > TBioseqVector
Defines a std::vector of CRef<CBioseq>
vector< CRef< objects::CSeq_interval > > TSeqIntervalVector
Defines a std::vector of CRef<CSeq_interval>
vector< CRef< objects::CSeq_id > > TSeqIdVector
Defines a std::vector of CRef<CSeq_id>
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool IsValid(const CSeq_point &pt, CScope *scope)
Checks that point >= 0 and point < length of Bioseq.
static TThisType GetEmpty(void)
Definition: range.hpp:306
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
int i
CBioseq_Info & GetBioseq(CTSE_Info &tse, const CBioObjectId &id)
const string kIsProtein
static bool RemoteBlastDbLoader_ErrorHandler(const string &errors, const string &warnings)
Returns false always.
Declaration of the CRemoteBlastDbAdapter class.
#define _ASSERT
#define const
Definition: zconf.h:230
Modified on Thu Dec 07 10:06:04 2023 by modify_doxy.py rev. 669887