NCBI C++ ToolKit
objmgr_query_data.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho, Kevin Bealer
27  *
28  */
29 
30 /** @file objmgr_query_data.cpp
31  * NOTE: This file contains work in progress and the APIs are likely to change,
32  * please do not rely on them until this notice is removed.
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbi_limits.hpp>
39 #include <objmgr/util/sequence.hpp>
40 #include "blast_setup.hpp"
41 #include "blast_objmgr_priv.hpp"
42 
44 #include "blast_seqalign.hpp"
45 
46 /** @addtogroup AlgoBlast
47  *
48  * @{
49  */
50 
53 BEGIN_SCOPE(blast)
54 
55 /////////////////////////////////////////////////////////////////////////////
56 
57 /// Produces a BioseqSet from a CBlastQueryVector
58 /// @param queries queries as a CBlastQueryVector
59 /// @retval CRef to BioseqSet
60 static CRef<CBioseq_set>
62 {
63  list< CRef<CSeq_entry> > se_list;
64 
65  for(size_t i = 0; i < queries.Size(); i++) {
66  CScope & scope = *queries.GetScope(i);
67 
68  const CBioseq * cbs =
69  scope.GetBioseqHandle(*queries.GetQuerySeqLoc(i)).GetBioseqCore();
70 
71  CRef<CBioseq> bs(const_cast<CBioseq*>(cbs));
72 
74  se->SetSeq(*bs);
75 
76  se_list.push_back(se);
77  }
78 
80  if ( !se_list.empty() ) {
81  rv.Reset(new CBioseq_set);
82  rv->SetSeq_set().swap(se_list);
83  }
84 
85  return rv;
86 }
87 
88 /// Produces a BioseqSet from a TSeqLocVector
89 /// @param queries queries as a TSeqLocVector
90 /// @retval Cref to BioseqSet
91 static CRef<CBioseq_set>
93 {
94  list< CRef<CSeq_entry> > se_list;
95 
96  ITERATE(TSeqLocVector, query, *queries) {
97  if ( !query->seqloc->GetId() ) {
98  continue;
99  }
100  const CBioseq * cbs =
101  query->scope->GetBioseqHandle(*query->seqloc->GetId()).GetBioseqCore();
102 
104  se->SetSeq(*const_cast<CBioseq*>(cbs));
105 
106  se_list.push_back(se);
107  }
108 
110  rv->SetSeq_set().swap(se_list);
111 
112  return rv;
113 }
114 
115 /// Produces a vector of SeqLocs from a TSeqLocVector
116 /// @param queries queries as a TSeqLocVector
117 /// @retval vector of SeqLocs.
120 {
122 
123  ITERATE(TSeqLocVector, query, *queries) {
124  CRef<CSeq_loc> sl(const_cast<CSeq_loc *>(&* query->seqloc));
125  retval.push_back(sl);
126  }
127 
128  return retval;
129 }
130 
131 /// Produces a vector of SeqLocs from a CBlastQueryVector
132 /// @param queries queries as a CBlastQueryVector
133 /// @retval vector of SeqLocs.
136 {
138 
139  for(size_t i = 0; i < queries.Size(); i++) {
140  CSeq_loc * slp =
141  const_cast<CSeq_loc *>(&* queries.GetQuerySeqLoc(i));
142 
143  retval.push_back(CRef<CSeq_loc>(slp));
144  }
145 
146  return retval;
147 }
148 
149 /////////////////////////////////////////////////////////////////////////////
150 //
151 // CObjMgr_LocalQueryData
152 //
153 /////////////////////////////////////////////////////////////////////////////
154 
155 /// Provides access (not ownership) to the C structures used to configure local
156 /// BLAST search class implementations.
158 {
159 public:
160  /// Ctor that takes a vector of SSeqLocs
161  /// @param queries queries as a vector of SSeqLoc [in]
162  /// @param options Blast options [in]
164  const CBlastOptions* options);
165  /// Ctor that takes a CBlastQueryVector (preferred over TSeqLocVector).
166  /// @param queries queries as a CBlastQueryVector [in]
167  /// @param options Blast options [in]
169  const CBlastOptions* options);
170 
172  virtual BlastQueryInfo* GetQueryInfo();
173 
174 
175  /// Get the number of queries.
176  virtual size_t GetNumQueries();
177 
178  /// Get the Seq_loc for the sequence indicated by index.
179  virtual CConstRef<CSeq_loc> GetSeq_loc(size_t index);
180 
181  /// Get the length of the sequence indicated by index.
182  virtual size_t GetSeqLength(size_t index);
183 
184 private:
185  const TSeqLocVector* m_Queries; ///< Adaptee in adapter design pattern
189 };
190 
192  const CBlastOptions * opts)
193  : m_Queries(queries), m_Options(opts)
194 {
195  m_QuerySource.Reset(new CBlastQuerySourceOM(*queries, opts));
196 }
197 
199  const CBlastOptions * opts)
200  : m_Queries(NULL), m_QueryVector(& qv), m_Options(opts)
201 {
202  m_QuerySource.Reset(new CBlastQuerySourceOM(qv, opts));
203 }
204 
207 {
208  if (m_SeqBlk.Get() == NULL) {
209  if (m_Queries || m_QueryVector.NotEmpty()) {
211  m_Options,
212  GetQueryInfo(),
213  m_Messages));
214  } else {
215  abort();
216  }
217  }
218  return m_SeqBlk.Get();
219 }
220 
223 {
224  if (m_QueryInfo.Get() == NULL) {
225  if (m_QuerySource) {
227  } else {
228  abort();
229  }
230  }
231  return m_QueryInfo.Get();
232 }
233 
234 size_t
236 {
237  size_t retval = m_QuerySource->Size();
238  _ASSERT(retval == (size_t)GetQueryInfo()->num_queries);
239  return retval;
240 }
241 
244 {
245  return m_QuerySource->GetSeqLoc(static_cast<int>(index));
246 }
247 
248 size_t
250 {
251  return m_QuerySource->GetLength(static_cast<int>(index));
252 }
253 
254 
255 /////////////////////////////////////////////////////////////////////////////
256 //
257 // CObjMgr_RemoteQueryData
258 //
259 /////////////////////////////////////////////////////////////////////////////
260 
262 {
263 public:
264  /// Construct query data from a TSeqLocVector.
265  /// @param queries Queries expressed as a TSeqLocVector.
266  CObjMgr_RemoteQueryData(const TSeqLocVector* queries);
267 
268  /// Construct query data from a CBlastQueryVector.
269  /// @param queries Queries expressed as a CBlastQueryVector.
271 
272  /// Accessor for the CBioseq_set.
274 
275  /// Accessor for the TSeqLocs.
276  virtual TSeqLocs GetSeqLocs();
277 
278 private:
279  /// Queries, if input representation is TSeqLocVector, or NULL.
281 
282  /// Queries, if input representation is a CBlastQueryVector, or NULL.
284 };
285 
287  : m_Queries(queries)
288 {}
289 
291  : m_QueryVector(& qv)
292 {}
293 
296 {
297  if (m_Bioseqs.Empty()) {
298  if (m_QueryVector.NotEmpty()) {
300  } else if (m_Queries) {
302  } else {
303  abort();
304  }
305  }
306  return m_Bioseqs;
307 }
308 
311 {
312  if (m_SeqLocs.empty()) {
313  if (m_QueryVector.NotEmpty()) {
315  } else if (m_Queries) {
317  } else {
318  abort();
319  }
320  }
321  return m_SeqLocs;
322 }
323 
324 /////////////////////////////////////////////////////////////////////////////
325 //
326 // CObjMgr_QueryFactory
327 //
328 /////////////////////////////////////////////////////////////////////////////
329 
331 {
332  if (queries.empty()) {
333  NCBI_THROW(CBlastException, eInvalidArgument, "Empty TSeqLocVector");
334  }
335 
336  bool found_packedint = false;
337  ITERATE(TSeqLocVector, itr, queries)
338  {
339  if (((*itr).seqloc)->IsPacked_int())
340  {
341  found_packedint = true;
342  break;
343  }
344  }
345 
346  if (found_packedint)
347  {
348  NON_CONST_ITERATE(TSeqLocVector, itr, queries)
349  {
350  if (((*itr).seqloc)->IsPacked_int())
351  {
352  CSeq_loc* mix = const_cast<CSeq_loc *> (&* (*itr).seqloc);
354  {
355  CRef<CSeq_loc> ival(new CSeq_loc);
356  ival->SetInt(**it);
357  m_SSeqLocVector.push_back(SSeqLoc(ival, (*itr).scope, (*itr).mask));
358  }
359  }
360  else
361  {
362  m_SSeqLocVector.push_back(*itr);
363  }
364  }
365  }
366  else
367  {
368  NON_CONST_ITERATE(TSeqLocVector, itr, queries)
369  {
370  m_SSeqLocVector.push_back(*itr);
371  }
372  }
373 }
374 
376  : m_QueryVector(& queries)
377 {
378  if (queries.Empty()) {
379  NCBI_THROW(CBlastException, eInvalidArgument, "Empty CBlastQueryVector");
380  }
381 }
382 
383 vector< CRef<CScope> >
385 {
386  vector< CRef<CScope> > retval;
387  if ( !m_SSeqLocVector.empty() ) {
389  retval.push_back(itr->scope);
390  } else if (m_QueryVector.NotEmpty()) {
392  retval.push_back(m_QueryVector->GetScope(i));
393  } else {
394  abort();
395  }
396  return retval;
397 }
398 
401 {
402  TSeqLocVector retval;
403  if ( !m_SSeqLocVector.empty() ) {
404  retval = m_SSeqLocVector;
405  } else if (m_QueryVector.NotEmpty()) {
406  // FIXME: this is inefficient as it might be copying the masks too many
407  // times
409  i < m_QueryVector->Size(); i++) {
411  CRef<CSeq_loc> masks;
413  if (conv_masks.NotEmpty()) {
414  masks.Reset(new CSeq_loc);
415  masks->SetPacked_int(*conv_masks);
416  }
418  m_QueryVector->GetScope(i), masks);
419  retval.push_back(sl);
420  }
421  } else {
422  abort();
423  }
424  return retval;
425 }
426 
427 /// Auxiliary function to help guess the program type from a CSeq-loc. This
428 /// should only be used in the context of
429 /// CObjMgr_QueryFactory::ExtractUserSpecifiedMasks
430 static EBlastProgramType
432 {
433  // if we cannot safely determine the program from the mask, specifying
434  // nucleotide query for a protein will result in a duplicate mask in the
435  // worst case... not great, but acceptable.
437  if (mask.Empty() || mask->GetStrand() == eNa_strand_unknown) {
438  return retval;
439  }
440 
441  return retval;
442 }
443 
446 {
447  TSeqLocInfoVector retval;
448  if ( !m_SSeqLocVector.empty() ) {
449  const EBlastProgramType kProgram =
450  s_GuessProgram(m_SSeqLocVector.front().mask);
452  TMaskedQueryRegions mqr =
453  PackedSeqLocToMaskedQueryRegions(itr->mask, kProgram,
454  itr->ignore_strand_in_mask);
455  retval.push_back(mqr);
456  }
457  } else if (m_QueryVector.NotEmpty()) {
459  retval.push_back(m_QueryVector->GetMaskedRegions(i));
460  } else {
461  abort();
462  }
463  return retval;
464 }
465 
468 {
469  CRef<ILocalQueryData> retval;
470 
471  if ( !m_SSeqLocVector.empty() ) {
472  retval.Reset(new CObjMgr_LocalQueryData(&m_SSeqLocVector, opts));
473  } else if (m_QueryVector.NotEmpty()) {
474  retval.Reset(new CObjMgr_LocalQueryData(*m_QueryVector, opts));
475  } else {
476  abort();
477  }
478 
479  return retval;
480 }
481 
484 {
485  CRef<IRemoteQueryData> retval;
486 
487  if ( !m_SSeqLocVector.empty() ) {
489  } else if (m_QueryVector.NotEmpty()) {
491  } else {
492  abort();
493  }
494 
495  return retval;
496 }
497 
498 
499 END_SCOPE(blast)
501 
502 /* @} */
Definitions which are dependent on the NCBI C++ Object Manager.
Declares class to encapsulate all BLAST options.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
Utility function to convert internal BLAST result structures into objects::CSeq_align_set objects.
Internal auxiliary setup classes/functions for C++ BLAST APIs.
ncbi::TMaskedQueryRegions mask
Defines BLAST error codes (user errors included)
Encapsulates ALL the BLAST algorithm's options.
Implements the object manager dependent version of the IBlastQuerySource.
Query Vector.
Definition: sseqloc.hpp:276
bool Empty() const
Returns true if this query vector is empty.
Definition: sseqloc.hpp:299
CRef< objects::CScope > GetScope(size_type i) const
Get the scope containing a query by index.
Definition: sseqloc.hpp:322
TMaskedQueryRegions GetMaskedRegions(size_type i) const
Get the masked regions for a query by number.
Definition: sseqloc.hpp:331
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
vector< value_type >::size_type size_type
size_type type definition
Definition: sseqloc.hpp:282
CConstRef< objects::CSeq_loc > GetQuerySeqLoc(size_type i) const
Get the query Seq-loc for a query by index.
Definition: sseqloc.hpp:313
Provides access (not ownership) to the C structures used to configure local BLAST search class implementations.
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
Provides access (not ownership) to the C structures used to configure local BLAST search class implem...
Definition: query_data.hpp:55
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
CRef< objects::CPacked_seqint > ConvertToCPacked_seqint() const
Converts this object to a CPacked_seqint (this is the convention used to encode masking locations in ...
Definition: seqlocinfo.cpp:127
CRef< ILocalQueryData > x_MakeLocalQueryData(const CBlastOptions *opts)
factory method to create an ILocalQueryData, only called if the data members above are not set
BlastQueryInfo * Get() const
Definition: blast_aux.hpp:311
TSearchMessages m_Messages
Error/warning messages are stored here.
Definition: query_data.hpp:107
CRef< CBlastQueryVector > m_QueryVector
TMaskedQueryRegions PackedSeqLocToMaskedQueryRegions(CConstRef< objects::CSeq_loc > sloc, EBlastProgramType program, bool assume_both_strands=false)
Auxiliary function to convert a Seq-loc describing masked query regions to a TMaskedQueryRegions obje...
TSeqLocVector GetTSeqLocVector()
Retrieves the TSeqLocVector used to construct this object or a conversion of the CBlastQueryVector pr...
CRef< CBlastQueryVector > m_QueryVector
virtual BLAST_SequenceBlk * GetSequenceBlk()
Accessor for the BLAST_SequenceBlk structure.
CRef< IBlastQuerySource > m_QuerySource
virtual CConstRef< objects::CSeq_loc > GetSeqLoc(int index) const =0
Return the CSeq_loc associated with a sequence.
CObjMgr_LocalQueryData(TSeqLocVector *queries, const CBlastOptions *options)
Ctor that takes a vector of SSeqLocs.
list< CRef< objects::CSeq_loc > > TSeqLocs
Type definition for CSeq_loc set used as queries in the BLAST remote search class.
Definition: query_data.hpp:123
virtual size_t GetNumQueries()
Get the number of queries.
const TSeqLocVector * m_Queries
Queries, if input representation is TSeqLocVector, or NULL.
static CRef< CBioseq_set > s_TSeqLocVectorToBioseqSet(const TSeqLocVector *queries)
Produces a BioseqSet from a TSeqLocVector.
virtual size_t GetSeqLength(size_t index)
Get the length of the sequence indicated by index.
CObjMgr_RemoteQueryData(const TSeqLocVector *queries)
Construct query data from a TSeqLocVector.
TSeqLocs m_SeqLocs
Data member to cache the TSeqLocs.
Definition: query_data.hpp:131
CObjMgr_QueryFactory(TSeqLocVector &queries)
ctor that takes a vector of SSeqLoc
BLAST_SequenceBlk * Get() const
Definition: blast_aux.hpp:309
static IRemoteQueryData::TSeqLocs s_TSeqLocVectorToTSeqLocs(const TSeqLocVector *queries)
Produces a vector of SeqLocs from a TSeqLocVector.
virtual TSeqPos GetLength(int index) const =0
Return the length of a sequence.
CRef< objects::CBioseq_set > m_Bioseqs
Data member to cache the CBioseq_set.
Definition: query_data.hpp:129
static IRemoteQueryData::TSeqLocs s_QueryVectorToTSeqLocs(const CBlastQueryVector &queries)
Produces a vector of SeqLocs from a CBlastQueryVector.
TSeqLocInfoVector ExtractUserSpecifiedMasks()
Retrieve any user specified masking locations.
void Reset(BLAST_SequenceBlk *p=NULL)
Definition: blast_aux.hpp:309
TSeqLocVector m_SSeqLocVector
virtual BlastQueryInfo * GetQueryInfo()
Accessor for the BlastQueryInfo structure.
static EBlastProgramType s_GuessProgram(CConstRef< CSeq_loc > mask)
Auxiliary function to help guess the program type from a CSeq-loc.
const CBlastOptions * m_Options
virtual CRef< objects::CBioseq_set > GetBioseqSet()
Accessor for the CBioseq_set.
CRef< IRemoteQueryData > x_MakeRemoteQueryData()
factory method to create an IRemoteQueryData, only called if the data members above are not set
void Reset(BlastQueryInfo *p=NULL)
Definition: blast_aux.hpp:311
BlastQueryInfo * SafeSetupQueryInfo(const IBlastQuerySource &queries, const CBlastOptions *options)
Wrapper around SetupQueryInfo.
virtual CConstRef< CSeq_loc > GetSeq_loc(size_t index)
Get the Seq_loc for the sequence indicated by index.
virtual TSeqPos Size() const =0
Return the number of elements in the sequence container.
vector< CRef< objects::CScope > > ExtractScopes()
Retrieve the CScope objects associated with the query sequences associated with this object.
const CRef< CBlastQueryVector > m_QueryVector
Queries, if input representation is a CBlastQueryVector, or NULL.
CBLAST_SequenceBlk m_SeqBlk
Data member to cache the BLAST_SequenceBlk.
Definition: query_data.hpp:102
const TSeqLocVector * m_Queries
Adaptee in adapter design pattern.
BLAST_SequenceBlk * SafeSetupQueries(IBlastQuerySource &queries, const CBlastOptions *options, BlastQueryInfo *query_info, TSearchMessages &messages)
Wrapper around SetupQueries.
CBlastQueryInfo m_QueryInfo
Data member to cache the BlastQueryInfo.
Definition: query_data.hpp:104
virtual TSeqLocs GetSeqLocs()
Accessor for the TSeqLocs.
static CRef< CBioseq_set > s_QueryVectorToBioseqSet(const CBlastQueryVector &queries)
Produces a BioseqSet from a CBlastQueryVector.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
list< CRef< CSeq_interval > > Tdata
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
int i
void abort()
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on them until this notice is removed.
Defines a concrete strategy for the IBlastSeqInfoSrc interface for sequence identifiers retrieval fro...
vector< TMaskedQueryRegions > TSeqLocInfoVector
Collection of masked regions for all queries in a BLAST search.
Definition: seqlocinfo.hpp:139
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Structure to hold a sequence.
Definition: blast_def.h:242
The query related information.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string query
#define _ASSERT
Modified on Thu Feb 22 17:11:18 2024 by modify_doxy.py rev. 669887