NCBI C++ ToolKit
psi_pssm_input.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_BLAST_API__PSI_PSSM_INPUT__HPP
2 #define ALGO_BLAST_API__PSI_PSSM_INPUT__HPP
3 
4 /* $Id: psi_pssm_input.hpp 51610 2011-10-18 12:21:49Z fongah2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Christiam Camacho
30  *
31  */
32 
33 /** @file psi_pssm_input.hpp
34  * Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
35  */
36 
37 #include <corelib/ncbiobj.hpp>
40 #include <objmgr/scope.hpp>
41 
42 /// Forward declaration for unit test classes
44 
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
49 
51 
52 // Forward declarations in objects scope
54 #ifndef SKIP_DOXYGEN_PROCESSING
55  class CSeq_align_set;
56  class CDense_seg;
57 #endif /* SKIP_DOXYGEN_PROCESSING */
59 
60 BEGIN_SCOPE(blast)
61 
62 /// Implements the interface to retrieve data for the last 2 stages of the PSSM
63 /// creation. Note that none of the data is owned by this class, it simply
64 /// returns pointers to the data it is passed
66 {
67 public:
68  /**
69  * @brief Constructor
70  *
71  * @param query pointer to query sequence data [in]
72  * @param query_length length of the query sequence data [in]
73  * @param freq_ratios matrix of frequency ratios [in]
74  * @param matrix_name name of underlying scoring matrix used [in]
75  * @param gap_existence cost to open a gap, if zero default from IPssmInputData used.
76  * @param gap_extension cost to open a gap, if zero default from IPssmInputData used.
77  */
78  CPsiBlastInputFreqRatios(const unsigned char* query,
79  unsigned int query_length,
80  const CNcbiMatrix<double>& freq_ratios,
81  const char* matrix_name = NULL,
82  int gap_existence = 0,
83  int gap_extension = 0,
84  double impala_scale_factor = 0)
85  : m_Query(const_cast<unsigned char*>(query)),
86  m_QueryLength(query_length),
87  m_MatrixName(matrix_name),
88  m_GapExistence(gap_existence),
89  m_GapExtension(gap_extension),
90  m_FreqRatios(freq_ratios),
91  m_ImpalaScaleFactor(impala_scale_factor)
92  {}
93 
94  /// No-op as we assume the data is passed in to the constructor
95  void Process() {}
96 
97  /// @inheritDoc
98  unsigned char* GetQuery() { return m_Query; }
99 
100  /// Get the query's length
101  unsigned int GetQueryLength() { return m_QueryLength; }
102 
103  /// Obtain the name of the underlying matrix to use when building the PSSM
104  const char* GetMatrixName() {
105  return m_MatrixName ? m_MatrixName :
107  }
108 
109  /// Obtain the gap existence value to use when building the PSSM
111  return m_GapExistence
112  ? m_GapExistence
114  }
115 
116  /// Obtain the gap extension value to use when building the PSSM
118  return m_GapExtension
119  ? m_GapExtension
121  }
122 
123  /// Obtain the IMPALA Scale Factor value to use when building the PSSM
125  return m_ImpalaScaleFactor
126  ? m_ImpalaScaleFactor
128  }
129 
130  /// Obtain a matrix of frequency ratios with this->GetQueryLength() columns
131  /// and BLASTAA_SIZE rows
133  return m_FreqRatios;
134  }
135 
136 private:
137  /// Query sequence data
138  unsigned char* m_Query;
139  /// Length of query sequence data
140  unsigned int m_QueryLength;
141  /// Name of underlying scoring matrix
142  const char* m_MatrixName;
143  /// Gap existence penalty
145  /// Gap extension penalty
147  /// Frequency ratios
149 
150  // IMPALA Scale Factor
152 };
153 
154 /// This class is a concrete strategy for IPssmInputData, and it
155 /// implements the traditional PSI-BLAST algorithm for building a multiple
156 /// sequence alignment from a list of pairwise alignments using the C++ object
157 /// manager.
159 {
160 public:
161  /// Construct a concrete strategy, used to configure the CPssmEngine object
162  /// @param query query sequence for the alignment in ncbistdaa encoding.
163  /// @param query_length length of the sequence above.
164  /// @param sset pairwise alignment produced by BLAST where query was the
165  /// query sequence.
166  /// @param scope object manager scope from which to retrieve sequence data
167  /// [in]
168  /// @param opts options to be used in the PSSM engine
169  /// @param matrix_name name of the substitution matrix to use to build PSSM
170  /// If not provided, the default implementation of
171  /// IPssmInputData::GetMatrixName() will be returned
172  /// @param gap_existence cost to open a gap, if zero default from IPssmInputData used.
173  /// @param gap_extension cost to open a gap, if zero default from IPssmInputData used.
174  /// @param diags diagnostics data requests for the PSSM engine
175  CPsiBlastInputData(const unsigned char* query,
176  unsigned int query_length,
178  CRef<objects::CScope> scope,
179  const PSIBlastOptions& opts,
180  const char* matrix_name = NULL,
181  int gap_existence = 0,
182  int gap_opening = 0,
183  const PSIDiagnosticsRequest* diags = NULL,
184  const string& query_title = "");
185 
186  /// virtual destructor
187  virtual ~CPsiBlastInputData();
188 
189  /// The work to process the alignment is done here
190  void Process();
191 
192  /// Get the query sequence used as master for the multiple sequence
193  /// alignment in ncbistdaa encoding.
194  unsigned char* GetQuery();
195 
196  /// Get the query's length
197  unsigned int GetQueryLength();
198 
199  /// Obtain the multiple sequence alignment structure
200  PSIMsa* GetData();
201 
202  /// Obtain the options for the PSSM engine
203  const PSIBlastOptions* GetOptions();
204 
205  /// Obtain the name of the underlying matrix to use when building the PSSM
206  const char* GetMatrixName();
207 
208  /// Obtain the gap existence value to use when building the PSSM
210  return m_GapExistence
211  ? m_GapExistence
213  }
214 
215  /// Obtain the gap extension value to use when building the PSSM
217  return m_GapExtension
218  ? m_GapExtension
220  }
221 
222  /// Obtain the diagnostics data that is requested from the PSSM engine
224 
225  /// @inheritDoc
227  return m_QueryBioseq;
228  }
229 
230 private:
231 
232  /// Pointer to query sequence
233  unsigned char* m_Query;
234  /// Title of query
235  string m_QueryTitle;
236  /// Scope where to retrieve the sequences in the aligment from
238  /// Structure representing the multiple sequence alignment
240  /// Multiple sequence alignment dimensions
242  /// Pairwise alignment result of a BLAST search
244  /// Algorithm options
246  /// Diagnostics request structure
248  /// Underlying matrix to use
249  string m_MatrixName;
250  /// Gap existence paramter used.
252  /// Gap extension paramter used.
254  /// Query as CBioseq for PSSM
256 
257  /////////////////////////// Auxiliary functions ///////////////////////////
258 
259  /// Tries to fetch the sequence data for the subject for the segments
260  /// specified in the Dense-seg. If the sequence cannot be retrieved from the
261  /// scope, a warning is printed and an empty string is returned in
262  /// sequence_data
263  /// @param ds dense seg for which the sequence data is needed [in]
264  /// @param scope scope from which to obtain the sequence data [in]
265  /// @param sequence_data string which will contain the sequence data.
266  static void
267  x_GetSubjectSequence(const objects::CDense_seg& ds, objects::CScope& scope,
268  string& sequence_data);
269 
270  /// Examines the sequence alignment and keeps track of those hits which
271  /// have an HSP with an e-value below the inclusion threshold specified in
272  /// the PSIBlastOptions structure.
273  /// @return number of hits which qualify for constructing the multiple
274  /// sequence alignment structure
275  unsigned int
276  x_CountAndSelectQualifyingAlignments();
277 
278  /// Populates the multiple alignment data structure
279  void x_ExtractAlignmentData();
280  // First implementation of use_best_align option from old toolkit. Should
281  // be implemented as a subclass of this one?
282  //void x_ExtractAlignmentDataUseBestAlign();
283 
284  /// Copies query sequence data to multiple alignment data structure
285  void x_CopyQueryToMsa();
286 
287  /// Returns the number of sequences that make up the multiple sequence
288  /// alignment
289  /// @throws CBlastException if this number hasn't been calculated yet (need
290  /// to invoke Process() first!)
291  unsigned int GetNumAlignedSequences() const;
292 
293  /// Iterates over the Dense-seg passed in and extracts alignment
294  /// information to multiple alignment data structure.
295  /// @param denseg source alignment segment (HSP) [in]
296  /// @param msa_index index of the sequence aligned with the query in the
297  /// desc_matrix field of the m_AlignmentData data member [in]
298  /// @param evalue evalue for this sequence aligned with the query (used for
299  /// debugging only) [in]
300  /// @param bit_score bit score for this sequence aligned with the query
301  /// (used for debugging only) [in]
302  void x_ProcessDenseg(const objects::CDense_seg& denseg,
303  unsigned int msa_index,
304  double evalue, double bit_score);
305 
306  /// Extracts the query bioseq from m_SeqAlignSet
307  void x_ExtractQueryForPssm();
308 
309  /// unit test class
310  friend class ::CPssmCreateTestFixture;
311 
312 private:
313  /// prohibit copy constructor
315  /// prohibit assignment operator
317 };
318 
319 END_SCOPE(blast)
321 
322 /* @} */
323 
324 #endif /* ALGO_BLAST_API__PSI_PSSM_INPUT_HPP */
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
This class is a concrete strategy for IPssmInputData, and it implements the traditional PSI-BLAST alg...
Implements the interface to retrieve data for the last 2 stages of the PSSM creation.
This class exists merely to call private methods in CPsiBlastInputData and CPssmEngine.
int GetGapExistence()
Obtain the gap existence value to use when building the PSSM.
virtual void Process()=0
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
double GetImpalaScaleFactor()
Obtain the IMPALA Scale Factor value to use when building the PSSM.
CConstRef< objects::CSeq_align_set > m_SeqAlignSet
Pairwise alignment result of a BLAST search.
PSIMsa * m_Msa
Structure representing the multiple sequence alignment.
PSIDiagnosticsRequest * m_DiagnosticsRequest
Diagnostics request structure.
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
Definition: pssm_input.hpp:68
CRef< objects::CBioseq > GetQueryForPssm()
@inheritDoc
unsigned int m_QueryLength
Length of query sequence data.
virtual unsigned char * GetQuery()=0
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
const CNcbiMatrix< double > & GetData()
Obtain a matrix of frequency ratios with this->GetQueryLength() columns and BLASTAA_SIZE rows.
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine Its results will be populated in t...
Definition: pssm_input.hpp:123
virtual double GetImpalaScaleFactor()
Definition: pssm_input.hpp:144
PSIMsaDimensions m_MsaDimensions
Multiple sequence alignment dimensions.
virtual int GetGapExistence()
Obtain the gap existence value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:73
PSIBlastOptions m_Opts
Algorithm options.
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
virtual unsigned int GetQueryLength()=0
Get the query's length.
unsigned int GetQueryLength()
Get the query's length.
int m_GapExistence
Gap existence paramter used.
unsigned char * GetQuery()
@inheritDoc
CRef< objects::CScope > m_Scope
Scope where to retrieve the sequences in the aligment from.
virtual int GetGapExtension()
Obtain the gap extension value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:78
virtual const PSIBlastOptions * GetOptions()=0
Obtain the options for the PSSM engine.
const char * m_MatrixName
Name of underlying scoring matrix.
CRef< objects::CBioseq > m_QueryBioseq
Query as CBioseq for PSSM.
int GetGapExtension()
Obtain the gap extension value to use when building the PSSM.
int GetGapExistence()
Obtain the gap existence value to use when building the PSSM.
int m_GapExtension
Gap extension penalty.
int GetGapExtension()
Obtain the gap extension value to use when building the PSSM.
CPsiBlastInputFreqRatios(const unsigned char *query, unsigned int query_length, const CNcbiMatrix< double > &freq_ratios, const char *matrix_name=NULL, int gap_existence=0, int gap_extension=0, double impala_scale_factor=0)
Constructor.
int m_GapExtension
Gap extension paramter used.
CPsiBlastInputData & operator=(const CPsiBlastInputData &)
prohibit assignment operator
unsigned char * m_Query
Pointer to query sequence.
int m_GapExistence
Gap existence penalty.
unsigned char * m_Query
Query sequence data.
virtual PSIMsa * GetData()=0
Obtain the multiple sequence alignment structure.
CPsiBlastInputData(const CPsiBlastInputData &)
prohibit copy constructor
CNcbiMatrix< double > m_FreqRatios
Frequency ratios.
string m_MatrixName
Underlying matrix to use.
string m_QueryTitle
Title of query.
void Process()
No-op as we assume the data is passed in to the constructor.
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Defines interface for a sequence alignment processor that can populate a multiple alignment data stru...
Abstract base class to encapsulate the source(s) and pre-processing of PSSM input data as well as opt...
Definition: pssm_input.hpp:106
Interface used to retrieve the PSSM frequency ratios to allow for "restart" processing in PSI-BLAST: ...
Definition: pssm_input.hpp:131
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
static string query
Modified on Mon May 27 04:39:38 2024 by modify_doxy.py rev. 669887