NCBI C++ ToolKit
psi_pssm_input.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
28 
29 /** @file psi_pssm_input.cpp
30  * Implementation of the concrete strategy to obtain PSSM input data for
31  * PSI-BLAST.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <iomanip>
36 
37 // BLAST includes
40 #include "../core/blast_psi_priv.h"
41 
42 // Object includes
50 
51 // Object manager includes
52 #include <objmgr/scope.hpp>
53 #include <objmgr/seq_vector.hpp>
54 #include <objects/seq/Seq_data.hpp>
55 #include <objects/seq/Seqdesc.hpp>
58 
59 #include "psiblast_aux_priv.hpp"
60 
61 /** @addtogroup AlgoBlast
62  *
63  * @{
64  */
65 
68 BEGIN_SCOPE(blast)
69 
70 #ifndef GAP_IN_ALIGNMENT
71  /// Representation of GAP in Seq-align
72 # define GAP_IN_ALIGNMENT ((Uint4)-1)
73 #endif
74 
75 //////////////////////////////////////////////////////////////////////////////
76 
78  unsigned int query_length,
81  const PSIBlastOptions& opts,
82  const char* matrix_name,
83  int gap_existence /* = 0 */,
84  int gap_extension /* = 0 */,
85  const PSIDiagnosticsRequest* diags,
86  const string& query_title)
87  : m_GapExistence(gap_existence), m_GapExtension(gap_extension)
88 {
89  if ( !query ) {
90  NCBI_THROW(CBlastException, eInvalidArgument, "NULL query");
91  }
92 
93  if ( !sset || sset->Get().front()->GetDim() != 2) {
94  NCBI_THROW(CBlastException, eNotSupported,
95  "Only 2-dimensional alignments are supported");
96  }
97 
98  m_Query = new Uint1[query_length];
99  memcpy((void*) m_Query, (void*) query, query_length);
100  m_QueryTitle = query_title;
101 
102  m_Scope.Reset(scope);
103  m_SeqAlignSet.Reset(sset);
104  m_Opts = opts;
105 
106  m_MsaDimensions.query_length = query_length;
108  m_Msa = NULL;
109 
110  // Default value provided by base class
111  m_MatrixName = string(matrix_name ? matrix_name : "");
112  m_DiagnosticsRequest = const_cast<PSIDiagnosticsRequest*>(diags);
113 }
114 
116 {
117  delete [] m_Query;
118  PSIMsaFree(m_Msa);
119 }
120 
121 void
123 {
124 
125  _ASSERT(m_Query != NULL);
126 
127  // Update the number of aligned sequences
129 
130  // Create multiple alignment data structure and populate with query
131  // sequence
133  if ( !m_Msa ) {
134  NCBI_THROW(CBlastSystemException, eOutOfMemory,
135  "Multiple alignment data structure");
136  }
137 
141 }
142 
143 void
145 {
146  // Test our pre-conditions
149 
151  // set the sequence id
152  CRef<CSeq_align> aln =
153  const_cast<CSeq_align_set*>(&*m_SeqAlignSet)->Set().front();
154  CRef<CSeq_id> query_id(const_cast<CSeq_id*>(&aln->GetSeq_id(0)));
155  m_QueryBioseq->SetId().push_back(query_id);
156 
157  CRef<CSeqdesc> desc(new CSeqdesc);
158  desc->SetTitle(m_QueryTitle);
159  m_QueryBioseq->SetDescr().Set().push_back(desc);
160 
161  // set required Seq-inst fields
162  m_QueryBioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
163  m_QueryBioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
164  m_QueryBioseq->SetInst().SetLength(GetQueryLength());
165 
166  // set the sequence data in ncbistdaa format
167  CNCBIstdaa& seq = m_QueryBioseq->SetInst().SetSeq_data().SetNcbistdaa();
168  seq.Set().reserve(GetQueryLength());
169  for (TSeqPos i = 0; i < GetQueryLength(); i++) {
170  seq.Set().push_back(m_Query[i]);
171  }
172 
173  // Test our post-condition
175 }
176 
177 unsigned int
179 {
183  return hit_ids.size();
184 }
185 
186 unsigned int
188 {
189  // Process() should result in this field being assigned a non-zero value
191  return m_MsaDimensions.num_seqs;
192 }
193 
194 inline PSIMsa*
196 {
197  return m_Msa;
198 }
199 
200 inline unsigned char*
202 {
203  return m_Query;
204 }
205 
206 inline unsigned int
208 {
210 }
211 
212 inline const PSIBlastOptions*
214 {
215  return &m_Opts;
216 }
217 
218 inline const char*
220 {
221  if (m_MatrixName.length() != 0) {
222  return m_MatrixName.c_str();
223  } else {
225  }
226 }
227 
228 inline const PSIDiagnosticsRequest*
230 {
231  return m_DiagnosticsRequest;
232 }
233 
234 #if 0
235 void
236 CPsiBlastInputData::x_ExtractAlignmentDataUseBestAlign()
237 {
238  TSeqPos seq_index = 1; // Query sequence already processed
239 
240  // Note that in this implementation the m_ProcessHit vector is irrelevant
241  // because we assume the Seq-align contains only those sequences selected
242  // by the user (www-psi-blast). This could also be implemented by letting
243  // the calling code populate a vector like m_ProcessHit or specifying a
244  // vector of Seq-ids.
245  ITERATE(list< CRef<CSeq_align> >, itr, m_SeqAlignSet->Get()) {
246 
247  const CSeq_align::C_Segs::TDisc::Tdata& hsp_list =
248  (*itr)->GetSegs().GetDisc().Get();
249  CSeq_align::C_Segs::TDisc::Tdata::const_iterator best_alignment;
250  double min_evalue = numeric_limits<double>::max();
251 
252  // Search for the best alignment among all HSPs corresponding to this
253  // query-subject pair (hit)
254  ITERATE(CSeq_align::C_Segs::TDisc::Tdata, hsp_itr, hsp_list) {
255 
256  // Note: Std-seg can be converted to Denseg, will need
257  // conversion from Dendiag to Denseg too
258  if ( !(*hsp_itr)->GetSegs().IsDenseg() ) {
259  NCBI_THROW(CBlastException, eNotSupported,
260  "Segment type not supported");
261  }
262 
263  double evalue = s_GetLowestEvalue((*hsp_itr)->GetScore());
264  if (evalue < min_evalue) {
265  best_alignment = hsp_itr;
266  min_evalue = evalue;
267  }
268  }
269  _ASSERT(best_alignment != hsp_list.end());
270 
271  x_ProcessDenseg((*best_alignment)->GetSegs().GetDenseg(),
272  seq_index, min_evalue);
273 
274  seq_index++;
275 
276  }
277 
278  _ASSERT(seq_index == GetNumAlignedSequences()+1);
279 }
280 #endif
281 
282 void
284 {
285  _ASSERT(m_Msa);
286 
287  for (unsigned int i = 0; i < GetQueryLength(); i++) {
289  m_Msa->data[kQueryIndex][i].is_aligned = true;
290  }
291 }
292 
293 void
295 {
296  // Index into multiple sequence alignment structure, query sequence
297  // already processed
298  unsigned int msa_index = kQueryIndex + 1;
299 
300  CSeq_id* last_sid=NULL;
301 
302  // For each HSP...
304 
305  double bit_score;
306  double evalue = GetLowestEvalue((*itr)->GetScore(), &bit_score);
307  CSeq_id* current_sid = const_cast<CSeq_id*> (&(*itr)->GetSeq_id(1));
308 
309  // Increment msa_index (if appropriate) after all CDense_seg for a given target
310  // sequence have been processed.
311  if (last_sid && !current_sid->Match(*last_sid)) {
312  msa_index++;
313  }
314 
315  // ... below the e-value inclusion threshold
316  if (evalue < m_Opts.inclusion_ethresh) {
317  _ASSERT(msa_index < GetNumAlignedSequences() + 1);
318  const CDense_seg& seg = (*itr)->GetSegs().GetDenseg();
319  x_ProcessDenseg(seg, msa_index, evalue, bit_score);
320  }
321  last_sid = current_sid;
322  }
323 }
324 
325 void
326 CPsiBlastInputData::x_ProcessDenseg(const objects::CDense_seg& denseg,
327  unsigned int msa_index,
328  double evalue,
329  double bit_score)
330 {
331  _ASSERT(denseg.GetDim() == 2);
332 
333  const Uint1 GAP = AMINOACID_TO_NCBISTDAA[(Uint1)'-'];
334  const CDense_seg::TStarts& starts = denseg.GetStarts();
335  const CDense_seg::TLens& lengths = denseg.GetLens();
336  const int kNumSegments = denseg.GetNumseg();
337  const TSeqPos kDimensions = denseg.GetDim();
338  TSeqPos query_index = 0; // index into starts vector
339  TSeqPos subj_index = 1; // index into starts vector
340  TSeqPos subj_seq_idx = 0; // index into subject sequence buffer
341  string seq; // the sequence data
342 
343  // Get the portion of the subject sequence corresponding to this Dense-seg
344  x_GetSubjectSequence(denseg, *m_Scope, seq);
345 
346  // if this isn't available, set its corresponding row in the multiple
347  // sequence alignment to the query sequence so that it can be purged in
348  // PSIPurgeMatrix -> This is a hack, it should withdraw the sequence from
349  // the multiple sequence alignment structure!
350  if (seq.size() == 0) {
351  for (unsigned int i = 0; i < GetQueryLength(); i++) {
352  m_Msa->data[msa_index][i].letter = m_Query[i];
353  m_Msa->data[msa_index][i].is_aligned = true;
354  }
355  return;
356  }
357 
358 #ifdef DEBUG_PSSM_ENGINE
359  _ASSERT(denseg.CanGetIds() && denseg.GetIds().size() == 2);
360  if (denseg.GetIds().back()->IsGi()) {
361  m_Msa->seqinfo[msa_index].gi = denseg.GetIds().back()->GetGi();
362  }
363  m_Msa->seqinfo[msa_index].evalue = evalue;
364  m_Msa->seqinfo[msa_index].bit_score = bit_score;
365 #endif /* DEBUG_PSSM_ENGINE */
366 
367  // Iterate over all segments
368  for (int segmt_idx = 0; segmt_idx < kNumSegments; segmt_idx++) {
369 
370  TSeqPos query_offset = starts[query_index];
371  TSeqPos subject_offset = starts[subj_index];
372 
373  // advance the query and subject indices for next iteration
374  query_index += kDimensions;
375  subj_index += kDimensions;
376 
377  if (query_offset == GAP_IN_ALIGNMENT) {
378 
379  // gap in query, just skip residues on subject sequence
380  subj_seq_idx += lengths[segmt_idx];
381  continue;
382 
383  } else if (subject_offset == GAP_IN_ALIGNMENT) {
384 
385  // gap in subject, initialize appropriately
386  for (TSeqPos i = 0; i < lengths[segmt_idx]; i++) {
387  PSIMsaCell& msa_cell = m_Msa->data[msa_index][query_offset++];
388  if ( !msa_cell.is_aligned ) {
389  msa_cell.letter = GAP;
390  msa_cell.is_aligned = true;
391  }
392  }
393 
394  } else {
395 
396  // Aligned segments without any gaps
397  for (TSeqPos i = 0; i < lengths[segmt_idx]; i++, subj_seq_idx++) {
398  PSIMsaCell& msa_cell =
399  m_Msa->data[msa_index][query_offset++];
400  if ( !msa_cell.is_aligned ) {
401  msa_cell.letter = static_cast<Uint1>(seq[subj_seq_idx]);
402  msa_cell.is_aligned = true;
403  }
404  }
405  }
406 
407  }
408 
409 }
410 
411 void
412 CPsiBlastInputData::x_GetSubjectSequence(const objects::CDense_seg& ds,
413  objects::CScope& scope,
414  string& sequence_data)
415 {
416  _ASSERT(ds.GetDim() == 2);
417  TSeqPos subjlen = 0; // length of the return value
418  TSeqPos subj_start = kInvalidSeqPos; // start of subject alignment
419  bool subj_start_found = false;
420  const int kNumSegments = ds.GetNumseg();
421  const TSeqPos kDimensions = ds.GetDim();
422  TSeqPos subj_index = 1; // index into starts vector
423 
424  const CDense_seg::TStarts& starts = ds.GetStarts();
425  const CDense_seg::TLens& lengths = ds.GetLens();
426 
427  for (int i = 0; i < kNumSegments; i++) {
428 
429  if (starts[subj_index] != (TSignedSeqPos)GAP_IN_ALIGNMENT) {
430  if ( !subj_start_found ) {
431  subj_start = starts[subj_index];
432  subj_start_found = true;
433  }
434  subjlen += lengths[i];
435  }
436 
437  subj_index += kDimensions;
438  }
439  _ASSERT(subj_start_found);
440 
441  CSeq_loc seqloc(const_cast<CSeq_id&>(*ds.GetIds().back()), subj_start,
442  subj_start+subjlen-1);
443 
444  try {
445  CSeqVector sv(seqloc, scope);
447  sv.GetSeqData(0, kInvalidSeqPos, sequence_data);
448  } catch (const CException&) {
449  sequence_data.erase();
450  ERR_POST(Warning << "Failed to retrieve sequence " <<
451  seqloc.GetInt().GetId().AsFastaString());
452  }
453 }
454 
455 END_SCOPE(blast)
457 
458 /* @} */
User-defined methods of the data storage class.
Declares the BLAST exception class.
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
Definition: blast_psi.c:513
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
Definition: blast_psi.c:462
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
Defines BLAST error codes (user errors included)
Defines system exceptions that occurred while running BLAST.
CNCBIstdaa –.
Definition: NCBIstdaa.hpp:66
Auxiliary class to retrieve sequence identifiers its position in the alignment which are below the in...
CSeqVector –.
Definition: seq_vector.hpp:65
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
size_type size() const
Definition: set.hpp:132
void Process()
The work to process the alignment is done here.
unsigned int GetNumAlignedSequences() const
Returns the number of sequences that make up the multiple sequence alignment.
#define GAP_IN_ALIGNMENT
Representation of GAP in Seq-align.
CConstRef< objects::CSeq_align_set > m_SeqAlignSet
Pairwise alignment result of a BLAST search.
void x_CopyQueryToMsa()
Copies query sequence data to multiple alignment data structure.
PSIMsa * m_Msa
Structure representing the multiple sequence alignment.
PSIDiagnosticsRequest * m_DiagnosticsRequest
Diagnostics request structure.
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
Definition: pssm_input.hpp:68
const PSIBlastOptions * GetOptions()
Obtain the options for the PSSM engine.
PSIMsaDimensions m_MsaDimensions
Multiple sequence alignment dimensions.
PSIBlastOptions m_Opts
Algorithm options.
virtual ~CPsiBlastInputData()
virtual destructor
CRef< objects::CScope > m_Scope
Scope where to retrieve the sequences in the alignment from.
unsigned int x_CountAndSelectQualifyingAlignments()
Examines the sequence alignment and keeps track of those hits which have an HSP with an e-value below...
unsigned int GetQueryLength()
Get the query's length.
CRef< objects::CBioseq > m_QueryBioseq
Query as CBioseq for PSSM.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
void x_ProcessDenseg(const objects::CDense_seg &denseg, unsigned int msa_index, double evalue, double bit_score)
Iterates over the Dense-seg passed in and extracts alignment information to multiple alignment data s...
const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine.
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
double GetLowestEvalue(const objects::CDense_seg::TScores &scores, double *bit_score)
Returns the lowest score from the list of scores in CDense_seg::TScores.
void x_ExtractAlignmentData()
Populates the multiple alignment data structure.
unsigned char * m_Query
Pointer to query sequence.
CPsiBlastInputData(const unsigned char *query, unsigned int query_length, CConstRef< objects::CSeq_align_set > sset, CRef< objects::CScope > scope, const PSIBlastOptions &opts, const char *matrix_name=NULL, int gap_existence=0, int gap_opening=0, const PSIDiagnosticsRequest *diags=NULL, const string &query_title="")
Construct a concrete strategy, used to configure the CPssmEngine object.
string m_MatrixName
Underlying matrix to use.
string m_QueryTitle
Title of query.
unsigned char * GetQuery()
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
static void x_GetSubjectSequence(const objects::CDense_seg &ds, objects::CScope &scope, string &sequence_data)
Tries to fetch the sequence data for the subject for the segments specified in the Dense-seg.
void x_ExtractQueryForPssm()
Extracts the query bioseq from m_SeqAlignSet.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
TPrim & Set(void)
Definition: serialbase.hpp:351
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
void SetCoding(TCoding coding)
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
list< CRef< CSeq_align > > Tdata
const TId & GetId(void) const
Get the Id member data.
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
int i
T max(T x_, T y_)
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Declarations of auxiliary functions/classes for PSI-BLAST.
static const char * proc
Definition: stats.c:21
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double inclusion_ethresh
E-value threshold for inclusion in PSSM calculation (alignments with e-values below it are included).
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Definition: blast_psi.h:49
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
static string query
#define _ASSERT
Modified on Sat Dec 02 09:21:26 2023 by modify_doxy.py rev. 669887