NCBI C++ ToolKit
pssm_engine.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
28 
29 /** @file pssm_engine.cpp
30  * Implementation of the C++ API for the PSI-BLAST PSSM generation engine.
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <sstream>
35 
38 #include "blast_setup.hpp"
39 
40 // Object includes
48 
49 // Core BLAST includes
53 #include "../core/blast_psi_priv.h"
54 
55 /** @addtogroup AlgoBlast
56  *
57  * @{
58  */
59 
62 BEGIN_SCOPE(blast)
63 
64 /// This function makes sure that none of the required data is returned as NULL
65 /// or "empty"
66 /// @param pssm_input_msa interface which provides the data [in]
67 /// @throw CPssmEngineException in case of validation failure
68 static void
70 {
71  if ( !pssm_input_msa ) {
72  NCBI_THROW(CPssmEngineException, eNullInputData,
73  "IPssmInputData is NULL");
74  }
75 
76  if ( !pssm_input_msa->GetOptions() ) {
77  NCBI_THROW(CPssmEngineException, eNullInputData,
78  "IPssmInputData returns NULL PSIBlastOptions");
79  }
80 
81  if ( !pssm_input_msa->GetQuery() ) {
82  NCBI_THROW(CPssmEngineException, eNullInputData,
83  "IPssmInputData returns NULL query sequence");
84  }
85 
86  if (pssm_input_msa->GetQueryLength() == 0) {
87  NCBI_THROW(CPssmEngineException, eNullInputData,
88  "Query length provided by IPssmInputData is 0");
89  }
90 }
91 
92 /// This function makes sure that none of the required data is returned as NULL
93 /// or "empty"
94 /// @param pssm_input_freqratios interface which provides the data [in]
95 /// @throw CPssmEngineException in case of validation failure
96 static void
98 {
99  if ( !pssm_input_freqratios ) {
100  NCBI_THROW(CPssmEngineException, eNullInputData,
101  "IPssmInputFreqRatios is NULL");
102  }
103 
104  if ( !pssm_input_freqratios->GetQuery() ) {
105  NCBI_THROW(CPssmEngineException, eNullInputData,
106  "IPssmInputFreqRatiosFreqRatios returns NULL query sequence");
107  }
108 
109  const unsigned int kQueryLength = pssm_input_freqratios->GetQueryLength();
110  if (kQueryLength == 0) {
111  NCBI_THROW(CPssmEngineException, eInvalidInputData,
112  "Query length provided by IPssmInputFreqRatiosFreqRatios is 0");
113  }
114 
115  if (pssm_input_freqratios->GetData().GetCols() != kQueryLength) {
116  NCBI_THROW(CPssmEngineException, eInvalidInputData,
117  "Number of columns returned by IPssmInputFreqRatiosFreqRatios does "
118  "not match query length");
119  }
120  if (pssm_input_freqratios->GetData().GetRows() != BLASTAA_SIZE) {
121  NCBI_THROW(CPssmEngineException, eInvalidInputData,
122  "Number of rows returned by IPssmInputFreqRatiosFreqRatios differs "
123  "from " + NStr::IntToString(BLASTAA_SIZE));
124  }
125 }
126 
127 /// Performs validation on data provided before invoking the CORE PSSM
128 /// engine. Should be called after invoking Process() on its argument
129 /// @throws CPssmEngineException if validation fails
130 static void
131 s_Validate(IPssmInputData* pssm_input_msa)
132 {
133  _ASSERT(pssm_input_msa);
134 
135  if ( !pssm_input_msa->GetData() ) {
136  NCBI_THROW(CPssmEngineException, eNullInputData,
137  "IPssmInputData returns NULL multiple sequence alignment");
138  }
139 
140  Blast_Message* errors = NULL;
141  if (PSIBlastOptionsValidate(pssm_input_msa->GetOptions(), &errors) != 0) {
142  string msg("IPssmInputData returns invalid PSIBlastOptions: ");
143  msg += string(errors->message);
144  errors = Blast_MessageFree(errors);
145  NCBI_THROW(CBlastException, eInvalidOptions, msg);
146  }
147 }
148 
149 /// Performs validation on data provided before invoking the CORE PSSM
150 /// engine. Should be called after invoking Process() on its argument
151 /// @throws CPssmEngineException if validation fails
152 static void
154 {
155  _ASSERT(pssm_input);
156 
157  if ( !pssm_input->GetData() ) {
158  NCBI_THROW(CPssmEngineException, eNullInputData,
159  "IPssmInputData returns NULL multiple sequence alignment");
160  }
161 
162  Blast_Message* errors = NULL;
163  if (PSIBlastOptionsValidate(pssm_input->GetOptions(), &errors) != 0) {
164  string msg("IPssmInputData returns invalid PSIBlastOptions: ");
165  msg += string(errors->message);
166  errors = Blast_MessageFree(errors);
167  NCBI_THROW(CBlastException, eInvalidOptions, msg);
168  }
169 }
170 
171 
172 /// Performs validation on data provided before invoking the CORE PSSM
173 /// engine. Should be called after invoking Process() on its argument
174 /// @throws CPssmEngineException if validation fails
175 static void
177 {
178  _ASSERT(pssm_input_fr);
179 
180  ITERATE(CNcbiMatrix<double>, itr, pssm_input_fr->GetData()) {
181  if (*itr < 0.0) {
182  NCBI_THROW(CPssmEngineException, eInvalidInputData,
183  "PSSM frequency ratios cannot have negative values");
184  }
185  }
186 }
187 
189  : m_PssmInput(input), m_PssmInputFreqRatios(NULL)
190 {
194 }
195 
197  : m_PssmInput(NULL), m_PssmInputFreqRatios(input)
198 {
202 }
203 
205  m_PssmInputFreqRatios(NULL),
206  m_PssmInputCdd(input)
207 {
208  x_InitializeScoreBlock(input->GetQuery(), input->GetQueryLength(),
209  input->GetMatrixName(), input->GetGapExistence(),
210  input->GetGapExtension());
211 }
212 
214 {
215 }
216 
217 string
219 {
220  string retval;
221 
222  switch (error_code) {
223  case PSI_SUCCESS:
224  retval = "No error detected";
225  break;
226 
227  case PSIERR_BADPARAM:
228  retval = "Bad argument to function detected";
229  break;
230 
231  case PSIERR_OUTOFMEM:
232  retval = "Out of memory";
233  break;
234 
236  retval = "Error computing sequence weights";
237  break;
238 
239  case PSIERR_NOFREQRATIOS:
240  retval = "No matrix frequency ratios were found for requested matrix";
241  break;
242 
244  retval = "PSSM has positive average score";
245  break;
246 
248  retval = "No sequences left after purging biased sequences in ";
249  retval += "multiple sequence alignment";
250  break;
251 
252  case PSIERR_GAPINQUERY:
253  retval = "Gap found in query sequence";
254  break;
255 
257  retval = "Found column with no sequences aligned in it";
258  break;
259 
260  case PSIERR_COLUMNOFGAPS:
261  retval = "Found column with only GAP residues";
262  break;
263 
264  case PSIERR_STARTINGGAP:
265  retval = "Found flanking gap at start of alignment";
266  break;
267 
268  case PSIERR_ENDINGGAP:
269  retval = "Found flanking gap at end of alignment";
270  break;
271 
272  case PSIERR_BADPROFILE:
273  retval = "Errors in conserved domain profile";
274  break;
275 
276  default:
277  retval = "Unknown error code returned from PSSM engine: " +
278  NStr::IntToString(error_code);
279  }
280 
281  return retval;
282 }
283 
286 {
287  if (m_PssmInput) {
288  return x_CreatePssmFromMsa();
289  }
290 
291  if (m_PssmInputFreqRatios) {
293  }
294 
295  if (m_PssmInputCdd) {
296  return x_CreatePssmFromCDD();
297  }
298 
299  NCBI_THROW(CPssmEngineException, eNullInputData, "All pointers to pre-"
300  "processing input data strategies are null");
301 }
302 
303 /// Auxiliary class to convert from a CNcbiMatrix into a double** as
304 /// required by the C API. Used only by CPssmEngine::x_CreatePssmFromFreqRatios
306 {
307  /// Constructor
308  /// @param m standard c++ toolkit matrix
310  : m_NumCols(m.GetCols())
311  {
312  m_Data = new double*[m.GetCols()];
313  for (size_t c = 0; c < m.GetCols(); c++) {
314  m_Data[c] = new double[m.GetRows()];
315  for (size_t r = 0; r < m.GetRows(); r++) {
316  m_Data[c][r] = m(r, c);
317  }
318  }
319  }
320 
321  /// Destructor
323  for (size_t c = 0; c < m_NumCols; c++) {
324  delete [] m_Data[c];
325  }
326  delete [] m_Data;
327  }
328 
329  /// Retrieves data in the format expected by the C CORE APIs
330  operator double**() { return m_Data; }
331 
332 private:
333  /// double** representation of a CNcbiMatrix
334  double** m_Data;
335  /// number of columns in the matrix (for deallocation)
336  size_t m_NumCols;
337 };
338 
341 {
343 
346 
347  CPSIMatrix pssm;
349 
350  int status =
354  m_ScoreBlk,
355  freq_ratios,
357  //kPSSM_NoImpalaScaling,
358  &pssm);
359  if (status != PSI_SUCCESS) {
360  string msg = x_ErrorCodeToString(status);
361  NCBI_THROW(CBlastException, eCoreBlastError, msg);
362  }
363 
364  // Convert core BLAST matrix structure into ASN.1 score matrix object
368  if (query.NotEmpty()) {
369  retval->SetQuery().SetSeq(*query);
370  }
371 
372  return retval;
373 }
374 
377 {
379 
380  m_PssmInput->Process();
382 
383  CPSIMatrix pssm;
384  CPSIDiagnosticsResponse diagnostics;
385  int status =
388  m_ScoreBlk,
390  &pssm,
391  &diagnostics);
392  if (status != PSI_SUCCESS) {
393  // FIXME: need to use core level perror-like facility
394  string msg = x_ErrorCodeToString(status);
395  NCBI_THROW(CBlastException, eCoreBlastError, msg);
396  }
397 
398  // Convert core BLAST matrix structure into ASN.1 score matrix object
400  retval = x_PSIMatrix2Asn1(pssm, m_PssmInput->GetMatrixName(),
401  m_PssmInput->GetOptions(), diagnostics);
403  if (query.NotEmpty()) {
404  retval->SetQuery().SetSeq(*query);
405  }
406 
407  return retval;
408 }
409 
410 
413 {
415 
418 
419  CPSIMatrix pssm;
420  CPSIDiagnosticsResponse diagnostics;
421  int status =
424  m_ScoreBlk,
426  &pssm,
427  &diagnostics);
428 
429  if (status != PSI_SUCCESS) {
430  // FIXME: need to use core level perror-like facility
431  string msg = x_ErrorCodeToString(status);
432  NCBI_THROW(CBlastException, eCoreBlastError, msg);
433  }
434 
435  // Convert core BLAST matrix structure into ASN.1 score matrix object
437  retval = x_PSIMatrix2Asn1(pssm, m_PssmInputCdd->GetMatrixName(),
438  m_PssmInputCdd->GetOptions(), diagnostics);
439 
441  if (query.NotEmpty()) {
442  retval->SetQuery().SetSeq(*query);
443  }
444 
445  return retval;
446 }
447 
448 unsigned char*
450  unsigned int query_length)
451 {
452  _ASSERT(query);
453 
454  unsigned char* retval = NULL;
455  retval = (unsigned char*) malloc(sizeof(unsigned char)*(query_length + 2));
456  if ( !retval ) {
457  NCBI_THROW(CBlastSystemException, eOutOfMemory, "Query with sentinels");
458  }
459 
460  retval[0] = retval[query_length+1] = GetSentinelByte(eBlastEncodingProtein);
461  memcpy((void*) &retval[1], (void*) query, query_length);
462  return retval;
463 }
464 
466 CPssmEngine::x_InitializeQueryInfo(unsigned int query_length)
467 {
468  const int kNumQueries = 1;
469  BlastQueryInfo* retval = BlastQueryInfoNew(eBlastTypeBlastp, kNumQueries);
470 
471  if ( !retval ) {
472  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BlastQueryInfo");
473  }
474 
475  retval->contexts[0].query_offset = 0;
476  retval->contexts[0].query_length = query_length;
477  retval->max_length = query_length;
478 
479  return retval;
480 }
481 
482 void
484  ancillary_data)
485 {
486  _ASSERT(m_ScoreBlk.Get() != NULL);
487  _ASSERT(ancillary_data.NotEmpty());
488  if (ancillary_data->GetPsiUngappedKarlinBlk()) {
490  m_ScoreBlk->kbp_psi[0]->Lambda =
491  ancillary_data->GetPsiUngappedKarlinBlk()->Lambda;
492  m_ScoreBlk->kbp_psi[0]->K =
493  ancillary_data->GetPsiUngappedKarlinBlk()->K;
495  m_ScoreBlk->kbp_psi[0]->H =
496  ancillary_data->GetPsiUngappedKarlinBlk()->H;
497  }
498 
499  if (ancillary_data->GetPsiGappedKarlinBlk()) {
502  ancillary_data->GetPsiGappedKarlinBlk()->Lambda;
503  m_ScoreBlk->kbp_gap_psi[0]->K =
504  ancillary_data->GetPsiGappedKarlinBlk()->K;
506  m_ScoreBlk->kbp_gap_psi[0]->H =
507  ancillary_data->GetPsiGappedKarlinBlk()->H;
508  }
509 }
510 
511 void
513  unsigned int query_length,
514  const char* matrix_name,
515  int gap_existence,
516  int gap_extension)
517 {
518  _ASSERT(query);
519  _ASSERT(matrix_name);
520 
521  const EBlastProgramType kProgramType = eBlastTypePsiBlast;
522  short status = 0;
523 
524  TAutoUint1Ptr guarded_query(x_GuardProteinQuery(query, query_length));
525 
526  // Setup the scoring options
528  status = BlastScoringOptionsNew(kProgramType, &opts);
529  if (status != 0) {
530  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BlastScoringOptions");
531  }
532  BlastScoringOptionsSetMatrix(opts, matrix_name);
533  opts->gap_open = gap_existence;
534  opts->gap_extend = gap_extension;
535 
536  // Setup the sequence block structure
537  CBLAST_SequenceBlk query_blk;
538  status = BlastSeqBlkNew(&query_blk);
539  if (status != 0) {
540  NCBI_THROW(CBlastSystemException, eOutOfMemory, "BLAST_SequenceBlk");
541  }
542 
543  // Populate the sequence block structure, transferring ownership of the
544  // guarded protein sequence
545  status = BlastSeqBlkSetSequence(query_blk, guarded_query.release(),
546  query_length);
547  if (status != 0) {
548  // should never happen, previous function only performs assignments
549  abort();
550  }
551 
552  // Setup the query info structure
553  CBlastQueryInfo query_info(x_InitializeQueryInfo(query_length));
554 
555  BlastScoreBlk* retval = NULL;
556  Blast_Message* errors = NULL;
557  const double kScaleFactor = 1.0;
558  status = BlastSetup_ScoreBlkInit(query_blk,
559  query_info,
560  opts,
561  kProgramType,
562  &retval,
563  kScaleFactor,
564  &errors,
566  if (status != 0) {
567  retval = BlastScoreBlkFree(retval);
568  if (errors) {
569  string msg(errors->message);
570  errors = Blast_MessageFree(errors);
571  NCBI_THROW(CBlastException, eCoreBlastError, msg);
572  } else {
573  NCBI_THROW(CBlastException, eCoreBlastError,
574  "Unknown error when setting up BlastScoreBlk");
575  }
576  }
577 
578  _ASSERT(retval->kbp_ideal);
579  _ASSERT(retval->kbp == retval->kbp_psi);
580  _ASSERT(retval->kbp_gap == retval->kbp_gap_psi);
581 
582  m_ScoreBlk.Reset(retval);
583 }
584 
585 unsigned char*
587 {
588  return (m_PssmInput ?
590 }
591 
592 unsigned int
594 {
595  return (m_PssmInput ?
598 }
599 
600 const char*
602 {
603  return (m_PssmInput ?
606 }
607 
608 int
610 {
611  return (m_PssmInput ?
614 }
615 
616 int
618 {
619  return (m_PssmInput ?
622 }
623 
626  const char* matrix_name,
627  const PSIBlastOptions* opts,
628  const PSIDiagnosticsResponse* diagnostics)
629 {
630  _ASSERT(pssm);
631 
633 
634  // Record the parameters
635  string mtx(matrix_name);
636  mtx = NStr::ToUpper(mtx); // save the matrix name in all capital letters
637  retval->SetParams().SetRpsdbparams().SetMatrixName(mtx);
638  if (opts) {
639  retval->SetParams().SetPseudocount(opts->pseudo_count);
640  }
641 
642  CPssm& asn1_pssm = retval->SetPssm();
643  asn1_pssm.SetIsProtein(true);
644  // number of rows is alphabet size
645  asn1_pssm.SetNumRows(pssm->nrows);
646  // number of columns is query length
647  asn1_pssm.SetNumColumns(pssm->ncols);
648  asn1_pssm.SetByRow(false); // this is the default
649 
650  asn1_pssm.SetLambda(pssm->lambda);
651  asn1_pssm.SetKappa(pssm->kappa);
652  asn1_pssm.SetH(pssm->h);
653  asn1_pssm.SetLambdaUngapped(pssm->ung_lambda);
654  asn1_pssm.SetKappaUngapped(pssm->ung_kappa);
655  asn1_pssm.SetHUngapped(pssm->ung_h);
656  if (asn1_pssm.GetByRow() == false) {
657  for (unsigned int i = 0; i < pssm->ncols; i++) {
658  for (unsigned int j = 0; j < pssm->nrows; j++) {
659  asn1_pssm.SetFinalData().SetScores().
660  push_back(pssm->pssm[i][j]);
661  }
662  }
663  } else {
664  for (unsigned int i = 0; i < pssm->nrows; i++) {
665  for (unsigned int j = 0; j < pssm->ncols; j++) {
666  asn1_pssm.SetFinalData().SetScores().
667  push_back(pssm->pssm[j][i]);
668  }
669  }
670  }
671  if (opts && opts->impala_scaling_factor != kPSSM_NoImpalaScaling) {
672  asn1_pssm.SetFinalData().
673  SetScalingFactor(static_cast<int>(opts->impala_scaling_factor));
674  }
675 
676  /********** Collect information from diagnostics structure ************/
677  if ( !diagnostics ) {
678  return retval;
679  }
680 
681  _ASSERT(pssm->nrows == diagnostics->alphabet_size);
682  _ASSERT(pssm->ncols == diagnostics->query_length);
683 
684  if (diagnostics->information_content) {
686  asn1_pssm.SetIntermediateData().SetInformationContent();
687  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
688  info_content.push_back(diagnostics->information_content[i]);
689  }
690  }
691 
692  if (diagnostics->residue_freqs) {
694  asn1_pssm.SetIntermediateData().SetResFreqsPerPos();
695  if (asn1_pssm.GetByRow() == false) {
696  for (unsigned int i = 0; i < pssm->ncols; i++) {
697  for (unsigned int j = 0; j < pssm->nrows; j++) {
698  res_freqs.push_back(diagnostics->residue_freqs[i][j]);
699  }
700  }
701  } else {
702  for (unsigned int i = 0; i < pssm->nrows; i++) {
703  for (unsigned int j = 0; j < pssm->ncols; j++) {
704  res_freqs.push_back(diagnostics->residue_freqs[j][i]);
705  }
706  }
707  }
708  }
709 
710  if (diagnostics->weighted_residue_freqs) {
712  asn1_pssm.SetIntermediateData().SetWeightedResFreqsPerPos();
713  if (asn1_pssm.GetByRow() == false) {
714  for (unsigned int i = 0; i < pssm->ncols; i++) {
715  for (unsigned int j = 0; j < pssm->nrows; j++) {
716  wres_freqs.
717  push_back(diagnostics->weighted_residue_freqs[i][j]);
718  }
719  }
720  } else {
721  for (unsigned int i = 0; i < pssm->nrows; i++) {
722  for (unsigned int j = 0; j < pssm->ncols; j++) {
723  wres_freqs.
724  push_back(diagnostics->weighted_residue_freqs[j][i]);
725  }
726  }
727  }
728  }
729 
730  if (diagnostics->frequency_ratios) {
731  CPssmIntermediateData::TFreqRatios& freq_ratios =
732  asn1_pssm.SetIntermediateData().SetFreqRatios();
733  if (asn1_pssm.GetByRow() == false) {
734  for (unsigned int i = 0; i < pssm->ncols; i++) {
735  for (unsigned int j = 0; j < pssm->nrows; j++) {
736  freq_ratios.push_back(diagnostics->frequency_ratios[i][j]);
737  }
738  }
739  } else {
740  for (unsigned int i = 0; i < pssm->nrows; i++) {
741  for (unsigned int j = 0; j < pssm->ncols; j++) {
742  freq_ratios.push_back(diagnostics->frequency_ratios[j][i]);
743  }
744  }
745  }
746  }
747 
748  if (diagnostics->gapless_column_weights) {
750  asn1_pssm.SetIntermediateData().SetGaplessColumnWeights();
751  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
752  gcw.push_back(diagnostics->gapless_column_weights[i]);
753  }
754  }
755 
756  if (diagnostics->sigma) {
758  asn1_pssm.SetIntermediateData().SetSigma();
759  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
760  sigma.push_back(diagnostics->sigma[i]);
761  }
762  }
763 
764  if (diagnostics->interval_sizes) {
765  CPssmIntermediateData::TIntervalSizes& interval_sizes =
766  asn1_pssm.SetIntermediateData().SetIntervalSizes();
767  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
768  interval_sizes.push_back(diagnostics->interval_sizes[i]);
769  }
770  }
771 
772  if (diagnostics->num_matching_seqs) {
773  CPssmIntermediateData::TNumMatchingSeqs& num_matching_seqs =
774  asn1_pssm.SetIntermediateData().SetNumMatchingSeqs();
775  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
776  num_matching_seqs.push_back(diagnostics->num_matching_seqs[i]);
777  }
778  }
779 
780  if (diagnostics->independent_observations) {
781  CPssmIntermediateData::TNumIndeptObsr& num_indept_obsr =
782  asn1_pssm.SetIntermediateData().SetNumIndeptObsr();
783  for (Uint4 i = 0; i < diagnostics->query_length; i++) {
784  num_indept_obsr.push_back(diagnostics->independent_observations[i]);
785  }
786  }
787 
788  return retval;
789 }
790 
791 END_SCOPE(blast)
793 
794 /* @} */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Blast_Message * Blast_MessageFree(Blast_Message *blast_msg)
Deallocates message memory.
Definition: blast_message.c:80
The structures and functions in blast_options.
Int2 PSIBlastOptionsValidate(const PSIBlastOptions *psi_options, Blast_Message **blast_msg)
Validates the PSI BLAST options so that they have sane values.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
Int2 BlastScoringOptionsSetMatrix(BlastScoringOptions *opts, const char *matrix_name)
Resets matrix name option.
const double kPSSM_NoImpalaScaling
Value used to indicate that no IMPALA-style scaling should be performed when scaling a PSSM.
Definition: blast_options.c:43
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
int PSICreatePssmFromCDD(const PSICdMsa *cd_msa, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine for computing CDD-based PSSMs.
Definition: blast_psi.c:229
int PSICreatePssmFromFrequencyRatios(const Uint1 *query, Uint4 query_length, BlastScoreBlk *sbp, double **freq_ratios, double impala_scaling_factor, PSIMatrix **pssm)
Top-level function to create a PSSM given a matrix of frequency ratios and perform scaling on the res...
Definition: blast_psi.c:344
int PSICreatePssmWithDiagnostics(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine which allows to request diagnostics information.
Definition: blast_psi.c:105
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_ENDINGGAP
Found flanking gap at end of alignment.
#define PSIERR_COLUMNOFGAPS
Found an entire column full of GAP residues.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSIERR_BADPROFILE
Errors in conserved domain profile.
#define PSIERR_POSITIVEAVGSCORE
Positive average score found when scaling matrix.
#define PSIERR_NOALIGNEDSEQS
After purge stage of PSSM creation, no sequences are left.
#define PSIERR_NOFREQRATIOS
No frequency ratios were found for the given scoring matrix.
#define PSIERR_STARTINGGAP
Found flanking gap at start of alignment.
#define PSIERR_BADSEQWEIGHTS
Sequence weights do not add to 1.
#define PSI_SUCCESS
Successful operation.
#define PSIERR_UNALIGNEDCOLUMN
Found an entire column with no participating sequences.
#define PSIERR_GAPINQUERY
GAP residue found in query sequence.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
Utilities initialize/setup BLAST.
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Definition: blast_util.c:147
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Definition: blast_util.c:133
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:309
Defines BLAST error codes (user errors included)
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Wrapper class for BlastScoringOptions .
Definition: blast_aux.hpp:334
Defines system exceptions occurred while running BLAST.
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
size_t GetCols() const
get the number of columns in this matrix
Definition: matrix.hpp:305
Wrapper class for PSIDiagnosticsResponse .
Definition: blast_aux.hpp:348
Wrapper class for PSIMatrix .
Definition: blast_aux.hpp:346
Exception class for the CPssmEngine class.
Definition: pssm_engine.hpp:63
CSeq_entry & SetQuery()
Retrieve the query sequence.
Definition: Pssm.hpp:55
void SetHUngapped(double val)
Definition: Pssm.cpp:188
void SetH(double val)
Definition: Pssm.cpp:170
void SetLambdaUngapped(double val)
Definition: Pssm.cpp:176
void SetKappa(double val)
Definition: Pssm.cpp:164
void SetKappaUngapped(double val)
Definition: Pssm.cpp:182
void SetLambda(double val)
Definition: Pssm.cpp:158
Interface for strategy to pre-process multiple alignment of conserved domains matches as input data f...
static tds_mutex mtx
Definition: condition.c:43
virtual void Process()=0
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
static void s_CheckAgainstNullData(IPssmInputData *pssm_input_msa)
This function makes sure that none of the required data is returned as NULL or "empty".
Definition: pssm_engine.cpp:69
CRef< objects::CPssmWithParameters > x_CreatePssmFromMsa()
Using IPssmInputData as a delegate to provide input data in the form of a multiple sequence alignment...
CRef< objects::CPssmWithParameters > x_CreatePssmFromFreqRatios()
Using IPssmInputFreqRatios as a delegate to provide the input PSSM's frequency ratios,...
CBlastScoreBlk m_ScoreBlk
Blast score block structure.
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
const Blast_KarlinBlk * GetPsiGappedKarlinBlk() const
Retrieve PSI-BLAST gapped Karlin parameters.
const char * x_GetMatrixName() const
Private interface to retrieve matrix name from its data source interface.
virtual void Process(void)=0
Pre-process CDs used for PSSM computation.
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Copies query sequence and adds protein sentinel bytes at the beginning and at the end of the sequence...
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
Definition: pssm_input.hpp:68
CRef< objects::CPssmWithParameters > x_CreatePssmFromCDD()
Using IPssmInputCdd as a delegate to provide data in the form of multiple alignment of CDs,...
static CRef< objects::CPssmWithParameters > x_PSIMatrix2Asn1(const PSIMatrix *pssm, const char *matrix_name, const PSIBlastOptions *opts=NULL, const PSIDiagnosticsResponse *diagnostics=NULL)
Converts the PSIMatrix structure into a ASN.1 CPssmWithParameters object.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
CPssmEngine()
Default constructor available for derived test classes.
size_t m_NumCols
number of columns in the matrix (for deallocation)
virtual unsigned char * GetQuery()=0
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest(void)
Get diagnostics options.
void x_InitializeScoreBlock(const unsigned char *query, unsigned int query_length, const char *matrix_name, int gap_existence, int gap_extension)
Initializes the BlastScoreBlk data member required to run the PSSM engine.
BlastScoreBlk * Get() const
Definition: blast_aux.hpp:333
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine Its results will be populated in t...
Definition: pssm_input.hpp:123
virtual double GetImpalaScaleFactor()
Definition: pssm_input.hpp:144
int x_GetGapExtension() const
Private interface to retrieve gap extension cost from data source.
virtual int GetGapExistence()
Obtain the gap existence value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:73
virtual unsigned int GetQueryLength()=0
Get the query's length.
int x_GetGapExistence() const
Private interface to retrieve gap existence cost from data source.
IPssmInputFreqRatios * m_PssmInputFreqRatios
Pointer to input data to create PSSM from frequency ratios.
const Blast_KarlinBlk * GetPsiUngappedKarlinBlk() const
Retrieve PSI-BLAST ungapped Karlin parameters.
IPssmInputData * m_PssmInput
Handle to strategy to process raw PSSM input data.
virtual int GetGapExtension()
Obtain the gap extension value for the underlying matrix used to build the PSSM.
Definition: pssm_input.hpp:78
virtual const PSIBlastOptions * GetOptions()=0
Obtain the options for the PSSM engine.
virtual const PSIBlastOptions * GetOptions(void)=0
Get CDD-related PSI-BLAST options.
SNcbiMatrix2DoubleMatrix(const CNcbiMatrix< double > &m)
Constructor.
static void s_Validate(IPssmInputData *pssm_input_msa)
Performs validation on data provided before invoking the CORE PSSM engine.
IPssmInputCdd * m_PssmInputCdd
Pointer to strategy to process raw PSSM input data. Note: only one of the m_PssmInput* members should be non-NULL.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
virtual const CNcbiMatrix< double > & GetData()=0
Obtain a matrix of frequency ratios with this->GetQueryLength() columns and BLASTAA_SIZE rows.
virtual PSICdMsa * GetData(void)=0
Get CD data for PSSM computation.
unsigned int x_GetQueryLength() const
Private interface to retrieve query length from its data source interface.
void Reset(BlastScoreBlk *p=NULL)
Definition: blast_aux.hpp:333
~CPssmEngine()
Destructor.
void SetUngappedStatisticalParams(CConstRef< CBlastAncillaryData > ancillary_data)
Sets the Karlin & Altschul parameters in the BlastScoreBlk to be used in PSSM generation.
virtual void Process()=0
Algorithm to produce the PSSM's frequency ratios should be implemented in this method.
virtual PSIMsa * GetData()=0
Obtain the multiple sequence alignment structure.
static std::string x_ErrorCodeToString(int error_code)
Convert a PSSM return status into a string.
virtual CRef< objects::CBioseq > GetQueryForPssm()
Get a CBioseq object for attachment into the CPssmWithParameters that CPssmEngine produces (only atta...
Definition: pssm_input.hpp:88
unsigned char * x_GetQuery() const
Private interface to retrieve query sequence from its data source interface.
double ** m_Data
double** representation of a CNcbiMatrix
~SNcbiMatrix2DoubleMatrix()
Destructor.
Uint1 GetSentinelByte(EBlastEncoding encoding) THROWS((CBlastException))
Convenience function to centralize the knowledge of which sentinel bytes we use for supported encodin...
BlastQueryInfo * x_InitializeQueryInfo(unsigned int query_length)
Initializes the core BlastQueryInfo structure for a single protein sequence.
@ eBlastEncodingProtein
NCBIstdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * release(void)
Release will release ownership of pointer to caller.
Definition: ncbimisc.hpp:472
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
void SetParams(TParams &value)
Assign a value to Params data member.
void SetIsProtein(TIsProtein value)
Assign a value to IsProtein data member.
Definition: Pssm_.hpp:551
void SetByRow(TByRow value)
Assign a value to ByRow data member.
Definition: Pssm_.hpp:741
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
void SetIntermediateData(TIntermediateData &value)
Assign a value to IntermediateData data member.
Definition: Pssm_.cpp:99
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
void SetNumColumns(TNumColumns value)
Assign a value to NumColumns data member.
Definition: Pssm_.hpp:666
void SetNumRows(TNumRows value)
Assign a value to NumRows data member.
Definition: Pssm_.hpp:619
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
static const int kScaleFactor
Definition: hyperclust.cpp:176
static int input()
int i
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
C++ API for the PSI-BLAST PSSM engine.
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
The query related information.
BlastContextInfo * contexts
Information per context.
Uint4 max_length
Length of the longest among the concatenated queries.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:213
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
Int4 gap_open
Extra penalty for starting a gap.
Int4 gap_extend
Penalty for each gap residue.
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
Structure to hold a message from the core of the BLAST engine.
Definition: blast_message.h:70
char * message
User message to be saved.
Definition: blast_message.h:73
Abstract base class to encapsulate the source(s) and pre-processing of PSSM input data as well as opt...
Definition: pssm_input.hpp:106
Interface used to retrieve the PSSM frequency ratios to allow for "restart" processing in PSI-BLAST: ...
Definition: pssm_input.hpp:131
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double impala_scaling_factor
Scaling factor as used in IMPALA to do the matrix rescaling.
Int4 pseudo_count
Pseudocount constant.
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definition: blast_psi.h:201
double * information_content
position information content (query_length elements)
Definition: blast_psi.h:202
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:204
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Definition: blast_psi.h:208
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Definition: blast_psi.h:218
Uint4 alphabet_size
Specifies length of alphabet.
Definition: blast_psi.h:225
Uint4 query_length
Specifies the number of positions in the PSSM.
Definition: blast_psi.h:223
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
Definition: blast_psi.h:215
double * independent_observations
Effective number of observations per column.
Definition: blast_psi.h:227
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
Definition: blast_psi.h:220
double * sigma
sigma (query_length elements)
Definition: blast_psi.h:217
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:212
This is the main return value from the PSSM engine.
Definition: blast_psi.h:150
double ung_lambda
Ungapped Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:157
double kappa
Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:155
int ** pssm
Position-specific score matrix.
Definition: blast_psi.h:153
double ung_kappa
Ungapped Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:158
Uint4 ncols
Number of columns in PSSM (query_length)
Definition: blast_psi.h:151
double ung_h
Ungapped H Karlin-Altschul parameter.
Definition: blast_psi.h:159
double lambda
Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:154
Uint4 nrows
Number of rows in PSSM (alphabet_size)
Definition: blast_psi.h:152
double h
H Karlin-Altschul parameter.
Definition: blast_psi.h:156
Auxiliary class to convert from a CNcbiMatrix into a double** as required by the C API.
static string query
#define _ASSERT
voidp malloc(uInt size)
Modified on Wed Apr 17 13:09:07 2024 by modify_doxy.py rev. 669887