NCBI C++ ToolKit
psiblast_aux_priv.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /// @file psiblast_aux_priv.cpp
31 /// Definitions of auxiliary functions/classes for PSI-BLAST
32 
33 #include <ncbi_pch.hpp>
34 #include "psiblast_aux_priv.hpp"
38 
44 #include "blast_aux_priv.hpp"
45 #include "../core/blast_psi_priv.h"
46 
47 // Utility headers
48 #include <util/format_guess.hpp>
49 #include <util/math/matrix.hpp>
50 
51 // Object includes
65 #include <sstream>
66 
67 /** @addtogroup AlgoBlast
68  *
69  * @{
70  */
71 
74 BEGIN_SCOPE(blast)
75 
78  TSearchMessages& messages,
79  CConstRef<CBlastOptions> options)
80 {
 // Fills score_blk's PSI Karlin-Altschul blocks (kbp_psi / kbp_gap_psi) and
 // its psi_matrix from the ASN.1 PSSM, and appends notes to messages.
 // Throws CBlastException(eInvalidArgument) when score_blk is not configured
 // for a protein alphabet, or when the PSSM yields neither scores nor
 // frequency ratios.
81  _ASSERT(score_blk);
82  _ASSERT(pssm.NotEmpty());
83 
84  if ( !score_blk->protein_alphabet ) {
85  NCBI_THROW(CBlastException, eInvalidArgument,
86  "BlastScoreBlk is not configured for a protein alphabet");
87  }
88 
89  // Assign the ungapped Karlin-Altschul block
 // Each statistic comes from the PSSM when it is not CPssm::kInvalidStat;
 // otherwise the standard block's value is copied if it is positive.
90  if (pssm->GetPssm().GetLambdaUngapped() != CPssm::kInvalidStat) {
91  score_blk->kbp_psi[0]->Lambda = pssm->GetPssm().GetLambdaUngapped();
92  } else if (score_blk->kbp_std[0]->Lambda > 0.0) {
93  score_blk->kbp_psi[0]->Lambda = score_blk->kbp_std[0]->Lambda;
94  }
95 
96  if (pssm->GetPssm().GetKappaUngapped() != CPssm::kInvalidStat) {
97  score_blk->kbp_psi[0]->K = pssm->GetPssm().GetKappaUngapped();
98  } else if (score_blk->kbp_std[0]->K > 0.0) {
99  score_blk->kbp_psi[0]->K = score_blk->kbp_std[0]->K;
100  }
 // log() is applied to whatever K now holds, even when neither branch above
 // assigned it.
101  score_blk->kbp_psi[0]->logK = log(score_blk->kbp_psi[0]->K);
102 
103  if (pssm->GetPssm().GetHUngapped() != CPssm::kInvalidStat) {
104  score_blk->kbp_psi[0]->H = pssm->GetPssm().GetHUngapped();
 // NOTE(review): this guard tests kbp_std[0]->K although it is H that gets
 // copied; the gapped counterpart below tests H > 0.0. Looks like a
 // copy/paste slip -- confirm whether H was intended here.
105  } else if (score_blk->kbp_std[0]->K > 0.0) {
106  score_blk->kbp_psi[0]->H = score_blk->kbp_std[0]->H;
107  }
108 
109  // Assign the gapped Karlin-Altschul block
110  if (pssm->GetPssm().GetLambda() != CPssm::kInvalidStat) {
111  score_blk->kbp_gap_psi[0]->Lambda = pssm->GetPssm().GetLambda();
112  } else if (score_blk->kbp_gap_std[0]->Lambda > 0.0) {
113  score_blk->kbp_gap_psi[0]->Lambda = score_blk->kbp_gap_std[0]->Lambda;
114  }
115 
116  if (pssm->GetPssm().GetKappa() != CPssm::kInvalidStat) {
117  score_blk->kbp_gap_psi[0]->K = pssm->GetPssm().GetKappa();
118  } else if (score_blk->kbp_gap_std[0]->K > 0.0) {
119  score_blk->kbp_gap_psi[0]->K = score_blk->kbp_gap_std[0]->K;
120  }
121  score_blk->kbp_gap_psi[0]->logK = log(score_blk->kbp_gap_psi[0]->K);
122 
123  if (pssm->GetPssm().GetH() != CPssm::kInvalidStat) {
124  score_blk->kbp_gap_psi[0]->H = pssm->GetPssm().GetH();
125  } else if (score_blk->kbp_gap_std[0]->H > 0.0) {
126  score_blk->kbp_gap_psi[0]->H = score_blk->kbp_gap_std[0]->H;
127  }
128 
129  // Assign the matrix scores/frequency ratios
130  const size_t kQueryLength = pssm->GetPssm().GetNumColumns();
131  score_blk->psi_matrix = SPsiBlastScoreMatrixNew(kQueryLength);
132 
133  // Get the scores
134  bool missing_scores = false;
135  try {
136  unique_ptr< CNcbiMatrix<int> > scores
138  _ASSERT(score_blk->psi_matrix->pssm->ncols == scores->GetCols());
139  _ASSERT(score_blk->psi_matrix->pssm->nrows == scores->GetRows());
140 
 // psi_matrix->pssm->data is indexed [column][row] while the CNcbiMatrix
 // is addressed (row, column), so this copy transposes.
141  for (TSeqPos c = 0; c < scores->GetCols(); c++) {
142  for (TSeqPos r = 0; r < scores->GetRows(); r++) {
143  score_blk->psi_matrix->pssm->data[c][r] = (*scores)(r, c);
144  }
145  }
146  } catch (const std::runtime_error&) {
 // A std::runtime_error is treated as "the PSSM carries no scores"; it is
 // only fatal if the frequency ratios turn out to be missing as well.
147  missing_scores = true;
148  }
149 
150  // Get the frequency ratios
151  bool missing_freq_ratios = false;
152  // are all of the frequency ratios zeros? if so, issue a warning
153  bool freq_ratios_all_zeros = true;
154 
155  try {
156  unique_ptr< CNcbiMatrix<double> > freq_ratios
158  _ASSERT(score_blk->psi_matrix->pssm->ncols ==
159  freq_ratios->GetCols());
160  _ASSERT(score_blk->psi_matrix->pssm->nrows ==
161  freq_ratios->GetRows());
162 
 // Same transposing copy as for the scores; any value above kEpsilon
 // clears the all-zeros flag.
163  for (TSeqPos c = 0; c < freq_ratios->GetCols(); c++) {
164  for (TSeqPos r = 0; r < freq_ratios->GetRows(); r++) {
165  score_blk->psi_matrix->freq_ratios[c][r] =
166  (*freq_ratios)(r, c);
167  if ((*freq_ratios)(r,c) > kEpsilon) {
168  freq_ratios_all_zeros = false;
169  }
170  }
171  }
172  } catch (const std::runtime_error&) {
173  missing_freq_ratios = true;
174  }
175 
176  if (missing_scores && missing_freq_ratios) {
177  NCBI_THROW(CBlastException, eInvalidArgument,
178  "Missing scores and frequency ratios in PSSM");
179  }
180 
181  _ASSERT(options->GetCompositionBasedStats() < eNumCompoAdjustModes);
182  // the message below is meaningless for deltablast
 // freq_ratios_all_zeros also stays true when the frequency ratios were
 // missing altogether, since the copy loop above never ran.
183  if (options->GetProgram() != eDeltaBlast &&
184  (options->GetCompositionBasedStats() != eNoCompositionBasedStats) &&
185  freq_ratios_all_zeros) {
186  ostringstream os;
187  os << "Frequency ratios for PSSM are all zeros, frequency ratios for ";
188  os << options->GetMatrixName() << " will be used during traceback ";
189  os << "in composition based statistics";
191  os.str()));
192  _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
193  messages.front().push_back(sm);
194  }
195 
196  if (options->GetCompositionBasedStats() > eCompositionBasedStats) {
197  // ugly, but necessary
 // Casts away constness: the caller's CBlastOptions object is modified
 // even though it was handed over through a CConstRef.
198  const_cast<CBlastOptions*>(&*options)
199  ->SetCompositionBasedStats(eCompositionBasedStats);
200  ostringstream os;
201  os << "Composition-based score adjustment conditioned on "
202  << "sequence properties and unconditional composition-based score "
203  << "adjustment is not supported with PSSMs, resetting to default "
204  << "value of standard composition-based statistics";
206  os.str()));
207  _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
208  messages.front().push_back(sm);
209  }
210 }
211 
212 /// Convert a list of values into a CNcbiMatrix
213 /// @param source source of data [in]
214 /// @param dest destination of data [out]
215 /// @param by_row is the matrix data stored by row? [in]
216 /// @param num_rows number of rows [in]
217 /// @param num_cols number of columns [in]
218 template <class T>
219 void Convert2Matrix(const list<T>& source, CNcbiMatrix<T>& dest,
220  bool by_row, SIZE_TYPE num_rows, SIZE_TYPE num_columns)
221 {
222  typename list<T>::const_iterator itr = source.begin();
223  if (by_row == true) {
224  for (SIZE_TYPE r = 0; r < num_rows; r++) {
225  for (SIZE_TYPE c = 0; c < num_columns; c++) {
226  dest(r, c) = *itr++;
227  }
228  }
229  } else {
230  for (SIZE_TYPE c = 0; c < num_columns; c++) {
231  for (SIZE_TYPE r = 0; r < num_rows; r++) {
232  dest(r, c) = *itr++;
233  }
234  }
235  }
236  _ASSERT(itr == source.end());
237 }
238 
240 CScorematPssmConverter::GetScores(const objects::CPssmWithParameters& pssm_asn)
241 {
 // Extracts the final-data scores of the PSSM into a newly allocated
 // integer matrix whose cells default to BLAST_SCORE_MIN.
 // Throws std::runtime_error when the PSSM has no final data, no scores, or
 // an empty score list. The caller owns the returned matrix.
242  if ( !pssm_asn.GetPssm().CanGetFinalData() ||
243  !pssm_asn.GetPssm().GetFinalData().CanGetScores() ||
244  pssm_asn.GetPssm().GetFinalData().GetScores().empty() ) {
245  throw runtime_error("Cannot obtain scores from ASN.1 PSSM");
246  }
247 
248  const CPssm& pssm = pssm_asn.GetPssm();
 // The flat score list must cover every (row, column) cell of the PSSM
249  _ASSERT((size_t)pssm.GetFinalData().GetScores().size() ==
250  (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
251 
252  unique_ptr< CNcbiMatrix<int> > retval
254  pssm.GetNumColumns(),
255  BLAST_SCORE_MIN));
256 
258  *retval, pssm.GetByRow(), pssm.GetNumRows(),
259  pssm.GetNumColumns());
 // release() hands ownership of the matrix to the caller
260  return retval.release();
261 }
262 
264 CScorematPssmConverter::GetFreqRatios(const objects::CPssmWithParameters&
265  pssm_asn)
266 {
 // Extracts the intermediate-data frequency ratios of the PSSM into a newly
 // allocated BLASTAA_SIZE x NumColumns matrix of doubles, zero-filled by
 // default. Throws std::runtime_error when the PSSM has no intermediate
 // data, no frequency ratios, or an empty list. The caller owns the result.
267  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
268  !pssm_asn.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
269  pssm_asn.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
270  throw runtime_error("Cannot obtain frequency ratios from ASN.1 PSSM");
271  }
272 
273  const CPssm& pssm = pssm_asn.GetPssm();
 // The flat list must cover every (row, column) cell of the PSSM
274  _ASSERT((size_t)pssm.GetIntermediateData().GetFreqRatios().size() ==
275  (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
276 
 // The matrix always has BLASTAA_SIZE rows, while only pssm.GetNumRows()
 // rows are passed on below; rows beyond that keep the 0.0 default.
277  unique_ptr< CNcbiMatrix<double> > retval
278  (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
279 
281  *retval, pssm.GetByRow(), pssm.GetNumRows(),
282  pssm.GetNumColumns());
 // release() hands ownership of the matrix to the caller
283  return retval.release();
284 }
285 
288  (const objects::CPssmWithParameters& pssm_asn)
289 {
 // Extracts the per-position residue frequencies of the PSSM into a newly
 // allocated BLASTAA_SIZE x NumColumns integer matrix, zero-filled by
 // default. Unlike the score/frequency-ratio getters this returns NULL
 // (rather than throwing) when the data is absent or empty.
 // The caller owns a non-NULL result.
290  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
291  !pssm_asn.GetPssm().GetIntermediateData().CanGetResFreqsPerPos() ||
292  pssm_asn.GetPssm().GetIntermediateData().GetResFreqsPerPos().empty() )
293  {
294  return NULL;
295  }
296 
297  const CPssm& pssm = pssm_asn.GetPssm();
 // The flat list must cover every (row, column) cell of the PSSM
298  _ASSERT((size_t)pssm.GetIntermediateData().GetResFreqsPerPos().size() ==
299  (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
300 
301  unique_ptr< CNcbiMatrix<int> > retval
302  (new CNcbiMatrix<int>(BLASTAA_SIZE, pssm.GetNumColumns(), 0));
303 
305  *retval, pssm.GetByRow(), pssm.GetNumRows(),
306  pssm.GetNumColumns());
 // release() hands ownership of the matrix to the caller
307  return retval.release();
308 }
309 
312  (const objects::CPssmWithParameters& pssm_asn)
313 {
 // Extracts the weighted per-position residue frequencies of the PSSM into
 // a newly allocated BLASTAA_SIZE x NumColumns matrix of doubles,
 // zero-filled by default. Returns NULL when the data is absent or empty.
 // The caller owns a non-NULL result.
314  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
315  !pssm_asn.GetPssm().GetIntermediateData().
316  CanGetWeightedResFreqsPerPos() ||
317  pssm_asn.GetPssm().GetIntermediateData().
318  GetWeightedResFreqsPerPos().empty() ) {
319  return NULL;
320  }
321 
322  const CPssm& pssm = pssm_asn.GetPssm();
 // The flat list must cover every (row, column) cell of the PSSM
323  _ASSERT((size_t)pssm.GetIntermediateData().
324  GetWeightedResFreqsPerPos().size() ==
325  (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
326 
327  unique_ptr< CNcbiMatrix<double> > retval
328  (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
329 
331  *retval, pssm.GetByRow(), pssm.GetNumRows(),
332  pssm.GetNumColumns());
 // release() hands ownership of the matrix to the caller
333  return retval.release();
334 }
335 
336 void
338  (const objects::CPssmWithParameters& pssm_asn,
339  vector<double>& retval)
340 {
 // Replaces the contents of retval with the PSSM's information-content
 // values. retval is always emptied first and stays empty when the PSSM has
 // no intermediate data or no (or an empty) information-content list.
341  retval.clear();
342  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
343  !pssm_asn.GetPssm().GetIntermediateData().CanGetInformationContent() ||
344  pssm_asn.GetPssm().
345  GetIntermediateData().GetInformationContent().empty() ) {
346  return;
347  }
348  const CPssm& pssm = pssm_asn.GetPssm();
351  back_inserter(retval));
352 }
353 
354 void
356  (const objects::CPssmWithParameters& pssm_asn,
357  vector<double>& retval)
358 {
 // Replaces the contents of retval with the PSSM's gapless column weights.
 // retval is always emptied first and stays empty when the PSSM has no
 // intermediate data or no (or an empty) gapless-column-weights list.
359  retval.clear();
360  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
361  !pssm_asn.GetPssm().
362  GetIntermediateData().CanGetGaplessColumnWeights() ||
363  pssm_asn.GetPssm().
364  GetIntermediateData().GetGaplessColumnWeights().empty() ) {
365  return;
366  }
367  const CPssm& pssm = pssm_asn.GetPssm();
370  back_inserter(retval));
371 }
372 
373 void
374 CScorematPssmConverter::GetSigma(const objects::CPssmWithParameters& pssm_asn,
375  vector<double>& retval)
376 {
377  retval.clear();
378  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
379  !pssm_asn.GetPssm().GetIntermediateData().CanGetSigma() ||
380  pssm_asn.GetPssm().GetIntermediateData().GetSigma().empty() ) {
381  return;
382  }
383  const CPssm& pssm = pssm_asn.GetPssm();
384  copy(pssm.GetIntermediateData().GetSigma().begin(),
385  pssm.GetIntermediateData().GetSigma().end(),
386  back_inserter(retval));
387 }
388 
389 void
391  (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
392 {
 // Replaces the contents of retval with the PSSM's interval sizes.
 // retval is always emptied first and stays empty when the PSSM has no
 // intermediate data or no (or an empty) interval-sizes list.
393  retval.clear();
394  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
395  !pssm_asn.GetPssm().
396  GetIntermediateData().CanGetIntervalSizes() ||
397  pssm_asn.GetPssm().
398  GetIntermediateData().GetIntervalSizes().empty() ) {
399  return;
400  }
401  const CPssm& pssm = pssm_asn.GetPssm();
 // Appends the stored values to the (now empty) vector in their stored order
402  copy(pssm.GetIntermediateData().GetIntervalSizes().begin(),
403  pssm.GetIntermediateData().GetIntervalSizes().end(),
404  back_inserter(retval));
405 }
406 
407 void
409  (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
410 {
 // Replaces the contents of retval with the PSSM's number-of-matching-
 // sequences values. retval is always emptied first and stays empty when
 // the PSSM has no intermediate data or no (or an empty) such list.
411  retval.clear();
412  if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
413  !pssm_asn.GetPssm().
414  GetIntermediateData().CanGetNumMatchingSeqs() ||
415  pssm_asn.GetPssm().
416  GetIntermediateData().GetNumMatchingSeqs().empty() ) {
417  return;
418  }
419  const CPssm& pssm = pssm_asn.GetPssm();
422  back_inserter(retval));
423 }
424 
425 void
426 PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters& pssm,
427  int gap_open,
428  int gap_extend)
429 {
430  _ASSERT(pssm.GetParams().GetRpsdbparams().IsSetMatrixName());
431  pssm.SetParams().SetRpsdbparams().SetGapOpen(gap_open);
432  pssm.SetParams().SetRpsdbparams().SetGapExtend(gap_extend);
433 }
434 
435 /** After creating the PSSM from frequency ratios, adjust the frequency ratios
436  * matrix to match the dimensions of the score matrix
437  * @param pssm matrix to adjust [in|out]
438  */
439 static void
441  pssm)
442 {
 // Pads the flat frequency-ratio list with zeros so that it describes
 // BLASTAA_SIZE rows instead of the current GetNumRows(), then records the
 // new row count. Only meant for PSSMs with fewer than BLASTAA_SIZE rows.
443  _ASSERT(pssm.GetPssm().GetNumRows() < BLASTAA_SIZE);
444  if (pssm.GetPssm().CanGetFinalData()) {
445  _ASSERT(pssm.GetPssm().GetFinalData().GetScores().size() ==
446  (size_t)BLASTAA_SIZE*pssm.GetPssm().GetNumColumns());
447  }
448 
 // Number of zero entries each column is short of
449  const size_t diff = (size_t)BLASTAA_SIZE - pssm.GetPssm().GetNumRows();
451  pssm.SetPssm().SetIntermediateData().SetFreqRatios();
452 
453  if (pssm.GetPssm().GetByRow() == true) {
 // Row-major data: the missing rows are the tail of the list, so growing
 // it with zeros is enough
454  freq_ratios.resize(pssm.GetPssm().GetNumColumns() * BLASTAA_SIZE, 0.0);
455  } else {
 // Column-major data: skip the existing entries of each column and
 // insert the zeros right after them. Assumes inserting does not
 // invalidate itr (true if TFreqRatios is a std::list -- TODO confirm),
 // so itr keeps pointing at the first entry of the next column.
456  CPssmIntermediateData::TFreqRatios::iterator itr = freq_ratios.begin();
457  for (int c = 0; c < pssm.GetPssm().GetNumColumns(); c++) {
458  advance(itr, pssm.GetPssm().GetNumRows());
459  freq_ratios.insert(itr, diff, 0.0);
460  }
461  }
462 
463  pssm.SetPssm().SetNumRows() = BLASTAA_SIZE;
464 }
465 
467  const CBlastOptions& opts)
468 {
 // Runs the PSSM engine on the query and frequency ratios held in pssm and
 // copies the resulting scores and Lambda/Kappa/H statistics back into
 // pssm's final data.
469  CConstRef<CBioseq> query(&pssm->GetQuery().GetSeq());
470  CRef<IQueryFactory> seq_fetcher(new CObjMgrFree_QueryFactory(query)); /* NCBI_FAKE_WARNING */
471 
472  CRef<ILocalQueryData> query_data(seq_fetcher->MakeLocalQueryData(&opts));
473  BLAST_SequenceBlk* seqblk = query_data->GetSequenceBlk();
 // The query length must agree with both the sequence block and the number
 // of PSSM columns
474  _ASSERT(query_data->GetSeqLength(0) == (size_t)seqblk->length);
475  _ASSERT(query_data->GetSeqLength(0) ==
476  (size_t)pssm->GetPssm().GetNumColumns());
477  unique_ptr< CNcbiMatrix<double> > freq_ratios
479 
480  CPsiBlastInputFreqRatios pssm_engine_input(seqblk->sequence,
481  seqblk->length,
482  *freq_ratios,
483  opts.GetMatrixName());
484  CPssmEngine pssm_engine(&pssm_engine_input);
485  CRef<CPssmWithParameters> pssm_with_scores(pssm_engine.Run());
486 
 // When the row counts differ, the engine's output is expected to have
 // BLASTAA_SIZE rows
487  if (pssm->GetPssm().GetNumRows() !=
488  pssm_with_scores->GetPssm().GetNumRows()) {
489  _ASSERT(pssm_with_scores->GetPssm().GetNumRows() == BLASTAA_SIZE);
491  }
 // Only the final data is taken over from the engine's result
492  pssm->SetPssm().SetFinalData().SetScores() =
493  pssm_with_scores->GetPssm().GetFinalData().GetScores();
494  pssm->SetPssm().SetFinalData().SetLambda() =
495  pssm_with_scores->GetPssm().GetFinalData().GetLambda();
496  pssm->SetPssm().SetFinalData().SetKappa() =
497  pssm_with_scores->GetPssm().GetFinalData().GetKappa();
498  pssm->SetPssm().SetFinalData().SetH() =
499  pssm_with_scores->GetPssm().GetFinalData().GetH();
500 
502  opts.GetGapOpeningCost(),
503  opts.GetGapExtensionCost());
504 }
505 
506 /// Returns the evalue from this score object
507 /// @param score ASN.1 score object [in]
508 static double s_GetEvalue(const CScore& score)
509 {
 // The id string is read before the value type is checked, so GetStr() is
 // evaluated for every score passed in.
510  string score_type = score.GetId().GetStr();
 // Only real-valued scores labelled "e_value" or "sum_e" count as e-values
511  if (score.GetValue().IsReal() &&
512  (score_type == "e_value" || score_type == "sum_e")) {
513  return score.GetValue().GetReal();
514  }
516 }
517 
518 /// Returns the bit_score from this score object
519 /// @param score ASN.1 score object [in]
520 static double s_GetBitScore(const CScore& score)
521 {
522  string score_type = score.GetId().GetStr();
523  if (score.GetValue().IsReal() && score_type == "bit_score") {
524  return score.GetValue().GetReal();
525  }
526  return BLAST_EXPECT_VALUE;
527 }
528 
529 double GetLowestEvalue(const objects::CDense_seg::TScores& scores,
530  double* bit_score /* = NULL */)
531 {
532  double retval = BLAST_EXPECT_VALUE;
533  double tmp;
534  if (bit_score) {
535  *bit_score = retval;
536  }
537 
538  ITERATE(CDense_seg::TScores, i, scores) {
539  if ( (tmp = s_GetEvalue(**i)) < retval) {
540  retval = tmp;
541  }
542  if (bit_score && ((tmp = s_GetBitScore(**i)) > *bit_score)) {
543  *bit_score = tmp;
544  }
545  }
546  return retval;
547 }
548 
549 void
550 CPsiBlastAlignmentProcessor::operator()
551  (const objects::CSeq_align_set& alignments,
552  double evalue_inclusion_threshold,
554 {
 // Rebuilds output from scratch: it ends up holding the handle of the
 // second sequence id (GetSeq_id(1)) of every alignment whose lowest
 // e-value is strictly below evalue_inclusion_threshold.
555  output.clear();
556 
557  ITERATE(CSeq_align_set::Tdata, hsp, alignments.Get()) {
558  // Look for HSP with score less than inclusion_ethresh
559  double e = GetLowestEvalue((*hsp)->GetScore());
560  if (e < evalue_inclusion_threshold) {
561  CSeq_id_Handle sid =
562  CSeq_id_Handle::GetHandle((*hsp)->GetSeq_id(1));
563  output.insert(sid);
564  }
565  }
566 }
567 
568 void
569 CPsiBlastValidate::Pssm(const objects::CPssmWithParameters& pssm,
570  bool require_scores)
571 {
572  if ( !pssm.CanGetPssm() ) {
573  NCBI_THROW(CBlastException, eInvalidArgument,
574  "Missing PSSM data");
575  }
576 
577  bool missing_scores(false);
578  if ( !pssm.GetPssm().CanGetFinalData() ||
579  !pssm.GetPssm().GetFinalData().CanGetScores() ||
580  pssm.GetPssm().GetFinalData().GetScores().empty() ) {
581  missing_scores = true;
582  }
583 
584  bool missing_freq_ratios(false);
585  if ( !pssm.GetPssm().CanGetIntermediateData() ||
586  !pssm.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
587  pssm.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
588  missing_freq_ratios = true;
589  }
590 
591  if (missing_freq_ratios && missing_scores) {
592  NCBI_THROW(CBlastException, eInvalidArgument,
593  "PSSM data must contain either scores or frequency ratios");
594  }
595  if (missing_scores && require_scores) {
596  NCBI_THROW(CBlastException, eInvalidArgument,
597  "PSSM data must contain scores (did you run the PSSM engine?)");
598  }
599 
600  // Only unscaled PSSMs are supported
601  if (!missing_scores &&
602  pssm.GetPssm().GetFinalData().CanGetScalingFactor() &&
603  pssm.GetPssm().GetFinalData().GetScalingFactor() != 1) {
604  string msg("PSSM has a scaling factor of ");
605  msg += NStr::IntToString(pssm.GetPssm()
606  .GetFinalData()
607  .GetScalingFactor());
608  msg += ". PSI-BLAST does not accept scaled PSSMs";
609  NCBI_THROW(CBlastException, eInvalidArgument, msg);
610  }
611 
612  if ( !pssm.HasQuery() ) {
613  NCBI_THROW(CBlastException, eInvalidArgument,
614  "Missing query sequence in PSSM");
615  }
616  if ( !pssm.GetQuery().IsSeq() ) {
617  NCBI_THROW(CBlastException, eInvalidArgument,
618  "Query sequence in ASN.1 PSSM is not a single Bioseq");
619  }
620 
621  if ( !pssm.GetPssm().GetIsProtein() ) {
622  NCBI_THROW(CBlastException, eInvalidArgument,
623  "PSSM does not represent protein scoring matrix");
624  }
625 }
626 
627 void
629  const CBlastOptionsHandle& opts_handle,
630  EQueryFactoryType qf_type)
631 {
 // Checks that the sequences supplied by query_factory are usable by
 // PSI-BLAST: exactly one sequence when used as the query, and not
 // nucleotide data. Violations raise CBlastException(eInvalidArgument).
632  CRef<ILocalQueryData> query_data =
633  query_factory->MakeLocalQueryData(&opts_handle.GetOptions());
634 
635  // Compose the exception error message
636  string excpt_msg("PSI-BLAST only accepts ");
637  if (qf_type == eQFT_Query) {
638  excpt_msg += "one protein sequence as query";
639  } else if (qf_type == eQFT_Subject) {
640  excpt_msg += "protein sequences as subjects";
641  } else {
 // Any other EQueryFactoryType value terminates the process
642  abort();
643  }
644 
 // The single-sequence restriction applies to queries only, not subjects
645  if (qf_type == eQFT_Query) {
646  if (query_data->GetNumQueries() != 1) {
647  NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
648  }
649  }
650 
651  BLAST_SequenceBlk* sblk = NULL;
652  try { sblk = query_data->GetSequenceBlk(); }
653  catch (const CBlastException& e) {
 // An "Incompatible sequence codings" failure is re-reported with the
 // message composed above.
 // NOTE(review): any other CBlastException is swallowed here, leaving sblk
 // NULL; sblk is dereferenced below and is guarded only by _ASSERT(sblk).
 // Confirm whether the exception should be rethrown instead.
654  if (e.GetMsg().find("Incompatible sequence codings") != NPOS) {
655  NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
656  }
657  }
658  _ASSERT(sblk);
659  _ASSERT(sblk->length > 0);
660 
 // Let CFormatGuess classify the sequence data; only a positive nucleotide
 // verdict is rejected
661  CFormatGuess::ESequenceType sequence_type =
662  CFormatGuess::SequenceType((const char*)sblk->sequence_start,
663  static_cast<unsigned>(sblk->length));
664  if (sequence_type == CFormatGuess::eNucleotide) {
665  excpt_msg.assign("PSI-BLAST cannot accept nucleotide ");
666  excpt_msg += (qf_type == eQFT_Query ? "queries" : "subjects");
667  NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
668  }
669 }
670 
671 END_SCOPE(blast)
673 
674 /* @} */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Auxiliary functions for BLAST.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
Declares the BLAST exception class.
@ eBlastSevWarning
Definition: blast_message.h:57
The structures and functions in blast_options.
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
const double kEpsilon
Small constant to test against 0.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Definition: blast_stat.h:121
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Definition: blast_stat.c:805
@ eDeltaBlast
Delta Blast.
Definition: blast_types.hpp:71
Defines BLAST error codes (user errors included)
Handle to the options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
CConstRef –.
Definition: ncbiobj.hpp:1266
static ESequenceType SequenceType(const char *str, unsigned length=0, ESTStrictness strictness=eST_Default)
Guess sequence type.
NCBI C++ Object Manager free implementation of IQueryFactory.
Implements the interface to retrieve data for the last 2 stages of the PSSM creation.
Computes a PSSM as specified in PSI-BLAST.
Definition: Pssm.hpp:55
static const double kInvalidStat
Definition: Pssm.hpp:71
CRef –.
Definition: ncbiobj.hpp:618
Definition: Score.hpp:57
Error or Warning Message from search.
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
@ eNoCompositionBasedStats
Don't use composition based statistics.
@ eNumCompoAdjustModes
static SQLCHAR output[256]
Definition: print.c:5
static char tmp[3200]
Definition: utf8.c:42
static void QueryFactory(CRef< IQueryFactory > query_factory, const CBlastOptionsHandle &opts_handle, EQueryFactoryType query_factory_type=eQFT_Query)
Function to perform sanity checks on the query factory.
static CNcbiMatrix< int > * GetResidueFrequencies(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
void PsiBlastSetupScoreBlock(BlastScoreBlk *score_blk, CConstRef< objects::CPssmWithParameters > pssm, TSearchMessages &messages, CConstRef< CBlastOptions > options)
Setup CORE BLAST score block structure with data from the scoremat PSSM.
void Convert2Matrix(const list< T > &source, CNcbiMatrix< T > &dest, bool by_row, SIZE_TYPE num_rows, SIZE_TYPE num_columns)
Convert a list of values into a CNcbiMatrix.
static CNcbiMatrix< int > * GetScores(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static void GetSigma(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Data used in sequence weights computation.
static CNcbiMatrix< double > * GetWeightedResidueFrequencies(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
virtual BLAST_SequenceBlk * GetSequenceBlk()=0
Accessor for the BLAST_SequenceBlk structure.
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
Definition: query_data.cpp:52
EQueryFactoryType
Enumeration to specify the different uses of the query factory.
int GetGapExtensionCost() const
static double s_GetBitScore(const CScore &score)
Returns the bit_score from this score object.
static void s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters &pssm)
After creating the PSSM from frequency ratios, adjust the frequency ratios matrix to match the dimens...
static double s_GetEvalue(const CScore &score)
Returns the evalue from this score object.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
static void GetNumMatchingSeqs(const objects::CPssmWithParameters &pssm, vector< int > &retval)
Gets the number of matching sequences per position of the PSSM.
void PsiBlastComputePssmScores(CRef< objects::CPssmWithParameters > pssm, const CBlastOptions &opts)
Given a PSSM with frequency ratios and options, invoke the PSSM engine to compute the scores.
virtual size_t GetNumQueries()=0
Get the number of queries.
double GetLowestEvalue(const objects::CDense_seg::TScores &scores, double *bit_score)
Returns the lowest e-value (not the lowest score) among the scores in CDense_seg::TScores; when bit_score is non-NULL it also receives the largest bit score found.
void PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters &pssm, int gap_open, int gap_extend)
Even though the query sequence and the matrix gap costs are not a product of the PSSM engine,...
static void Pssm(const objects::CPssmWithParameters &pssm, bool require_scores=false)
Perform validation on the PSSM.
static void GetGaplessColumnWeights(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Returns the relative gapless PSSM column weights to pseudocounts for the provided PSSM.
virtual size_t GetSeqLength(size_t index)=0
Get the length of the sequence indicated by index.
int GetGapOpeningCost() const
static void GetInformationContent(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Returns the information content per position of the PSSM.
const char * GetMatrixName() const
static void GetIntervalSizes(const objects::CPssmWithParameters &pssm, vector< int > &retval)
Length of the aligned regions per position of the query sequence.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumMatchingSeqs & GetNumMatchingSeqs(void) const
Get the NumMatchingSeqs member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
const TGaplessColumnWeights & GetGaplessColumnWeights(void) const
Get the GaplessColumnWeights member data.
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
const TIntervalSizes & GetIntervalSizes(void) const
Get the IntervalSizes member data.
const TSigma & GetSigma(void) const
Get the Sigma member data.
const TInformationContent & GetInformationContent(void) const
Get the InformationContent member data.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
TByRow GetByRow(void) const
Get the ByRow member data.
Definition: Pssm_.hpp:735
const TResFreqsPerPos & GetResFreqsPerPos(void) const
Get the ResFreqsPerPos member data.
const TPssm & GetPssm(void) const
Get the Pssm member data.
TLambda GetLambda(void) const
Get the Lambda member data.
bool IsReal(void) const
Check if variant Real is selected.
Definition: Score_.hpp:378
const TValue & GetValue(void) const
Get the Value member data.
Definition: Score_.hpp:465
vector< CRef< CScore > > TScores
Definition: Dense_seg_.hpp:110
list< CRef< CSeq_align > > Tdata
TReal GetReal(void) const
Get the variant data.
Definition: Score_.hpp:384
const TId & GetId(void) const
Get the Id member data.
Definition: Score_.hpp:444
int i
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
Definition: pointer.h:1149
T max(T x_, T y_)
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Declarations of auxiliary functions/classes for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence_start
Start of sequence, usually one byte before sequence as that byte is a NULL sentinel byte.
Definition: blast_def.h:244
Int4 length
Length of sequence.
Definition: blast_def.h:246
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Structure used for scoring calculations.
Definition: blast_stat.h:177
static string query
#define _ASSERT
Modified on Thu Apr 25 08:18:53 2024 by modify_doxy.py rev. 669887