NCBI C++ ToolKit
pssmcreate_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: pssmcreate_unit_test.cpp 92005 2020-12-17 15:27:24Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file pssmcreate_unit_test.cpp
31  * Unit test module for creation of PSSMs from multiple sequence alignments.
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 
36 #include <corelib/ncbi_limits.hpp>
37 
38 // Serial library includes
39 #include <serial/serial.hpp>
40 #include <serial/objistr.hpp>
41 
42 #include <util/random_gen.hpp>
43 #include <util/math/matrix.hpp>
44 
45 // Object includes
52 
54 
55 // ASN.1 definition for PSSM (scoremat)
62 
63 // BLAST includes
70 #include <blast_objmgr_priv.hpp>
71 #include <blast_psi_priv.h>
72 #include <blast_posit.h>
73 #include "psiblast_aux_priv.hpp" // for CScorematPssmConverter
74 
77 
78 // Unit test auxiliary includes
79 #include "blast_test_util.hpp"
80 #include "pssm_test_util.hpp"
81 // #include "psiblast_test_util.hpp"
82 
83 // Object manager includes
84 #include <objmgr/util/sequence.hpp>
85 
86 // Standard scoring matrices
88 
89 // Seqport utilities
91 
92 #include "test_objmgr.hpp"
93 
94 using namespace std;
95 using namespace ncbi;
96 using namespace ncbi::objects;
97 using namespace ncbi::blast;
98 
99 
100 /******************************* copied from blast_psi_cxx.cpp **************/
101 
102 /// Mock object for the PSSM input data which returns multiple sequence
103 /// alignment data which has flanking gaps
105 {
106 public:
108  const unsigned int kQuerySize = 10;
109  const unsigned int kNumSeqs = 2;
110  const unsigned char kQuery[] = { 3, 9, 14, 20, 6, 23, 1, 7, 16, 5 };
111 
112  m_query = new unsigned char[kQuerySize];
113  memcpy((void*) m_query, (void*) kQuery, kQuerySize*sizeof(*kQuery));
114 
115  m_dim.query_length = kQuerySize;
116  m_dim.num_seqs = kNumSeqs;
117 
118  m_msa = PSIMsaNew(&m_dim);
119 
120  for (unsigned int i = 0; i < m_dim.query_length; i++) {
121  for (unsigned int j = 0; j < m_dim.num_seqs+1; j++) {
122  m_msa->data[j][i].letter = kQuery[i];
123  m_msa->data[j][i].is_aligned = true;
124  }
125  }
126 
127  // Add the flanking gaps
128  m_msa->data[1][0].letter =
129  m_msa->data[2][0].letter =
130  m_msa->data[2][m_dim.query_length-1].letter =
132 
133  m_options = NULL;
134  PSIBlastOptionsNew(&m_options);
135 
136  // don't request any diagnostics data
137  memset((void*) &m_diag_request, 0, sizeof(m_diag_request));
138  }
139 
141  delete [] m_query;
142  m_msa = PSIMsaFree(m_msa);
143  m_options = PSIBlastOptionsFree(m_options);
144  }
145 
// Trivial accessors of this mock: each one hands back a data member as-is;
// nothing is computed or validated here.
146  void Process() {} // deliberately empty
147  unsigned char* GetQuery() { return m_query; } // the m_query buffer itself, not a copy
148  unsigned int GetQueryLength() { return m_dim.query_length; } // length recorded in m_dim
149  PSIMsa* GetData() { return m_msa; } // the alignment structure held in m_msa
150  const PSIBlastOptions* GetOptions() { return m_options; } // read-only view of m_options
152  return &m_diag_request;
153  }
154 
155 protected:
156 
157  unsigned char* m_query;
162 };
163 
164 /// Mock object for the PSSM input data which returns a query sequence with a
165 /// gap in it
167 {
168 public:
170  // initialize multiple sequence alignment data with valid data
171  for (unsigned int i = 0; i < m_dim.query_length; i++) {
172  for (unsigned int j = 0; j < m_dim.num_seqs+1; j++) {
173  m_msa->data[j][i].letter = m_query[i];
174  m_msa->data[j][i].is_aligned = true;
175  }
176  }
177 
178  // Randomly assign a position in the query to contain a gap
179  CRandom r((CRandom::TValue)time(0));
180  int gap_position = r.GetRand(0, GetQueryLength() - 1);
181  m_query[gap_position] = AMINOACID_TO_NCBISTDAA[(int)'-'];
182  m_msa->data[0][gap_position].letter = m_query[gap_position];
183  }
184 };
185 
186 /// Mock object for the PSSM input data which reports a query length of
187 /// zero
189 {
190 public:
// Always reports a query length of 0.
191  unsigned int GetQueryLength() { return 0; }
192 };
193 
194 /// Mock object for the PSSM input data which returns NULLs for all its methods
196 {
197 public:
// Every accessor below returns NULL (or 0 for the length) and Process() does
// nothing, so a consumer of this object never receives usable input.
198  void Process() {}
199  unsigned char* GetQuery() { return NULL; }
200  unsigned int GetQueryLength() { return 0; }
201  PSIMsa* GetData() { return NULL; }
202  const PSIBlastOptions* GetOptions() { return NULL; }
203  const char* GetMatrixName() { return NULL; }
205 };
206 
208 {
209 public:
// Reports the literal matrix name "TEST".
// NOTE(review): presumably chosen because no scoring matrix of that name is
// supported - confirm against the test that instantiates this class.
210  const char* GetMatrixName() { return "TEST"; }
211 };
212 
213 /// Mock object for the PSSM input data which can be configured to have
214 /// different combinations of aligned sequences. Currently used to test the
215 /// purging of biased sequences in multiple sequence alignment data
217 {
218 public:
219  // Convenience for defining an aligned segment/region in the multiple
220  // sequence alignment
221  typedef pair<TSeqPos, TSeqPos> TAlignedSegment;
222 
223  // Enumeration to specify the various data setups that can be created with
224  // this class
226  eSelfHit, // Single pairwise alignment which is a self hit
227  eDuplicateHit, // 2 pairwise alignments where hits 1 and 2 are
228  // identical
229  eNearIdenticalHits, // 2 pairswise alignments where hits 1 and 2 are
230  // 94% identical
231  eMsaHasUnalignedRegion, // multiple sequence alignment with 3 sequences
232  // (including the query) which contain a region
233  // where the query is unaligned to any other
234  // sequences, i.e.:
235  // query: AAAAAAAAAABBBBBBBCCCCCCCCCCC
236  // sbjct: DDDDDDDDDD------------------
237  // sbjct: -----------------EEEEEEEEEEE
238  eQueryAlignedWithInternalGaps, // multiple sequence alignment with 2
239  // sequences which contain regions where internal
240  // (as opposed to flanking) gaps are aligned to the
241  // query sequence, i.e.:
242 // num_seqs: 1, query_length: 87
243 // MFKVYGYDSNIHKCGPCDNAKRLLTVKKQPFEFINIMPEKGVFDDEKIAELLTKLGRDTQIGLTMPQVFAPDGSHIGGFDQLREYFK
244 // KVVVFIKP----TCPFCRKTQELLSQLPFLLEFVDITAT--SDTNEIQDYLQQLTGA-----RTVPRVFIG-KECIGGCTDLESMHK
245  eHenikoffsPaper
246  };
247 
249 
// Constructor body (the signature line is absent from this listing).
// 1. Releases m_query / m_msa / m_options if m_query is already set.
// 2. Allocates fresh PSI-BLAST options and, when `opts` is non-NULL, copies
//    the caller's settings into them.
// 3. Dispatches on `type` to the Setup* routine that builds the alignment.
// Throws std::logic_error for an unrecognised `type`.
250  // Clean up data allocated by parent class
251  if (m_query) {
252  delete [] m_query;
253  m_query = NULL;
254  m_msa = PSIMsaFree(m_msa);
255  m_options = PSIBlastOptionsFree(m_options);
256  }
257 
258  PSIBlastOptionsNew(&m_options);
     // Copy into the structure m_options points at. The previous destination,
     // &m_options, was the address of the pointer member itself, so
     // sizeof(PSIBlastOptions) bytes were written over the pointer and
     // whatever members follow it. The extra m_options check avoids writing
     // through NULL if the allocation above failed.
259  if (opts && m_options) {
260  memcpy((void*)m_options, (void*)opts, sizeof(PSIBlastOptions));
261  }
262 
263  switch (type) {
264  case eSelfHit:
265  SetupSelfHit();
266  break;
267 
268  case eDuplicateHit:
269  SetupDuplicateHit();
270  break;
271 
272  case eNearIdenticalHits:
273  SetupNearIdenticalHits();
274  break;
275 
276  case eMsaHasUnalignedRegion:
277  SetupMsaHasUnalignedRegion();
278  break;
279 
280  case eQueryAlignedWithInternalGaps:
281  SetupQueryAlignedWithInternalGaps();
282  break;
283 
284  case eHenikoffsPaper:
285  SetupHenikoffsPositionBasedSequenceWeights();
286  break;
287 
288  default:
289  throw std::logic_error("Unsupported alignment test data");
290  }
291  }
292 
294  delete [] m_query;
295  m_query = NULL;
296  m_msa = PSIMsaFree(m_msa);
297  m_options = PSIBlastOptionsFree(m_options);
298  }
299 
300 
301 private:
302 // Gi 129295
303 static const size_t kQueryLength = 232;
304 static const Uint1 kQuery[kQueryLength];
305 
// Builds a kQueryLength-column alignment with one aligned sequence in
// addition to the query, in which both rows are copies of kQuery (a self
// hit). Allocates m_msa and m_query; the previous values of those members
// are overwritten without being released here.
306  void SetupSelfHit(void) {
307  const Uint4 kNumAlignedSeqs = 1; // does not include query
308 
309  m_dim.query_length = kQueryLength;
310  m_dim.num_seqs = kNumAlignedSeqs;
311  m_msa = PSIMsaNew(&m_dim);
312  m_query = new unsigned char[kQueryLength];
313 
314  // Initialize sequence 1 with the query (self-hit)
     // The inner loop covers rows 0..kNumAlignedSeqs, so m_query[i] is stored
     // once per row with the same value each time.
315  for (unsigned int i = 0; i < kQueryLength; i++) {
316  for (unsigned int seq_idx = 0; seq_idx < kNumAlignedSeqs + 1;
317  seq_idx++) {
318  m_msa->data[seq_idx][i].letter = m_query[i] = kQuery[i];
319  m_msa->data[seq_idx][i].is_aligned = true;
320  }
321  }
322  }
323 
325  (Uint1 res, const SNCBIPackedScoreMatrix* score_matrix)
326  {
327  BOOST_REQUIRE(score_matrix);
328  Uint1 retval = AMINOACID_TO_NCBISTDAA[(int)'-'];
329  int max_score = BLAST_SCORE_MIN;
330 
331  for (size_t i = 0; i < BLASTAA_SIZE; i++) {
332  // alignment with itself is not allowed :)
333  if (i == res) {
334  continue;
335  }
336  int score =
337  static_cast<int>(NCBISM_GetScore(score_matrix, res, i));
338  if (score > max_score) {
339  max_score = score;
340  retval = i;
341  }
342  }
343  BOOST_REQUIRE(retval != AMINOACID_TO_NCBISTDAA[(int)'-']);
344  return retval;
345  }
346 
348  const Uint4 kNumAlignedSeqs = 2; // does not include query
349 
350  m_dim.query_length = kQueryLength;
351  m_dim.num_seqs = kNumAlignedSeqs;
352  m_msa = PSIMsaNew(&m_dim);
353  m_query = new unsigned char[kQueryLength];
354 
355  // Initialize query sequence
356  for (unsigned int i = 0; i < kQueryLength; i++) {
357  m_msa->data[0][i].letter = m_query[i] = kQuery[i];
358  m_msa->data[0][i].is_aligned = true;
359  }
360 
361  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
362 
363  // Initialize sequence 1 with the highest scoring residues that can be
364  // aligned with the query for the first 100 residues
365  // This is done so that the aligned sequences are not purged in the
366  // first stage of PSSM creation
367  const TAlignedSegment kFirstAlignment(0, 100);
368  for (unsigned int i = kFirstAlignment.first;
369  i < kFirstAlignment.second; i++) {
370  m_msa->data[1][i].letter =
371  FindNonIdenticalHighScoringResidue(kQuery[i], score_matrix);
372  m_msa->data[1][i].is_aligned = true;
373  }
374 
375  // Initialize sequence 2 with the highest scoring residues that can be
376  // aligned with the query for residue positions 200-kQueryLength
377  // This is done so that the aligned sequences are not purged in the
378  // first stage of PSSM creation
379  const TAlignedSegment kSecondAlignment(200, kQueryLength);
380  for (unsigned int i = kSecondAlignment.first;
381  i < kSecondAlignment.second; i++) {
382  m_msa->data[2][i].letter =
383  FindNonIdenticalHighScoringResidue(kQuery[i], score_matrix);
384  m_msa->data[2][i].is_aligned = true;
385  }
386  }
387 
389  using std::pair;
390  using std::string;
391  using std::vector;
392 
393  const Uint4 kNumAlignedSeqs = 1;
394  const size_t kLocalQueryLength = 87;
395 
396  m_dim.query_length = kLocalQueryLength;
397  m_dim.num_seqs = kNumAlignedSeqs;
398  m_msa = PSIMsaNew(&m_dim);
399  m_query = new unsigned char[kLocalQueryLength];
400 
401  string query_seq("MFKVYGYDSNIHKCGPCDNAKRLLTVKKQPFEFINIM");
402  query_seq += string("PEKGVFDDEKIAELLTKLGRDTQIGLTMPQVFAPDGSHIGGFD");
403  query_seq += string("QLREYFK");
404 
405  typedef pair<TAlignedSegment, string> TAlignedSequence;
406  vector<TAlignedSequence> aligned_sequence;
407 
408  TAlignedSequence region(make_pair(make_pair(0U, 8U),
409  string("KVVVFIKP")));
410  aligned_sequence.push_back(region);
411 
412  region = make_pair(make_pair(12U, 39U),
413  string("TCPFCRKTQELLSQLPFLLEFVDITAT"));
414  aligned_sequence.push_back(region);
415 
416  region = make_pair(make_pair(41U, 57U), string("SDTNEIQDYLQQLTGA"));
417  aligned_sequence.push_back(region);
418 
419  region = make_pair(make_pair(62U, 71U), string("RTVPRVFIG"));
420  aligned_sequence.push_back(region);
421 
422  region = make_pair(make_pair(72U, 87U), string("KECIGGCTDLESMHK"));
423  aligned_sequence.push_back(region);
424 
425 
426  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
427  for (Uint4 i = 0; i < kLocalQueryLength; i++) {
429  query_seq.substr(i, 1));
430  m_msa->data[0][i].letter = m_query[i];
431  m_msa->data[0][i].is_aligned = true;
432 
433  // align the second sequence to gaps
434  m_msa->data[1][i].letter = kGapResidue;
435  m_msa->data[1][i].is_aligned = true;
436  }
437 
438  // Now overwrite the gaps with the aligned sequences
439  ITERATE(vector<TAlignedSequence>, itr, aligned_sequence) {
440  TAlignedSegment loc = itr->first; // location in the sequence
441  string sequence_data = itr->second;
442 
443  for (Uint4 i = loc.first, j = 0; i < loc.second; i++, j++) {
444  m_msa->data[1][i].letter =
446  sequence_data.substr(j, 1));
447  }
448  }
449  }
450 
452  const Uint4 kNumAlignedSeqs = 3; // does not include query
453  const Uint1 kQuerySequence[5] = { 7, 22, 19, 7, 17 };
454  const Uint1 kSeq1[5] = { 7, 6, 4, 7, 6 };
455  const Uint1 kSeq2[5] = { 7, 22, 4, 7, 6 };
456  const Uint1 kSeq3[5] = { 7, 22, 15, 7, 7 };
457 
458  m_dim.query_length = sizeof(kQuery);
459  m_dim.num_seqs = kNumAlignedSeqs;
460  m_msa = PSIMsaNew(&m_dim);
461  m_query = new unsigned char[sizeof(kQuerySequence)];
462 
463  // Initialize aligned sequences
464  for (Uint4 s = 0; s < kNumAlignedSeqs; s++) {
465 
466  const Uint1* sequence = NULL;
467  switch (s) {
468  case 0: sequence = kSeq1; break;
469  case 1: sequence = kSeq2; break;
470  case 2: sequence = kSeq3; break;
471  default: abort(); // should never happen
472  }
473 
474  for (Uint4 i = 0; i < sizeof(kQuerySequence); i++) {
475  m_query[i] = kQuerySequence[i];
476  m_msa->data[s][i].letter = sequence[i];
477  m_msa->data[s][i].is_aligned = true;
478  }
479  }
480  }
481 
// Builds a kQueryLength-column alignment with the query in row kQueryIndex
// and two aligned rows (1 and 2) that carry identical residues, i.e. the
// second hit duplicates the first. Allocates m_msa and m_query.
482  void SetupDuplicateHit(void) {
483  const Uint4 kNumAlignedSeqs = 2; // does not include query
484 
485  // This sequence is used as aligned sequence #1 and #2, i.e. it is a
486  // duplicate hit
     // Only the first kQueryLength of the 388 residues are copied below.
487  const Uint1 kGi_129296_[388] = {
488  12, 4, 17, 9, 17, 19, 18, 13, 1, 10, 6, 3, 6, 4, 19,
489  6, 13, 5, 12, 10, 19, 8, 8, 19, 13, 5, 13, 9, 11, 22,
490  3, 14, 11, 17, 9, 11, 18, 1, 11, 1, 12, 19, 22, 11, 7,
491  1, 16, 7, 13, 18, 5, 17, 15, 12, 10, 10, 19, 11, 8, 6,
492  4, 17, 9, 18, 7, 1, 7, 17, 18, 18, 4, 17, 15, 3, 7,
493  17, 17, 5, 22, 19, 8, 13, 11, 6, 10, 5, 11, 11, 17, 5,
494  9, 18, 16, 14, 13, 1, 18, 22, 17, 11, 5, 9, 1, 4, 10,
495  11, 22, 19, 4, 10, 18, 6, 17, 19, 11, 14, 5, 22, 11, 17,
496  3, 1, 16, 10, 6, 22, 18, 7, 7, 19, 5, 5, 19, 13, 6,
497  10, 18, 1, 1, 5, 5, 1, 16, 15, 11, 9, 13, 17, 20, 19,
498  5, 10, 5, 18, 13, 7, 15, 9, 10, 4, 11, 11, 19, 17, 17,
499  17, 9, 4, 6, 7, 18, 18, 12, 19, 6, 9, 13, 18, 9, 22,
500  6, 10, 7, 9, 20, 10, 9, 1, 6, 13, 18, 5, 4, 18, 16,
501  5, 12, 14, 6, 17, 12, 18, 10, 5, 5, 17, 10, 14, 19, 15,
502  12, 12, 3, 12, 13, 13, 17, 6, 13, 19, 1, 18, 11, 14, 1,
503  5, 10, 12, 10, 9, 11, 5, 11, 14, 22, 1, 17, 7, 4, 11,
504  17, 12, 11, 19, 11, 11, 14, 4, 5, 19, 17, 7, 11, 5, 16,
505  9, 5, 10, 18, 9, 13, 6, 4, 10, 11, 16, 5, 20, 18, 17,
506  18, 13, 1, 12, 1, 10, 10, 17, 12, 10, 19, 22, 11, 14, 16,
507  12, 10, 9, 5, 5, 10, 22, 13, 11, 18, 17, 9, 11, 12, 1,
508  11, 7, 12, 18, 4, 11, 6, 17, 16, 17, 1, 13, 11, 18, 7,
509  9, 17, 17, 19, 4, 13, 11, 12, 9, 17, 4, 1, 19, 8, 7,
510  19, 6, 12, 5, 19, 13, 5, 5, 7, 18, 5, 1, 18, 7, 17,
511  18, 7, 1, 9, 7, 13, 9, 10, 8, 17, 11, 5, 11, 5, 5,
512  6, 16, 1, 4, 8, 14, 6, 11, 6, 6, 9, 16, 22, 13, 14,
513  18, 13, 1, 9, 11, 6, 6, 7, 16, 22, 20, 17, 14};
514 
515  m_dim.query_length = kQueryLength;
516  m_dim.num_seqs = kNumAlignedSeqs;
517  m_msa = PSIMsaNew(&m_dim);
518  m_query = new unsigned char[kQueryLength];
519 
     // Query row: kQuery is copied into both m_query and the alignment.
520  for (unsigned int i = 0; i < kQueryLength; i++) {
521  m_msa->data[kQueryIndex][i].letter = m_query[i] = kQuery[i];
522  m_msa->data[kQueryIndex][i].is_aligned = true;
523  }
524 
     // Aligned rows 1..kNumAlignedSeqs all receive the same residues.
525  for (unsigned int i = 1; i < kNumAlignedSeqs + 1; i++) {
526  for (unsigned int j = 0; j < kQueryLength; j++) {
527  m_msa->data[i][j].letter = kGi_129296_[j];
528  m_msa->data[i][j].is_aligned = true;
529  }
530  }
531  }
532 
534  SetupDuplicateHit();
535 
536  const Uint4 kHitIndex = 2; // index of the near identical hit
537  const Uint4 kNumIdenticalResidues = (Uint4) (GetQueryLength() *
538  (kPSINearIdentical + 0.01));
539 
540  for (Uint4 i = kNumIdenticalResidues; i < GetQueryLength(); i++) {
541  Uint1& residue = m_msa->data[kHitIndex][i].letter;
542  residue = (residue + 1) % BLASTAA_SIZE;
543  BOOST_REQUIRE(residue > 0 && residue < BLASTAA_SIZE);
544  }
545  }
546 };
547 
550  15, 9, 10, 4, 11, 11, 19, 17, 17, 17, 18, 4, 11, 4, 18,
551  18, 11, 19, 11, 19, 13, 1, 9, 22, 6, 10, 7, 12, 20, 10,
552  18, 1, 6, 13, 1, 5, 4, 18, 16, 5, 12, 14, 6, 8, 19,
553  18, 10, 15, 5, 17, 10, 14, 19, 15, 12, 12, 3, 12, 13, 13,
554  17, 6, 13, 19, 1, 18, 11, 14, 1, 5, 10, 12, 10, 9, 11,
555  5, 11, 14, 6, 1, 17, 7, 4, 11, 17, 12, 11, 19, 11, 11,
556  14, 4, 5, 19, 17, 4, 11, 5, 16, 9, 5, 10, 18, 9, 13,
557  6, 5, 10, 11, 18, 5, 20, 18, 13, 14, 13, 18, 12, 5, 10,
558  16, 16, 19, 10, 19, 22, 11, 14, 15, 12, 10, 9, 5, 5, 10,
559  22, 13, 11, 18, 17, 19, 11, 12, 1, 11, 7, 12, 18, 4, 11,
560  6, 9, 14, 17, 1, 13, 11, 18, 7, 9, 17, 17, 1, 5, 17,
561  11, 10, 9, 17, 15, 1, 19, 8, 7, 1, 6, 12, 5, 11, 17,
562  5, 4, 7, 9, 5, 12, 1, 7, 17, 18, 7, 19, 9, 5, 4,
563  9, 10, 8, 17, 14, 5, 17, 5, 15, 6, 16, 1, 4, 8, 14,
564  6, 11, 6, 11, 9, 10, 8, 13, 14, 18, 13, 18, 9, 19, 22,
565  6, 7, 16, 22, 20, 17, 14};
566 
567 
568 BOOST_FIXTURE_TEST_SUITE(pssmcreate, CPssmCreateTestFixture)
569 
570 
571 BOOST_AUTO_TEST_CASE(testFullPssmEngineRunWithDiagnosticsRequest) {
572 
573  const string seqalign("data/nr-129295.new.asn.short");
574  unique_ptr<CObjectIStream> in
576 
578  *in >> *sas;
579 
580  CSeq_id qid("gi|129295"), sid("gi|6");
581  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(qid));
582  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
583 
584  CPSIBlastOptions opts;
585  PSIBlastOptionsNew(&opts);
586 
587  PSIDiagnosticsRequest request;
588  memset((void*) &request, 0, sizeof(request));
589  request.information_content = true;
590  request.residue_frequencies = true;
591  request.weighted_residue_frequencies = true;
592  request.frequency_ratios = true;
593  request.gapless_column_weights = true;
594  request.sigma = true;
595  request.interval_sizes = true;
596  request.num_matching_seqs = true;
597 
598  const string kTitle("Test defline");
599 
600  CRef<IPssmInputData> pssm_strategy(
601  new CPsiBlastInputData(seq.data.get()+1,
602  seq.length-2, // don't count sentinels
603  sas, q->scope,
604  *opts,
605  "BLOSUM80",
606  11,
607  1,
608  &request,
609  kTitle));
610  CRef<CPssmEngine> pssm_engine(new CPssmEngine(pssm_strategy));
611  CRef<CPssmWithParameters> pssm = pssm_engine->Run();
612 
613  CRef<CBioseq> bioseq = pssm_strategy->GetQueryForPssm();
614  BOOST_REQUIRE_EQUAL(bioseq->GetLength(), seq.length-2);
615 
616  string query_descr = NcbiEmptyString;
617  if (bioseq->IsSetDescr()) {
618  const CBioseq::TDescr::Tdata& data = bioseq->GetDescr().Get();
620  if((*iter)->IsTitle()) {
621  query_descr += (*iter)->GetTitle();
622  }
623  }
624  }
625  BOOST_REQUIRE_EQUAL(query_descr, kTitle);
626 
627 
628  const size_t kNumElements =
629  pssm_strategy->GetQueryLength() * BLASTAA_SIZE;
630  // Verify the residue frequencies came back
631  const CPssmIntermediateData::TResFreqsPerPos& res_freqs =
633  BOOST_REQUIRE_EQUAL(kNumElements, res_freqs.size());
634 
637  BOOST_REQUIRE_EQUAL(kNumElements, wres_freqs.size());
638 
639  const CPssmIntermediateData::TFreqRatios& freq_ratios =
641  BOOST_REQUIRE_EQUAL(kNumElements, freq_ratios.size());
642 
643  //TestUtil::PrintTextAsn1Object("pssm-diags.asn", &*pssm);
644 }
645 
646 // test sequence alignment conversion to multiple sequence alignment
647 // structure
648 BOOST_AUTO_TEST_CASE(testSeqAlignToPsiBlastMultipleSequenceAlignment) {
649 
650  /*** Setup code ***/
651  CSeq_id qid("gi|129295"), sid("gi|6");
652  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(qid));
653  unique_ptr<SSeqLoc> s(CTestObjMgr::Instance().CreateSSeqLoc(sid));
654  CBl2Seq blaster(*q, *s, eBlastp);
655  TSeqAlignVector sasv = blaster.Run();
656  BOOST_REQUIRE(sasv.size() != 0);
657 
658  CPSIBlastOptions opts;
659  PSIBlastOptionsNew(&opts);
660 
662  opts->use_best_alignment = FALSE;
663 
664  // Retrieve the query sequence, but skip the sentinel bytes!
665  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
666 
667  try {
668  unique_ptr<CPsiBlastInputData> pssm_input(
669  new CPsiBlastInputData(seq.data.get()+1,
670  seq.length-2,
671  sasv[0], q->scope, *opts));
672  // Create the score matrix builder!
673  CPssmEngine pssm_engine(pssm_input.get());
674  pssm_input->Process();
675  // include query
677 
678  /*** End Setup code ***/
679 
680  // Actual unit tests follow:
681  // Walk through the alignment segments and ensure m_AlignmentData
682  // is filled properly
683 
684  TSeqPos seq_index = 1; // skip the query sequence
685  const PSIMsaCell kNullPSIMsaCell = {
686  (unsigned char) 0, // letter
687  false // is_aligned
688  };
689 
690  // vector to keep track of aligned positions of a particular
691  // subject w.r.t the query/query sequence
692  vector<PSIMsaCell> aligned_pos(pssm_input->GetQueryLength());
693  fill(aligned_pos.begin(), aligned_pos.end(), kNullPSIMsaCell);
694 
695  // Iterate over all HSPs and populate the aligned_pos vector.
696  // This should be identical to what the pssm_engine object
697  // calculated.
698  ITERATE(CSeq_align_set::Tdata, hsp, sasv[0]->Get()) {
699  const CDense_seg& ds = (*hsp)->GetSegs().GetDenseg();
700  string subj;
702  *s->scope, subj);
703  const vector<TSignedSeqPos>& starts = ds.GetStarts();
704  const vector<TSeqPos>& lengths = ds.GetLens();
705 
706  for (int i = 0; i < ds.GetNumseg(); i++) {
707  TSignedSeqPos q_index = starts[i*ds.GetDim()];
708  TSignedSeqPos s_index = starts[i*ds.GetDim()+1];
709 // FIXME
710 #define GAP_IN_ALIGNMENT -1
711  if (s_index == (int)GAP_IN_ALIGNMENT) {
712  for (TSeqPos pos = 0; pos < lengths[i]; pos++) {
713  PSIMsaCell& pd = aligned_pos[q_index++];
715  pd.is_aligned = true;
716  }
717  } else if (q_index == (int)GAP_IN_ALIGNMENT) {
718  s_index += lengths[i];
719  continue;
720  } else {
721  s_index = (i == 0) ? 0 : (s_index - starts[1]);
722  for (TSeqPos pos = 0; pos < lengths[i]; pos++) {
723  PSIMsaCell& pd = aligned_pos[q_index++];
724  pd.letter = subj[s_index++];
725  pd.is_aligned = true;
726  }
727  }
728  }
729  }
730 
731  stringstream ss;
732  // Now compare each position for this sequence
733  for (TSeqPos i = 0; i < pssm_input->GetQueryLength(); i++) {
734  BOOST_REQUIRE(seq_index < nseqs);
735  const PSIMsaCell& pos_desc =
736  pssm_input->GetData()->data[seq_index][i];
737  ss.str("");
738  ss << "Sequence " << seq_index << ", position " << i
739  << " differ";
740  BOOST_REQUIRE_MESSAGE(aligned_pos[i].letter == pos_desc.letter &&
741  aligned_pos[i].is_aligned == pos_desc.is_aligned, ss.str());
742  }
743 
744  seq_index++;
745  } catch (const exception& e) {
746  cerr << e.what() << endl;
747  BOOST_REQUIRE(false);
748  } catch (...) {
749  cerr << "Unknown exception" << endl;
750  BOOST_REQUIRE(false);
751  }
752 }
753 
754 /// Unit test the individual stages of the PSSM creation algorithm (core
755 /// layer):
756 /// 1. purge biased sequences
// Calls the purge stage with a NULL alignment and requires that the call
// returns PSIERR_BADPARAM.
757 BOOST_AUTO_TEST_CASE(testPurgeSequencesWithNull) {
758  int rv = _PSIPurgeBiasedSegments(NULL);
759  BOOST_REQUIRE_EQUAL(PSIERR_BADPARAM, rv);
760 }
761 
762 BOOST_AUTO_TEST_CASE(testPurgeSelfHit) {
763  unique_ptr<IPssmInputData> pssm_input
765  pssm_input->Process(); // standard calling convention
766  AutoPtr<_PSIPackedMsa> msa(_PSIPackedMsaNew(pssm_input->GetData()));
767  int rv = _PSIPurgeBiasedSegments(msa.get());
768  BOOST_REQUIRE_EQUAL(PSI_SUCCESS, rv);
769  const Uint4 kSelfHitIndex = 1;
770  BOOST_REQUIRE_EQUAL(true, !!msa->use_sequence[kQueryIndex]);
771  BOOST_REQUIRE_EQUAL(false, !!msa->use_sequence[kSelfHitIndex]);
772 }
773 
774 BOOST_AUTO_TEST_CASE(testPurgeDuplicateHit) {
775  unique_ptr<IPssmInputData> pssm_input
777  pssm_input->Process(); // standard calling convention
778  AutoPtr<_PSIPackedMsa> msa(_PSIPackedMsaNew(pssm_input->GetData()));
779  int rv = _PSIPurgeBiasedSegments(msa.get());
780  BOOST_REQUIRE_EQUAL(PSI_SUCCESS, rv);
781  const Uint4 kDuplicateHitIndex = 2;
782  BOOST_REQUIRE_EQUAL(false, !!msa->use_sequence[kDuplicateHitIndex]);
783  BOOST_REQUIRE_EQUAL(true, !!msa->use_sequence[kQueryIndex]);
784  BOOST_REQUIRE_EQUAL(true, !!msa->use_sequence[kQueryIndex + 1]);
785 }
786 
787 BOOST_AUTO_TEST_CASE(testPurgeNearIdenticalHits) {
788  unique_ptr<IPssmInputData> pssm_input
790  pssm_input->Process(); // standard calling convention
791  AutoPtr<_PSIPackedMsa> msa(_PSIPackedMsaNew(pssm_input->GetData()));
792  int rv = _PSIPurgeBiasedSegments(msa.get());
793  BOOST_REQUIRE_EQUAL(PSI_SUCCESS, rv);
794  const Uint4 kRemovedHitIndex = 2;
795  BOOST_REQUIRE_EQUAL(false,
796  !! msa->use_sequence[kRemovedHitIndex]);
797  BOOST_REQUIRE_EQUAL(true, !!msa->use_sequence[kQueryIndex]);
798  BOOST_REQUIRE_EQUAL(true, !! msa->use_sequence[kQueryIndex + 1]);
799 }
800 
801 BOOST_AUTO_TEST_CASE(testQueryAlignedWithInternalGaps) {
802  unique_ptr<IPssmInputData> pssm_input
803  (new CPssmInputTestData
805  BOOST_REQUIRE_EQUAL(string("BLOSUM62"),
806  string(pssm_input->GetMatrixName()));
807  CPssmEngine pssm_engine(pssm_input.get());
808  CRef<CPssmWithParameters> pssm_asn = pssm_engine.Run();
809 
810  unique_ptr< CNcbiMatrix<int> > pssm
812 
813  /* Make sure that the resulting PSSM's scores are based on the scores
814  * of the underlying scoring matrix and the query sequence (i.e.: the
815  * PSSM scores should be within one or two values from those in the
816  * underlying scoring matrix) */
817 
818  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
819  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
820  const Uint1 kBResidue = AMINOACID_TO_NCBISTDAA[(int)'B'];
821  const Uint1 kZResidue = AMINOACID_TO_NCBISTDAA[(int)'Z'];
822  const Uint1 kUResidue = AMINOACID_TO_NCBISTDAA[(int)'U'];
823  const Uint1 kOResidue = AMINOACID_TO_NCBISTDAA[(int)'O'];
824  stringstream ss;
825  BOOST_REQUIRE_EQUAL((size_t)pssm_asn->GetPssm().GetNumColumns(),
826  (size_t)pssm->GetCols());
827  BOOST_REQUIRE_EQUAL((size_t)pssm_asn->GetPssm().GetNumRows(),
828  (size_t)pssm->GetRows());
829  for (int i = 0; i < pssm_asn->GetPssm().GetNumColumns(); i++) {
830  for (int j = 0; j < pssm_asn->GetPssm().GetNumRows(); j++) {
831 
832 
833  // Query positions aligned to residues in the subject sequence
834  // may have different PSSM scores than in the underlying
835  // scoring matrix
836  if (pssm_input->GetData()->data[1][i].is_aligned
837  && pssm_input->GetData()->data[1][i].letter != kGapResidue) {
838  continue;
839  }
840 
841  // Exceptional residues get value of BLAST_SCORE_MIN
842  if (j == kGapResidue || j == kBResidue || j == kZResidue
843  || j == kUResidue || j >= kOResidue) {
844  ss.str("");
845  ss << "Position " << i << " residue "
846  << TestUtil::GetResidue(j) << " differ on PSSM";
847  BOOST_REQUIRE_MESSAGE(BLAST_SCORE_MIN == (*pssm)(j, i), ss.str());
848  } else {
849  int score =
850  (int)NCBISM_GetScore(score_matrix,
851  pssm_input->GetQuery()[i], j);
852 
853  ss.str("");
854  ss << "Position " << i << " residue "
855  << TestUtil::GetResidue(j) << " differ on PSSM: "
856  << "expected=" << NStr::IntToString(score)
857  << " actual=" << NStr::IntToString((*pssm)(j, i));
858 
859  // The difference is due to distributing gap frequency
860  // over all residues
861  BOOST_REQUIRE_MESSAGE (score - (*pssm)(j, i) <= 3, ss.str());
862  }
863  }
864  }
865 }
866 
867 BOOST_AUTO_TEST_CASE(testMultiSeqAlignmentHasRegionsUnalignedToQuery) {
868  unique_ptr<IPssmInputData> pssm_input
869  (new
871  pssm_input->Process(); // standard calling convention
872  BOOST_REQUIRE_EQUAL(string("BLOSUM62"),
873  string(pssm_input->GetMatrixName()));
874 
875 
876  /*** Run the stage to purge biased alignment segments */
877  AutoPtr<_PSIPackedMsa> packed_msa
878  (_PSIPackedMsaNew(pssm_input->GetData()));
879  int rv = _PSIPurgeBiasedSegments(packed_msa.get());
880  BOOST_REQUIRE_EQUAL(PSI_SUCCESS, rv);
881  BOOST_REQUIRE_EQUAL(true,
882  !!packed_msa->use_sequence[kQueryIndex]);
883  BOOST_REQUIRE_EQUAL(true, !! packed_msa->use_sequence[1]);
884  BOOST_REQUIRE_EQUAL(true, !! packed_msa->use_sequence[2]);
885 
886  AutoPtr<_PSIMsa> msa(_PSIMsaNew(packed_msa.get(), BLASTAA_SIZE));
887  /*** Run the stage to calculate alignment extents */
888  CPSIBlastOptions opts;
889  PSIBlastOptionsNew(&opts);
890  AutoPtr<_PSIAlignedBlock> aligned_blocks(
891  _PSIAlignedBlockNew(pssm_input->GetQueryLength()));
892  rv = _PSIComputeAlignmentBlocks(msa.get(), aligned_blocks.get());
893  stringstream ss;
894  ss << "_PSIComputeAlignmentBlocks failed: "
896  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
897 
898  // Verify the alignment extents for aligned regions to the query
899  vector<CPssmInputTestData::TAlignedSegment> aligned_regions;
900  aligned_regions.push_back(make_pair(0U, 99U));
901  aligned_regions.push_back(make_pair(200U,
902  pssm_input->GetQueryLength()-1));
903 
904  for (vector<CPssmInputTestData::TAlignedSegment>::const_iterator i =
905  aligned_regions.begin();
906  i != aligned_regions.end(); ++i) {
907  for (TSeqPos pos = i->first; pos < i->second; pos++) {
908  ss.str("");
909  ss << "Alignment extents differ at position "
910  << NStr::IntToString(pos);
911  BOOST_REQUIRE_MESSAGE((int)i->first == (int)aligned_blocks->pos_extnt[pos].left, ss.str());
912  BOOST_REQUIRE_MESSAGE((int)i->second == (int)aligned_blocks->pos_extnt[pos].right, ss.str());
913  BOOST_REQUIRE_MESSAGE( (int)(i->second - i->first + 1) == (int)aligned_blocks->size[pos], ss.str());
914  }
915  }
916 
917  // Verify the alignment extents for unaligned regions to the query
918  const CPssmInputTestData::TAlignedSegment kUnalignedRange(100, 200);
919  for (size_t i = kUnalignedRange.first;
920  i < kUnalignedRange.second; i++) {
921  ss.str("");
922  ss << "Alignment extents differ at position "
924  BOOST_REQUIRE_MESSAGE((int)-1 == (int)aligned_blocks->pos_extnt[i].left, ss.str());
925  BOOST_REQUIRE_MESSAGE( (int)pssm_input->GetQueryLength() == (int)aligned_blocks->pos_extnt[i].right, ss.str());
926  BOOST_REQUIRE_MESSAGE(
927  (int)(aligned_blocks->pos_extnt[i].right - aligned_blocks->pos_extnt[i].left + 1) == (int)aligned_blocks->size[i],
928  ss.str());
929  }
930 
931  /*** Run the stage to compute the sequence weights */
932  blast::TAutoUint1Ptr query_with_sentinels
933  (CPssmCreateTestFixture::x_GuardProteinQuery(pssm_input->GetQuery(),
934  pssm_input->GetQueryLength()));;
935  CBlastScoreBlk sbp;
936  sbp.Reset
938  (query_with_sentinels.get(), pssm_input->GetQueryLength()));
939  AutoPtr<_PSISequenceWeights> seq_weights(
940  _PSISequenceWeightsNew(msa->dimensions,
941  sbp));
942  rv = _PSIComputeSequenceWeights(msa.get(), aligned_blocks.get(),
944  seq_weights.get());
945  ss.str("");
946  ss << "_PSIComputeSequenceWeights failed: "
948  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
949 
950  // Verify the validity of sequence weights corresponding to the aligned
951  // regions
952  BOOST_REQUIRE_EQUAL(false, !!opts->nsg_compatibility_mode);
953  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA[(int)'X'];
954  for (vector<CPssmInputTestData::TAlignedSegment>::const_iterator i =
955  aligned_regions.begin();
956  i != aligned_regions.end(); ++i) {
957  for (TSeqPos pos = i->first; pos < i->second; pos++) {
958  double total_sequence_weights_for_column = 0.0;
959  for (size_t res = 0; res < msa->alphabet_size; res++) {
960  if (res == kXResidue) continue;
961  total_sequence_weights_for_column +=
962  seq_weights->match_weights[pos][res];
963  }
964  BOOST_REQUIRE(total_sequence_weights_for_column > 0.99 &&
965  total_sequence_weights_for_column < 1.01);
966  }
967  }
968  // Verify that the unaligned sequence weights are all zero's
969  for (size_t pos = kUnalignedRange.first;
970  pos < kUnalignedRange.second; pos++) {
971  double total_sequence_weights_for_column = 0.0;
972  for (size_t res = 0; res < msa->alphabet_size; res++) {
973  if (res == kXResidue) continue;
974  total_sequence_weights_for_column +=
975  seq_weights->match_weights[pos][res];
976  }
977  BOOST_REQUIRE(total_sequence_weights_for_column == 0.0);
978  }
979 
980  /*** run the stage to compute the PSSM's frequency ratios ***/
981  AutoPtr<_PSIInternalPssmData> internal_pssm(
982  _PSIInternalPssmDataNew(pssm_input->GetQueryLength(),
983  sbp->alphabet_size));
984  rv = _PSIComputeFreqRatios(msa.get(), seq_weights.get(), sbp,
985  aligned_blocks.get(), opts->pseudo_count,
987  internal_pssm.get());
988  ss.str("");
989  ss << "_PSIComputeResidueFrequencies failed: "
991  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
992 
993  /***** Run the stage to convert residue frequencies to PSSM **********/
994  rv = _PSIConvertFreqRatiosToPSSM(internal_pssm.get(),
995  msa->query,
996  sbp,
997  seq_weights->std_prob);
998  ss.str("");
999  ss << "_PSIConvertResidueFreqsToPSSM failed: "
1001  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1002 
1003  /**************** Run the stage to scale the PSSM ********************/
1004  rv = _PSIScaleMatrix(msa->query,
1005  seq_weights->std_prob,
1006  internal_pssm.get(),
1007  sbp);
1008  ss.str("");
1009  ss << "_PSIScaleMatrix failed: "
1011  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1012 
1013  BOOST_REQUIRE_EQUAL(msa->dimensions->num_seqs, 3u);
1014 
1015  /* Make sure that the resulting PSSM's scores are based on the scores
1016  * of the underlying scoring matrix and the query sequence (i.e.: the
1017  * PSSM scores should be within one or two values from those in the
1018  * underlying scoring matrix) */
1019  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
1020  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
1021  const Uint1 kBResidue = AMINOACID_TO_NCBISTDAA[(int)'B'];
1022  const Uint1 kZResidue = AMINOACID_TO_NCBISTDAA[(int)'Z'];
1023  const Uint1 kUResidue = AMINOACID_TO_NCBISTDAA[(int)'U'];
1024  const Uint1 kOResidue = AMINOACID_TO_NCBISTDAA[(int)'O'];
1025  for (Uint4 i = 0; i < pssm_input->GetQueryLength(); i++) {
1026  for (Uint4 j = 0; j < (Uint4) sbp->alphabet_size; j++) {
1027 
1028  // we are not comparing PSSM scores for the aligned positions
1029  if (msa->cell[1][i].is_aligned || msa->cell[2][i].is_aligned
1030  || msa->cell[3][i].is_aligned) {
1031  continue;
1032  }
1033 
1034  // these residues may have different scores than in the
1035  // underlying scoring matrix
1036  if (j == kBResidue || j == kZResidue || j == kUResidue
1037  || j >= kOResidue) {
1038  continue;
1039  }
1040 
1041  // Exceptional residues get value of BLAST_SCORE_MIN
1042  if (j == kGapResidue) {
1043  ss.str("");
1044  ss << "Position " << i << " residue "
1045  << TestUtil::GetResidue(j) << " differ on PSSM";
1046  BOOST_REQUIRE_MESSAGE(BLAST_SCORE_MIN == internal_pssm->pssm[i][j], ss.str());
1047  } else {
1048  int score =
1049  (int)NCBISM_GetScore(score_matrix, msa->query[i], j);
1050 
1051  ss.str("");
1052  ss << "Position " << i << " residue "
1053  << TestUtil::GetResidue(j) << " differ on PSSM: "
1054  << "expected=" << NStr::IntToString(score)
1055  << " actual=" <<
1056  NStr::IntToString(internal_pssm->pssm[i][j]);
1057  BOOST_REQUIRE_MESSAGE(score-1 <= internal_pssm->pssm[i][j] && internal_pssm->pssm[i][j] <= score+1, ss.str());
1058  }
1059  }
1060  }
1061 }
1062 
1063 /// Tests the case when a segment of the query sequence is the only
1064 /// aligned sequence in the multiple sequence alignment.
1065 /// The scores in the PSSM should be based on the underlying scoring matrix
///
/// The PSSM stages are driven by hand, in order: purge biased segments,
/// compute alignment blocks, compute sequence weights, compute frequency
/// ratios, convert frequency ratios to scores, and scale the matrix. Each
/// stage must return PSI_SUCCESS. Finally every PSSM cell is compared with
/// BLAST_SCORE_MIN (exceptional residues) or the BLOSUM62 score (others).
///
/// NOTE(review): the embedded listing numbers skip lines 1068, 1094, 1112,
/// 1124, 1137, 1147 and 1157 in this block, so a few statements below look
/// truncated -- confirm against the repository copy before editing code.
1066 BOOST_AUTO_TEST_CASE(testQueryIsOnlyAlignedSequenceInMsa) {
1067  unique_ptr<IPssmInputData> pssm_input
1069  pssm_input->Process(); // standard calling convention
1070  BOOST_REQUIRE_EQUAL(string("BLOSUM62"),
1071  string(pssm_input->GetMatrixName()));
1072 
1073 
1074  /*** Run the stage to purge biased alignment segments */
1075  AutoPtr<_PSIPackedMsa> packed_msa
1076  (_PSIPackedMsaNew(pssm_input->GetData()));
1077  int rv = _PSIPurgeBiasedSegments(packed_msa.get());
1078  BOOST_REQUIRE_EQUAL(PSI_SUCCESS, rv);
 // After purging, the query row must still be in use while the row at
 // kSelfHitIndex must have been dropped.
1079  const Uint4 kSelfHitIndex = 1;
1080  BOOST_REQUIRE_EQUAL(true,
1081  !! packed_msa->use_sequence[kQueryIndex]);
1082  BOOST_REQUIRE_EQUAL(false,
1083  !! packed_msa->use_sequence[kSelfHitIndex]);
1084 
1085  AutoPtr<_PSIMsa> msa(_PSIMsaNew(packed_msa.get(), BLASTAA_SIZE));
1086  /*** Run the stage to calculate alignment extents */
1087  CPSIBlastOptions opts;
1088  PSIBlastOptionsNew(&opts);
1089  AutoPtr<_PSIAlignedBlock> aligned_blocks(
1090  _PSIAlignedBlockNew(pssm_input->GetQueryLength()));
1091  rv = _PSIComputeAlignmentBlocks(msa.get(), aligned_blocks.get());
1092  stringstream ss;
1093  ss << "_PSIComputeAlignmentBlocks failed: "
1095  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1096 
 // Every query position is expected to carry the extents -1 (left) and
 // query length (right); the block size is then right - left + 1, i.e.
 // query length + 2.
1097  for (size_t i = 0; i < pssm_input->GetQueryLength(); i++) {
1098  BOOST_REQUIRE_EQUAL((int)-1,
1099  (int)aligned_blocks->pos_extnt[i].left);
1100  BOOST_REQUIRE_EQUAL((int)pssm_input->GetQueryLength(),
1101  (int)aligned_blocks->pos_extnt[i].right);
1102  BOOST_REQUIRE_EQUAL((int)pssm_input->GetQueryLength() + 2,
1103  (int)aligned_blocks->size[i]);
1104  }
1105 
1106  /*** Run the stage to compute the sequence weights */
1107  blast::TAutoUint1Ptr query_with_sentinels
1108  (CPssmCreateTestFixture::x_GuardProteinQuery(pssm_input->GetQuery(),
1109  pssm_input->GetQueryLength()));;
1110  CBlastScoreBlk sbp;
1111  sbp.Reset
1113  (query_with_sentinels.get(), pssm_input->GetQueryLength()));
1114  AutoPtr<_PSISequenceWeights> seq_weights(
1115  _PSISequenceWeightsNew(msa->dimensions,
1116  sbp));
1117  rv = _PSIComputeSequenceWeights(msa.get(), aligned_blocks.get(),
1118  // N.B.: TRUE is passed as the nsg_compatibility_mode
1119  // argument: we're deliberately ignoring the sequence weights check!!!!
 // (unlike the later stage, which forwards opts->nsg_compatibility_mode)
1120  TRUE,
1121  seq_weights.get());
1122  ss.str("");
1123  ss << "_PSIComputeSequenceWeights failed: "
1125  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1126 
1127  /*** run the stage to compute the PSSM's frequency ratios ***/
1128  AutoPtr<_PSIInternalPssmData> internal_pssm(
1129  _PSIInternalPssmDataNew(pssm_input->GetQueryLength(),
1130  sbp->alphabet_size));
1131  rv = _PSIComputeFreqRatios(msa.get(), seq_weights.get(), sbp,
1132  aligned_blocks.get(), opts->pseudo_count,
1133  opts->nsg_compatibility_mode,
1134  internal_pssm.get());
1135  ss.str("");
1136  ss << "_PSIComputeResidueFrequencies failed: "
1138  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1139 
1140  /***** Run the stage to convert residue frequencies to PSSM **********/
1141  rv = _PSIConvertFreqRatiosToPSSM(internal_pssm.get(),
1142  msa->query,
1143  sbp,
1144  seq_weights->std_prob);
1145  ss.str("");
1146  ss << "_PSIConvertResidueFreqsToPSSM failed: "
1148  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1149 
1150  /**************** Run the stage to scale the PSSM ********************/
1151  rv = _PSIScaleMatrix(msa->query,
1152  seq_weights->std_prob,
1153  internal_pssm.get(),
1154  sbp);
1155  ss.str("");
1156  ss << "_PSIScaleMatrix failed: "
1158  BOOST_REQUIRE_MESSAGE(PSI_SUCCESS == rv, ss.str());
1159 
1160  /* Make sure that the resulting PSSM's scores are based on the scores
1161  * of the underlying scoring matrix and the query sequence (i.e.: each
1162  * PSSM score must be within 1 of the corresponding score in the
1163  * underlying scoring matrix, as asserted below) */
1164  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
1165  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
1166  const Uint1 kBResidue = AMINOACID_TO_NCBISTDAA[(int)'B'];
1167  const Uint1 kZResidue = AMINOACID_TO_NCBISTDAA[(int)'Z'];
1168  const Uint1 kUResidue = AMINOACID_TO_NCBISTDAA[(int)'U'];
1169  const Uint1 kOResidue = AMINOACID_TO_NCBISTDAA[(int)'O'];
 // i walks the query positions, j walks the residue codes of the alphabet
1170  for (Uint4 i = 0; i < pssm_input->GetQueryLength(); i++) {
1171  for (Uint4 j = 0; j < (Uint4) sbp->alphabet_size; j++) {
1172 
1173  // Exceptional residues (gap, B, Z, U and every code from O upward)
 // get value of BLAST_SCORE_MIN
1174  if (j == kGapResidue || j == kBResidue || j == kZResidue
1175  || j == kUResidue || j >= kOResidue) {
1176  ss.str("");
1177  ss << "Position " << i << " residue "
1178  << TestUtil::GetResidue(j) << " differ on PSSM";
1179  BOOST_REQUIRE_MESSAGE(BLAST_SCORE_MIN == internal_pssm->pssm[i][j], ss.str());
1180  } else {
 // All other residues: compare with the BLOSUM62 score of the
 // query residue at position i against residue j.
1181  int score =
1182  (int)NCBISM_GetScore(score_matrix, msa->query[i], j);
1183 
1184  ss.str("");
1185  ss << "Position " << i << " residue "
1186  << TestUtil::GetResidue(j) << " differ on PSSM: "
1187  << "expected=" << NStr::IntToString(score)
1188  << " actual=" <<
1189  NStr::IntToString(internal_pssm->pssm[i][j]);
 // Tolerance is +/- 1 around the matrix score.
1190  BOOST_REQUIRE_MESSAGE(score-1 <= internal_pssm->pssm[i][j] && internal_pssm->pssm[i][j] <= score+1, ss.str());
1191  }
1192  }
1193  }
1194 }
1195 
1196 BOOST_AUTO_TEST_CASE(testRejectFlankingGaps) {
1197  unique_ptr<IPssmInputData> bad_pssm_data(new CPssmInputFlankingGaps());
1198  CPssmEngine pssm_engine(bad_pssm_data.get());
1199  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
1200 }
1201 
1202 BOOST_AUTO_TEST_CASE(testRejectGapInQuery) {
1203  unique_ptr<IPssmInputData> bad_pssm_data(new CPssmInputGapsInQuery());
1204  CPssmEngine pssm_engine(bad_pssm_data.get());
1205  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
1206 }
1207 
1208 BOOST_AUTO_TEST_CASE(testRejectQueryLength0) {
1209  unique_ptr<IPssmInputData> bad_pssm_data(new CPssmInputQueryLength0());
1210  BOOST_REQUIRE_THROW(CPssmEngine pssm_engine(bad_pssm_data.get()), CPssmEngineException);
1211 }
1212 
1213 BOOST_AUTO_TEST_CASE(testRejectNullPssmInputData) {
1214  IPssmInputData* null_ptr = NULL;
1215  BOOST_REQUIRE_THROW(CPssmEngine pssm_engine(null_ptr), CPssmEngineException);
1216 }
1217 
1218 BOOST_AUTO_TEST_CASE(testRejectNullsReturnedByPssmInput) {
1219  unique_ptr<IPssmInputData> bad_pssm_data(new CNullPssmInput());
1220  BOOST_REQUIRE_THROW(CPssmEngine pssm_engine(bad_pssm_data.get()), CBlastException);
1221 }
1222 
/// Expects the CPssmEngine constructor to throw CBlastException for the mock
/// PSSM input created below.
/// NOTE(review): listing line 1225 (the mock class name passed to `new`) is
/// absent from this extract; by the test name it presumably reports an
/// unsupported scoring matrix -- confirm against the repository copy.
1223 BOOST_AUTO_TEST_CASE(testRejectUnsupportedMatrix) {
1224  unique_ptr<IPssmInputData> bad_pssm_data(new
 // The engine is constructed inside the macro because the constructor
 // itself is the statement expected to throw.
1226  BOOST_REQUIRE_THROW(CPssmEngine pssm_engine(bad_pssm_data.get()), CBlastException);
1227 }
1228 
1229 // Deliberately ask for an alignment data structure that too large to test
1230 // the error handling. Should not be run under valgrind
1231 BOOST_AUTO_TEST_CASE(testPsiAlignmentDataCreation_TooMuchMemory) {
1232  Uint4 big_num = ncbi::numeric_limits<int>::max()/sizeof(void*);
1233  const PSIMsaDimensions kDimensions = { big_num, big_num};
1234  PSIMsa* msa = PSIMsaNew(&kDimensions);
1235  BOOST_REQUIRE(msa == NULL);
1236 }
1237 
1238 
1239 BOOST_AUTO_TEST_CASE(testPsiLowerCaseMatrix) {
1240 
1241  SFreqRatios* freq_ratios = _PSIMatrixFrequencyRatiosNew("blosum62");
1242  BOOST_REQUIRE(freq_ratios != NULL);
1243  freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
1244  BOOST_REQUIRE(freq_ratios == NULL);
1245 
1246 }
1247 
1248 
1250 
1251 /*
1252 * ===========================================================================
1253 *
1254 * $Log: pssmcreate-cppunit.cpp,v $
1255 * Revision 1.86 2008/03/13 19:41:58 camacho
1256 * Bring up to date with current CScorematPssmConverter interface
1257 *
1258 * Revision 1.85 2007/12/07 17:19:17 camacho
1259 * Bring in sync with svn revision 115203
1260 *
1261 * Revision 1.84 2007/04/10 18:24:36 madden
1262 * Remove discontinuous seq-aligns
1263 *
1264 * Revision 1.83 2007/01/23 18:02:19 camacho
1265 * + new parameter to posPurgeMatches
1266 *
1267 * Revision 1.82 2006/11/17 17:58:01 camacho
1268 * Update to use new definition of CPsiBlastInputData::x_GetSubjectSequence
1269 *
1270 * Revision 1.81 2006/11/16 14:06:20 camacho
1271 * Add missing Deleter specialization
1272 *
1273 * Revision 1.80 2006/11/14 15:56:41 camacho
1274 * Bring up to date with most recent PSSM engine optimizations
1275 *
1276 * Revision 1.79 2006/08/31 22:04:52 camacho
1277 * Minor fix
1278 *
1279 * Revision 1.78 2006/07/05 15:24:15 camacho
1280 * Changes to support new value of BLASTAA_SIZE
1281 *
1282 * Revision 1.77 2006/06/05 13:34:05 madden
1283 * Changes to remove [GS]etMatrixPath and use callback instead
1284 *
1285 * Revision 1.76 2006/05/24 17:22:43 madden
1286 * remove call to FindMatrixPath
1287 *
1288 * Revision 1.75 2006/04/26 14:24:47 camacho
1289 * Fix compiler warning
1290 *
1291 * Revision 1.74 2006/02/21 22:10:15 camacho
1292 * Use CNcbiOstrstream and CNcbiOstrstreamToString
1293 *
1294 * Revision 1.73 2006/02/17 18:50:38 camacho
1295 * Replace ostringstream for CNcbiOstrstream for portability issues
1296 *
1297 * Revision 1.72 2006/01/30 17:30:34 camacho
1298 * Relax the maximum permissible difference when comparing doubles
1299 *
1300 * Revision 1.71 2005/11/28 20:46:04 camacho
1301 * Fixes to temporary BLAST object manager class to create CScopes
1302 *
1303 * Revision 1.70 2005/11/10 23:43:31 camacho
1304 * Use TestUtil::CTmpObjMgrBlastDbDataLoader
1305 *
1306 * Revision 1.69 2005/10/26 14:30:46 camacho
1307 * Remove redundant code, reuse private PSI-BLAST auxiliary functions
1308 *
1309 * Revision 1.68 2005/10/14 13:47:32 camacho
1310 * Fixes to pacify icc compiler
1311 *
1312 * Revision 1.67 2005/09/26 16:35:15 camacho
1313 * Use CRef<> to store CPssmEngine
1314 *
1315 * Revision 1.66 2005/09/26 14:41:44 camacho
1316 * Renamed blast_psi.hpp -> pssm_engine.hpp
1317 *
1318 * Revision 1.65 2005/09/23 18:59:11 camacho
1319 * Rollback accidental commit
1320 *
1321 * Revision 1.63 2005/08/26 17:14:06 camacho
1322 * Remove unneeded typedefs
1323 *
1324 * Revision 1.62 2005/08/24 14:46:48 camacho
1325 * Updated tests for PSSM engine
1326 *
1327 * Revision 1.61 2005/06/09 20:37:06 camacho
1328 * Use new private header blast_objmgr_priv.hpp
1329 *
1330 * Revision 1.60 2005/05/20 18:33:20 camacho
1331 * refactorings to use CAsn1PssmConverter
1332 *
1333 * Revision 1.59 2005/05/10 16:09:04 camacho
1334 * Changed *_ENCODING #defines to EBlastEncoding enumeration
1335 *
1336 * Revision 1.58 2005/05/04 13:28:38 camacho
1337 * Fix to previous commit
1338 *
1339 * Revision 1.57 2005/05/03 20:45:07 camacho
1340 * Added test for query aligned with gaps
1341 *
1342 * Revision 1.56 2005/04/29 14:44:53 bealer
1343 * - Fix for inverted test in DOUBLES_EQUAL_MSG (required for release mode).
1344 *
1345 * Revision 1.55 2005/04/27 20:08:40 dondosha
1346 * PHI-blast boolean argument has been removed from BlastSetup_ScoreBlkInit
1347 *
1348 * Revision 1.54 2005/04/22 13:32:13 camacho
1349 * Fix to previous commit
1350 *
1351 * Revision 1.53 2005/04/21 20:45:58 camacho
1352 * Added test for the case when the query sequence is aligned with internal gaps only on a given column
1353 *
1354 * Revision 1.52 2005/03/23 14:27:00 camacho
1355 * Fix compiler warnings
1356 *
1357 * Revision 1.51 2005/03/22 15:47:50 camacho
1358 * added tests for backwards compatibility with old PSSM engine
1359 *
1360 * Revision 1.50 2005/03/21 23:34:44 bealer
1361 * - Doubles/message macro.
1362 *
1363 * Revision 1.49 2005/03/04 17:20:45 bealer
1364 * - Command line option support.
1365 *
1366 * Revision 1.48 2005/03/03 17:45:58 camacho
1367 * fix to loading pssm
1368 *
1369 * Revision 1.47 2005/02/25 19:48:14 camacho
1370 * Added unit test for comparing new vs. old IMPALA scaling
1371 *
1372 * Revision 1.46 2005/02/22 22:51:20 camacho
1373 * + impala_scaling_factor, first cut
1374 *
1375 * Revision 1.45 2005/02/14 14:17:17 camacho
1376 * Changes to use SBlastScoreMatrix
1377 *
1378 * Revision 1.44 2005/02/10 15:43:28 dondosha
1379 * Small memory leak fix
1380 *
1381 * Revision 1.43 2005/01/26 17:52:13 camacho
1382 * Remove unused variables
1383 *
1384 * Revision 1.42 2005/01/22 16:57:01 camacho
1385 * cosmetic change
1386 *
1387 * Revision 1.41 2005/01/10 15:43:52 camacho
1388 * + data/seqp database to database loader
1389 *
1390 * Revision 1.40 2004/12/28 16:48:26 camacho
1391 * 1. Use typedefs to AutoPtr consistently
1392 * 2. Use SBlastSequence structure instead of std::pair as return value to
1393 * blast::GetSequence
1394 *
1395 * Revision 1.39 2004/12/22 16:26:56 camacho
1396 * Remove diagnostics output
1397 *
1398 * Revision 1.38 2004/12/13 22:37:56 camacho
1399 * Consolidated structure group customizations in option: nsg_compatibility_mode
1400 *
1401 * Revision 1.37 2004/12/09 15:24:10 dondosha
1402 * BlastSetup_GetScoreBlock renamed to BlastSetup_ScoreBlkInit
1403 *
1404 * Revision 1.36 2004/11/30 20:43:38 camacho
1405 * Replace call to GetLoaderNameFromArgs
1406 *
1407 * Revision 1.35 2004/11/29 20:18:03 camacho
1408 * Fix setUp/tearDown methods to avoid creating/deleting the Genbank data loader
1409 * as this spawns many maintenance threads and causes valgrind to fail.
1410 *
1411 * Revision 1.34 2004/11/24 15:16:58 camacho
1412 * + test for default PSIBLAST input data strategy
1413 *
1414 * Revision 1.33 2004/11/23 21:50:08 camacho
1415 * Removed local initialization of ideal Karlin-Altschul parameters
1416 *
1417 * Revision 1.32 2004/11/23 17:53:18 camacho
1418 * Return NULL rather than "" in null matrix test case
1419 *
1420 * Revision 1.31 2004/11/22 15:18:13 camacho
1421 * + tests & mock object for purge stage of PSSM creation
1422 *
1423 * Revision 1.30 2004/11/02 21:27:22 camacho
1424 * Fixes for recent changes in PSI-BLAST function names
1425 *
1426 * Revision 1.29 2004/10/18 14:51:49 camacho
1427 * Added argument to _PSIComputeSequenceWeights
1428 *
1429 * Revision 1.28 2004/10/13 20:49:22 camacho
1430 * + support for requesting diagnostics information and specifying underlying matrix
1431 *
1432 * Revision 1.27 2004/10/13 15:46:23 camacho
1433 * + tests for invalid PSSM data
1434 *
1435 * Revision 1.26 2004/10/13 01:43:54 camacho
1436 * + unit test for checking 0-length queries
1437 *
1438 * Revision 1.25 2004/10/12 21:27:49 camacho
1439 * + mock objects to simulate bad pssm input data
1440 *
1441 * Revision 1.24 2004/10/12 14:19:36 camacho
1442 * Update for scoremat.asn reorganization
1443 *
1444 * Revision 1.23 2004/08/31 16:10:07 camacho
1445 * Use CppUnit assertions for floating point values
1446 *
1447 * Revision 1.22 2004/08/05 19:20:27 camacho
1448 * Temporarily disable failing test
1449 *
1450 * Revision 1.21 2004/08/04 21:20:55 camacho
1451 * Change seq-align file
1452 *
1453 * Revision 1.20 2004/08/04 20:28:49 camacho
1454 * Updated to reflect recent changes in core PSSM engine structures
1455 *
1456 * Revision 1.19 2004/08/02 13:31:28 camacho
1457 * Renaming of PSSM engine structures
1458 *
1459 * Revision 1.18 2004/07/29 17:56:12 camacho
1460 * Updated to use new interfaces, needs more test data
1461 *
1462 * Revision 1.17 2004/07/22 16:37:59 camacho
1463 * Fixes for exchanging data loaders
1464 *
1465 * Revision 1.16 2004/07/22 13:58:59 camacho
1466 * Use the new C++ Object Manager interfaces
1467 *
1468 * Revision 1.15 2004/07/21 17:51:03 camacho
1469 * disable failing unit tests for right now
1470 *
1471 * Revision 1.14 2004/07/07 18:55:38 camacho
1472 * Add test for handling out-of-memory conditions
1473 *
1474 * Revision 1.13 2004/07/06 15:58:45 dondosha
1475 * Use EBlastProgramType enumeration type for program when calling C functions
1476 *
1477 * Revision 1.12 2004/07/02 18:02:54 camacho
1478 * Added more tests for purging matching sequences and sequence weights
1479 * computation.
1480 *
1481 * Revision 1.11 2004/06/22 16:46:19 camacho
1482 * Changed the blast_type_* definitions for the EBlastProgramType enumeration.
1483 *
1484 * Revision 1.10 2004/06/21 15:51:34 camacho
1485 * Added compute extents tests, fixed memory leaks
1486 *
1487 * Revision 1.9 2004/06/18 15:05:34 camacho
1488 * Added more comparison tests
1489 *
1490 * Revision 1.8 2004/06/16 15:23:48 camacho
1491 * Added posPurgeMatches unit tests
1492 *
1493 * Revision 1.7 2004/06/16 12:48:26 camacho
1494 * Fix compiler warnings
1495 *
1496 * Revision 1.6 2004/06/16 12:12:47 camacho
1497 * Remove extra comma in enumerated type
1498 *
1499 * Revision 1.5 2004/06/14 21:33:49 camacho
1500 * Refactored test code to use a pssm engine mock object
1501 *
1502 * Revision 1.4 2004/06/09 21:34:20 camacho
1503 * Minor changes
1504 *
1505 * Revision 1.3 2004/06/09 16:45:17 camacho
1506 * Fix for solaris build
1507 *
1508 * Revision 1.2 2004/06/09 16:17:29 camacho
1509 * Minor fixes
1510 *
1511 * Revision 1.1 2004/06/09 14:58:55 camacho
1512 * Initial revision
1513 *
1514 *
1515 * ===========================================================================
1516 */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares the CBl2Seq (BLAST 2 Sequences) class.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Declares the BLAST exception class.
Definitions which are dependent on the NCBI C++ Object Manager.
PSIBlastOptions * PSIBlastOptionsFree(PSIBlastOptions *psi_options)
Deallocate PSI BLAST options.
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
Port of posit.h structures and impalaScaling for implementing composition based statistics for PSI-BL...
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
Definition: blast_psi.c:513
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
Definition: blast_psi.c:462
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSI_SUCCESS
Successful operation.
Utilities initialize/setup BLAST.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Definition: blast_stat.h:121
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
AutoPtr –.
Definition: ncbimisc.hpp:401
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:58
Defines BLAST error codes (user errors included)
Wrapper class for BlastScoreBlk .
Definition: blast_aux.hpp:333
Mock object for the PSSM input data which returns NULLs for all its methods.
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
unsigned int GetQueryLength()
Get the query's length.
unsigned char * GetQuery()
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine Its results will be populated in t...
const PSIBlastOptions * GetOptions()
Obtain the options for the PSSM engine.
void Process()
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
Wrapper class for PSIBlastOptions .
Definition: blast_aux.hpp:330
This class is a concrete strategy for IPssmInputData, and it implements the traditional PSI-BLAST alg...
This class exists merely to call private methods in CPsiBlastInputData and CPssmEngine.
static unsigned int GetNumAlignedSequences(const CPsiBlastInputData &input)
Accesses CPsiBlastInputData private method.
static string x_ErrorCodeToString(int error_code)
Gets error strings from a CPssmEngine private method.
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Accesses CPssmEngine private method.
static void x_GetSubjectSequence(const objects::CDense_seg &ds, objects::CScope &scope, string &sequence_data)
Gets Subject sequence from a CPsiBlastInputData private method.
Exception class for the CPssmEngine class.
Definition: pssm_engine.hpp:63
Computes a PSSM as specified in PSI-BLAST.
Mock object for the PSSM input data which returns multiple sequence alignment data which has flanking...
const PSIBlastOptions * GetOptions()
Obtain the options for the PSSM engine.
void Process()
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
PSIDiagnosticsRequest m_diag_request
const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine Its results will be populated in t...
unsigned int GetQueryLength()
Get the query's length.
unsigned char * GetQuery()
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
Mock object for the PSSM input data which returns a query sequence with a gap in it.
Mock object for the PSSM input data which returns a query sequence with a gap in it.
unsigned int GetQueryLength()
Get the query's length.
Mock object for the PSSM input data which can be configured to have different combinations of aligned...
CPssmInputTestData(EAlignmentType type, PSIBlastOptions *opts=NULL)
pair< TSeqPos, TSeqPos > TAlignedSegment
void SetupHenikoffsPositionBasedSequenceWeights(void)
Uint1 FindNonIdenticalHighScoringResidue(Uint1 res, const SNCBIPackedScoreMatrix *score_matrix)
static const size_t kQueryLength
static const Uint1 kQuery[kQueryLength]
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
CRandom::
Definition: random_gen.hpp:66
static TIndex GetIndex(CSeq_data::E_Choice code_type, const string &code)
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
char data[12]
Definition: iconv.c:80
static const char * kQuery
TSeqPos length
Length of the buffer above (not necessarily sequence length!)
Definition: blast_setup.hpp:65
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
static CNcbiMatrix< int > * GetScores(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
TAutoUint1Ptr data
Sequence data.
Definition: blast_setup.hpp:64
virtual TSeqAlignVector Run()
Perform BLAST search Assuming N queries and M subjects, the structure of the returned vector is as fo...
Definition: bl2seq.cpp:173
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
void Reset(BlastScoreBlk *p=NULL)
Definition: blast_aux.hpp:333
AutoPtr< Uint1, CDeleter< Uint1 > > TAutoUint1Ptr
Declares TAutoUint1Ptr (for Uint1 arrays allocated with malloc/calloc)
Definition: blast_aux.hpp:98
SBlastSequence GetSequence(const objects::CSeq_loc &sl, EBlastEncoding encoding, objects::CScope *scope, objects::ENa_strand strand=objects::eNa_strand_plus, ESentinelType sentinel=eSentinels, std::string *warnings=NULL)
Retrieves a sequence using the object manager.
@ eBlastEncodingProtein
NCBIstdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2742
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
#define NcbiEmptyString
Definition: ncbistr.hpp:122
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
const TResFreqsPerPos & GetResFreqsPerPos(void) const
Get the ResFreqsPerPos member data.
const TPssm & GetPssm(void) const
Get the Pssm member data.
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetDescr(void) const
Descriptors: check if a value has been assigned to the Descr data member.
Definition: Bioseq_.hpp:303
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
SFreqRatios * _PSIMatrixFrequencyRatiosFree(SFreqRatios *freq_ratios)
Deallocate the frequency ratios structure.
SFreqRatios * _PSIMatrixFrequencyRatiosNew(const char *matrix_name)
Retrieve the matrix's frequency ratios.
const TYPE & Get(const CNamedParameterList *param)
char GetResidue(unsigned int res)
Returns character representation of a residue from ncbistdaa.
Magic spell ;-) needed for some weird compilers... very empiric.
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Utilities to develop and debug unit tests for BLAST.
#define U
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Declarations of auxiliary functions/classes for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
Defines interface for a sequence alignment processor that can populate a multiple alignment data stru...
BlastScoreBlk * InitializeBlastScoreBlk(const unsigned char *query, Uint4 query_size)
Utilities to develop and debug unit tests that deal with PSSM computation.
#define GAP_IN_ALIGNMENT
BOOST_AUTO_TEST_CASE(testFullPssmEngineRunWithDiagnosticsRequest)
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
TNCBIScore NCBISM_GetScore(const SNCBIPackedScoreMatrix *sm, int aa1, int aa2)
Look up an entry in a packed score matrix.
Definition: raw_scoremat.c:67
Int2 alphabet_size
size of alphabet.
Definition: blast_stat.h:181
Abstract base class to encapsulate the source(s) and pre-processing of PSSM input data as well as opt...
Definition: pssm_input.hpp:106
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Boolean nsg_compatibility_mode
Compatibility option for the NCBI's structure group (note nsg_ prefix, stands for NCBI's structure gr...
double inclusion_ethresh
Minimum evalue for inclusion in PSSM calculation.
Int4 pseudo_count
Pseudocount constant.
Boolean use_best_alignment
If set to TRUE, use the best alignment when multiple HSPs are found in a query-subject alignment (i....
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Boolean information_content
request information content
Definition: blast_psi.h:182
Boolean frequency_ratios
request frequency ratios
Definition: blast_psi.h:187
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Definition: blast_psi.h:185
Boolean gapless_column_weights
request gapless column weights
Definition: blast_psi.h:188
Boolean num_matching_seqs
request number of matching sequences
Definition: blast_psi.h:192
Boolean sigma
request sigma
Definition: blast_psi.h:190
Boolean residue_frequencies
request observed residue frequencies
Definition: blast_psi.h:183
Boolean interval_sizes
request interval sizes
Definition: blast_psi.h:191
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Definition: blast_psi.h:49
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
Structure to store sequence data and its length for use in the CORE of BLAST (it's a malloc'ed array ...
Definition: blast_setup.hpp:62
Stores the frequency ratios along with their bit scale factor.
Definition: type.c:6
Utility stuff for more convenient using of Boost.Test library.
static const string kTitle
CTraceGlyph inline method implementation.
static Uint4 letter(char c)
Modified on Fri Sep 20 14:57:03 2024 by modify_doxy.py rev. 669887