NCBI C++ ToolKit
pssmcreate_cdd_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: pssmcreate_cdd_unit_test.cpp 100942 2023-10-03 17:36:50Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  */
29 
30 /** @file pssmcreate_cdd_unit_test.cpp
31  * Unit test module for creation of PSSMs from multiple alignments of
32  conserved domains.
33  */
34 #include <ncbi_pch.hpp>
35 
36 #define BLAST_SECONDARY_SOURCE 1
37 
38 #include <corelib/test_boost.hpp>
39 
40 // Serial library includes
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 
44 // Object includes
50 #include <objects/seq/Bioseq.hpp>
52 
53 // ASN.1 definition for PSSM (scoremat)
59 
60 // BLAST includes
64 #include <algo/blast/api/rps_aux.hpp> // for CBlastRPSInfo
65 #include <blast_objmgr_priv.hpp>
66 #include <blast_psi_priv.h>
67 #include "psiblast_aux_priv.hpp" // for CScorematPssmConverter
68 
69 // Unit test auxiliary includes
70 #include "blast_test_util.hpp"
71 #include "pssm_test_util.hpp"
72 #include "test_objmgr.hpp"
73 
74 using namespace std;
75 using namespace ncbi;
76 using namespace ncbi::objects;
77 using namespace ncbi::blast;
78 
79 
80 /// Query id used for tests
81 static const string kQueryId = "gi|129295";
82 
83 /// Domain subject id used for tests (present in test CDD)
84 static const string kSubjectId = "gnl|CDD|29117";
85 
86 /// Class for testing methods of CCddInputData class.
87 /// The class creates instances of CCddInputData and accesses CCddInputData
88 /// private attributes and methods.
90 {
91 public:
92 
93  /// Type of test multiple alignment of CDs
94  enum EType {
95  /// Duplicate CDD hit that overlaps the first hit
97  /// Duplicate CDD hit that
98  /// does not intersect query range
100  };
101 
102  /// Create CCddInputData test object with given CD alignment
103  /// @param type Alignment type
104  /// @return Pssm strategy
105  static CRef<CCddInputData> CreatePssmInput(EType type);
106 
107  /// Get number of CDs in the internal MSA
108  static size_t GetMsaSize(const CCddInputData& input);
109 
110  /// Get alphabet size used by CCddInputData
111  static int GetAlphabetSize(void);
112 
113  /// Get the scale factor for residue frequencies and independent
114  /// observations stored in CDD
115  static int GetProfilesDataScale(void);
116 
117  /// Find index of a CD in MSA by Seq_id
118  /// @param input PSSM strategy object
119  /// @param subject Seq_id of the CD
120  /// @return Index of subject in MSA or -1 if subject not found in MSA
121  static int GetSubjectMsaIndex(const CCddInputData& input,
122  const CSeq_id& subject);
123 
124  /// Get number of CDD hits in CCddInputData object
125  static size_t GetNumHits(const CCddInputData& input);
126 
127  /// Call CCddInputData method that removes multiple CD hits
128  static void RemoveMultipleCdHits(CCddInputData& input);
129 
130  /// Create a dummy alignment with one segment
131  /// @param query_id Query id
132  /// @param subject_id Subject id
133  /// @param qfrom Query start position
134  /// @param sfrom Subject start position
135  /// @param len Alignment length
136  /// @return Seq_align object
137  static CRef<CSeq_align> x_CreateAlignment(CRef<CSeq_id> query_id,
138  CRef<CSeq_id> subject_id,
139  int qfrom, int sfrom, int len);
140 
141  /// Create two alignments with the same CD that overlap
142  /// @param query_id Query id used in both alignments
143  static CRef<CSeq_align_set> x_CreateDuplicateOverlappingHit(
144  CRef<CSeq_id> query_id);
145 
146 
147  /// Create two alignments with the same CD that do not overlap
148  /// @param query_id Query id used in both alignments
149  static CRef<CSeq_align_set> x_CreateDuplicateNonOverlappingHit(
150  CRef<CSeq_id> query_id);
151 };
150 
151 // A series of classes for testing PSSM computation on the core level
152 
153 /// Simple PSSM computation strategy with no CDs
155 {
156 public:
157 
158  CPssmInputWithNoCDs(void);
159 
160  virtual ~CPssmInputWithNoCDs() {}
161 
162  virtual PSICdMsa* GetData(void) {return &m_CdMsa;} // multiple alignment of CDs owned by this object
163 
164  virtual const PSIBlastOptions* GetOptions(void) {return m_Options;}
165 
166  virtual void Process(void) {} // intentionally a no-op
167 
168  virtual unsigned char* GetQuery(void) {return &m_Query[0];} // pointer to the first query residue
169 
170  virtual unsigned int GetQueryLength(void) {return kQueryLength;}
171 
172 
173 protected:
174 
178  vector<unsigned char> m_Query; // storage behind GetQuery()
179  vector<PSICdMsaCell*> m_Msa;
180 
181  static const int kQueryLength = 6; // value reported by GetQueryLength()
182 };
183 
184 
185 /// Simple PSSM computation strategy with one CD.
/// Adds storage members m_Freqs and m_MsaData; presumably these back the
/// frequency and cell data of the single CD -- TODO confirm against constructor
187 {
188 public:
190 
191  virtual ~CPssmInputWithSingleCD();
192 
193 protected:
194  vector<double> m_Freqs;
195  vector<PSICdMsaCellData> m_MsaData;
196 };
197 
198 /// Simple PSSM computation strategy with two CDs
/// (used by TestCreatePssmFromMultipleCds, which reads rows 0 and 1 of the MSA)
200 {
201 public:
202  CPssmInputWithTwoCDs(void);
203 
205 };
206 
207 /// PSSM computation strategy with gaps in query
209 {
210 public:
212  _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
213  _ASSERT(m_CdMsa.msa[0][0].data);
214  _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
215 
216  m_CdMsa.query[1] = AMINOACID_TO_NCBISTDAA[(int)'-']; // overwrite query position 1 with the gap residue
217  }
218 };
219 
220 /// PSSM computation strategy with domains with negative residue frequencies
222 {
223 public:
225  _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
226  _ASSERT(m_CdMsa.msa[0][0].data);
227  _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
228 
229  const Uint1 kResidueA = AMINOACID_TO_NCBISTDAA[(int)'A'];
230  const Uint1 kResidueC = AMINOACID_TO_NCBISTDAA[(int)'C'];
231 
232  m_CdMsa.msa[0][0].data->wfreqs[kResidueA] = -0.001; // force a negative frequency for residue A in cell [0][0]
233  m_CdMsa.msa[0][0].data->wfreqs[kResidueC] += 0.001; // raise residue C by 0.001 in the same cell
234  }
235 };
236 
237 /// PSSM computation strategy with domains with frequencies that do not sum
238 /// to 1
240 {
241 public:
243  _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
244  _ASSERT(m_CdMsa.msa[0][0].data);
245  _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
246 
247  const Uint1 kResidueA = AMINOACID_TO_NCBISTDAA[(int)'A'];
248 
249  m_CdMsa.msa[0][0].data->wfreqs[kResidueA] += 0.01; // perturb residue A in cell [0][0] without compensating elsewhere
250  }
251 };
252 
253 /// PSSM computation strategy with domains with zero observations
255 {
256 public:
258  _ASSERT(m_CdMsa.dimensions->num_seqs > 0);
259  _ASSERT(m_CdMsa.msa[0][0].data);
260  _ASSERT(m_CdMsa.msa[0][0].data->wfreqs);
261 
262  m_CdMsa.msa[0][0].data->iobsr = 0.0; // zero independent observations in cell [0][0]
263  }
264 };
265 
266 
267 // Test computing frequency ratios and PSSM scores.
// Forward declaration; the tests below call it after
// _PSIComputeFrequenciesFromCDs() has filled seq_weights.
268 static void s_TestCreatePssmFromFreqs(const PSICdMsa* cd_msa,
269  CBlastScoreBlk& sbp,
270  const PSIBlastOptions* opts,
271  AutoPtr<_PSISequenceWeights>& seq_weights);
272 
273 
274 
275 BOOST_AUTO_TEST_SUITE(pssmcreate_cdd)
276 
277 
278 // Tests for code in algo/blast/core for computing PSSM from conserved domains
279 
280 // Tests for pre and post conditions for computing residue frequencies,
281 // frequency ratios and PSSM scores
282 BOOST_AUTO_TEST_CASE(TestCreatePssmFromSingleCd)
283 {
284  // create pssm input with a single CD in the alignment
285  CPssmInputWithSingleCD pssm_input;
286 
287  blast::TAutoUint1Ptr query_with_sentinels
289  pssm_input.GetQueryLength()));
290 
291 
292  PSICdMsa* cd_msa = pssm_input.GetData();
293  CBlastScoreBlk sbp;
294  sbp.Reset(InitializeBlastScoreBlk(query_with_sentinels.get(),
295  pssm_input.GetQueryLength()));
296 
297  const PSIBlastOptions* opts = pssm_input.GetOptions();
298 
300  cd_msa->dimensions, sbp));
301 
302  // compute and verify residue frequencies
303 
304  // verify that the function returns success
305  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, sbp.Get(), opts,
306  seq_weights.get()) == 0);
307 
308  // verify that for a pssm input with a single CD hit residue frequencies
309  // and number of independent observations are the same as in domain model
310  for (int i=0;i < (int)cd_msa->dimensions->query_length;i++) {
311  if (cd_msa->msa[0][i].is_aligned) {
312  BOOST_REQUIRE_CLOSE(seq_weights->independent_observations[i],
313  cd_msa->msa[0][i].data->iobsr,
314  1e-5);
315 
316  for (int j=0;j < (int)sbp->alphabet_size;j++) {
317  BOOST_REQUIRE_CLOSE(seq_weights->match_weights[i][j],
318  cd_msa->msa[0][i].data->wfreqs[j],
319  1e-5);
320  }
321  }
322  }
323 
324  s_TestCreatePssmFromFreqs(cd_msa, sbp, opts, seq_weights); // continue with frequency ratios and PSSM scores
325 }
326 
327 
328 BOOST_AUTO_TEST_CASE(TestCreatePssmFromMultipleCds)
329 {
330  // create pssm input with two CDs in the alignment
331  CPssmInputWithTwoCDs pssm_input;
332 
333  blast::TAutoUint1Ptr query_with_sentinels
335  pssm_input.GetQueryLength()));
336 
337 
338  PSICdMsa* cd_msa = pssm_input.GetData();
339  CBlastScoreBlk sbp;
340  sbp.Reset(InitializeBlastScoreBlk(query_with_sentinels.get(),
341  pssm_input.GetQueryLength()));
342 
343  const PSIBlastOptions* opts = pssm_input.GetOptions();
344 
346  cd_msa->dimensions, sbp));
347 
348  // verify that computing frequencies finishes with success
349  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, sbp.Get(), opts,
350  seq_weights.get()) == 0);
351 
352  for (int i=0;i < (int)cd_msa->dimensions->query_length;i++) {
353 
354  // verify that both CDs are aligned to query in each column
355  BOOST_REQUIRE(cd_msa->msa[0][i].is_aligned
356  && cd_msa->msa[1][i].is_aligned);
357 
358  // verify that number of observations is the same as the sum of
359  // observations from each CD in the alignment
360  BOOST_REQUIRE_CLOSE(seq_weights->independent_observations[i],
361  cd_msa->msa[0][i].data->iobsr
362  + cd_msa->msa[1][i].data->iobsr,
363  1e-5);
364 
365  // verify that residue frequencies sum to 1
366  double sum = 0.0;
367  for (int j=0;j < (int)sbp->alphabet_size;j++) {
368  sum += seq_weights->match_weights[i][j];
369  }
370  BOOST_REQUIRE_CLOSE(sum, 1.0, 1e-5);
371  }
372 
373  s_TestCreatePssmFromFreqs(cd_msa, sbp, opts, seq_weights); // continue with frequency ratios and PSSM scores
374 }
375 
376 BOOST_AUTO_TEST_CASE(TestCreatePssmFromNoCds)
377 {
378  // create pssm input with no CDs in the alignment
379  CPssmInputWithNoCDs pssm_input;
380 
381  blast::TAutoUint1Ptr query_with_sentinels
383  pssm_input.GetQueryLength()));
384 
385 
386  PSICdMsa* cd_msa = pssm_input.GetData();
387  CBlastScoreBlk sbp;
388  sbp.Reset(InitializeBlastScoreBlk(query_with_sentinels.get(),
389  pssm_input.GetQueryLength()));
390 
391  const PSIBlastOptions* opts = pssm_input.GetOptions();
392 
394  cd_msa->dimensions, sbp));
395 
396  // compute residue frequencies
397 
398  // verify that the function returns success
399  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, sbp.Get(), opts,
400  seq_weights.get()) == 0);
401 
402  // verify computation of frequency ratios and pssm scores
403  s_TestCreatePssmFromFreqs(cd_msa, sbp, opts, seq_weights);
404 }
405 
406 // Verify that the PSSM engine throws CBlastException for a CdMsa with gaps in query
407 BOOST_AUTO_TEST_CASE(TestRejectGapsInQuery)
408 {
409  CPssmInputWithGapsInQuery pssm_input;
410 
411  CPssmEngine pssm_engine(&pssm_input);
412  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
413 }
414 
415 
// Verify that the PSSM engine throws CBlastException for a domain with a
// negative residue frequency
416 BOOST_AUTO_TEST_CASE(TestRejectDomainsWithNegativeFreqs)
417 {
418  CPssmInputWithNegativeFreqs pssm_input;
419 
420  CPssmEngine pssm_engine(&pssm_input);
421  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
422 }
423 
// Verify that the PSSM engine throws CBlastException for the input named by
// this test (domain frequencies that are not normalized)
424 BOOST_AUTO_TEST_CASE(TestRejectDomainsWithUnnormalizedFreqs)
425 {
427 
428  CPssmEngine pssm_engine(&pssm_input);
429  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
430 }
431 
// Verify that the PSSM engine throws CBlastException for the input named by
// this test (domain with zero observations)
432 BOOST_AUTO_TEST_CASE(TestRejectDomainsWithZeroObservations)
433 {
435 
436  CPssmEngine pssm_engine(&pssm_input);
437  BOOST_REQUIRE_THROW(pssm_engine.Run(), CBlastException);
438 }
439 
440 
// Verify that the core frequency and frequency-ratio functions return a
// non-zero (error) code when any single argument is missing or invalid
441 BOOST_AUTO_TEST_CASE(TestRejectNullInput)
442 {
443  CPssmInputWithSingleCD pssm_input;
444 
445  blast::TAutoUint1Ptr query_with_sentinels
447  pssm_input.GetQueryLength()));
448 
449 
450  PSICdMsa* cd_msa = pssm_input.GetData();
451  CBlastScoreBlk sbp;
452  sbp.Reset(InitializeBlastScoreBlk(query_with_sentinels.get(),
453  pssm_input.GetQueryLength()));
454 
455  // use the options supplied by the pssm input
456  const PSIBlastOptions* opts = pssm_input.GetOptions();
457 
459  cd_msa->dimensions, sbp));
460 
461  // verify that an error code is returned for each missing argument
462  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(NULL, sbp.Get(), opts,
463  seq_weights.get()));
464 
465  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, NULL, opts,
466  seq_weights.get()));
467 
468  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, sbp.Get(), NULL,
469  seq_weights.get()));
470 
471  BOOST_REQUIRE(_PSIComputeFrequenciesFromCDs(cd_msa, sbp.Get(), opts, NULL));
472 
473  Int4 pseudo_count = 0;
475  cd_msa->dimensions->query_length,
476  (Uint4)sbp->alphabet_size));
477 
478  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(NULL, seq_weights.get(),
479  sbp.Get(), pseudo_count,
480  internal_pssm.get()));
481 
482  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(cd_msa, NULL, sbp.Get(),
483  pseudo_count,
484  internal_pssm.get()));
485 
486  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(cd_msa, seq_weights.get(), NULL,
487  pseudo_count,
488  internal_pssm.get()));
489 
490  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(cd_msa, seq_weights.get(),
491  sbp.Get(), -1, // negative pseudo count must be rejected
492  internal_pssm.get()));
493 
494  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(cd_msa, seq_weights.get(),
495  sbp.Get(), pseudo_count, NULL));
496 }
497 
498 
499 //---------------------------------------------------------------------
500 
501 
502 
503 // Tests for CCddInputData class -- strategy for computing PSSM from CDD hits
504 
505 // Verify that CDD search results are correctly converted to multiple alignment
506 // of CDs
507 BOOST_AUTO_TEST_CASE(TestConvertSeqalignToCdMsa)
508 {
509  const string seqalign("data/cdd-129295.asn");
510  const string rpsdb("data/deltatest");
511 
512  /*** Setup code ***/
513  CRef<CSeq_id> qid(new CSeq_id("gi|129295"));
514  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(*qid));
515 
516  // read alignments
517  unique_ptr<CObjectIStream> in
519 
521  *in >> *sas;
522  BOOST_REQUIRE(sas->Get().size() != 0);
523 
524  CPSIBlastOptions opts;
525  PSIBlastOptionsNew(&opts);
526 
527  // retrieve the query sequence, but skip the sentinel bytes
529  q->scope));
530 
531  try {
532  // create pssm engine strategy
533  CRef<CCddInputData> pssm_input(new CCddInputData(seq.data.get() + 1,
534  seq.length - 2,
535  sas,
536  *opts,
537  rpsdb));
538 
539  pssm_input->Process();
540 
541  // open CDD database for checking residue frequencies and
542  // independent observations
543  CBlastRPSInfo profile_data(rpsdb, CBlastRPSInfo::fDeltaBlast);
544 
545  // verify that CDD was opened properly
546  BOOST_REQUIRE(profile_data()->freq_header);
547  BOOST_REQUIRE(profile_data()->obsr_header);
548  BOOST_REQUIRE_EQUAL(profile_data()->freq_header->num_profiles,
549  profile_data()->obsr_header->num_profiles);
550 
551  // get residue freqs from CDD
552  BlastRPSProfileHeader* freq_header = profile_data()->freq_header;
553  int kNumDomains = freq_header->num_profiles;
554 
555  const Int4* freq_offsets = freq_header->start_offsets;
556  const CCddInputData::TFreqs* freq_start =
557  (CCddInputData::TFreqs*)(freq_header->start_offsets
558  + kNumDomains + 1); // data begin after the kNumDomains + 1 offsets
559 
560  // get independent observations from CDD
561  BlastRPSProfileHeader* obsr_header = profile_data()->obsr_header;
562  const Int4* obsr_offsets = obsr_header->start_offsets;
563  const CCddInputData::TObsr* obsr_start =
564  (CCddInputData::TObsr*)obsr_header->start_offsets // NOTE(review): cast binds before the addition here, unlike freq_start above;
565  + kNumDomains + 1; // same address only if sizeof(TObsr) == sizeof(Int4) -- TODO confirm
566 
567  CSeqDB seqdb(rpsdb, CSeqDB::eProtein);
568 
569 
570  /*** End Setup code ***/
571 
572 
573  // Walk through the alignment segments and ensure that PSICdMsa
574  // is filled properly
575 
576  // TO DO: Make sure that each subject appears once in the
577  // seq-align-set
578 
579  const PSICdMsa* cd_msa = pssm_input->GetData();
580 
581  // verify query length in msa
582  BOOST_REQUIRE_EQUAL(cd_msa->dimensions->query_length,
583  seq.length-2);
584 
585  // verify number of subjects
586  BOOST_REQUIRE_EQUAL(cd_msa->dimensions->num_seqs,
587  sas->Get().size());
588 
589  // verify that msa size is the same as provided by the
590  // cd_msa->dimensions structure
591  BOOST_REQUIRE_EQUAL(cd_msa->dimensions->query_length
592  * cd_msa->dimensions->num_seqs,
593  CPssmCddInputTest::GetMsaSize(*pssm_input));
594 
595  // verify query sequence in msa
596  for (int i=0;i < (int)seq.length-2;i++) {
597  BOOST_REQUIRE_EQUAL((Uint1)cd_msa->query[i],
598  seq.data.get()[i + 1]); // + 1 skips the leading sentinel byte
599  }
600 
603 
604 
605  ITERATE (CSeq_align_set::Tdata, hsp, sas->Get()) {
606  const CDense_seg& ds = (*hsp)->GetSegs().GetDenseg();
607  BOOST_REQUIRE_EQUAL(ds.GetDim(), 2);
608  const CSeq_id& subject = ds.GetSeq_id(1);
609  const vector<TSignedSeqPos>& starts = ds.GetStarts();
610  const vector<TSeqPos>& lengths = ds.GetLens();
611 
612  // get subject domain database ordinal id
613  int db_oid;
614  seqdb.SeqidToOid(subject, db_oid);
615  BOOST_REQUIRE(db_oid >= 0 && db_oid < kNumDomains);
616 
617  // get subject frequency data from CDD
618  const CCddInputData::TFreqs* freqs =
619  freq_start + freq_offsets[db_oid] * kAlphabetSize;
620 
621  // get subject observations data from CDD
622  const CCddInputData::TObsr* obsr_c =
623  obsr_start + obsr_offsets[db_oid];
624 
625  // decompress independent observations:
// the data are read as (value, repeat count) pairs
626  int obsr_size =
627  obsr_offsets[db_oid + 1] - obsr_offsets[db_oid];
628  vector<CCddInputData::TObsr> obsr;
629  for (int i=0;i < obsr_size;i+=2) {
630  CCddInputData::TObsr val = obsr_c[i];
631  Int4 num = (Int4)obsr_c[i + 1];
632 
633  for (int j=0;j < num;j++) {
634  obsr.push_back(val);
635  }
636  }
637 
638  // get subject index in CdMSA
640  *pssm_input,
641  subject);
642 
643  // verify that subject index is sane
644  BOOST_REQUIRE(msa_index >= 0
645  && msa_index < (int)sas->Get().size());
646 
647  // walk through alignment segments
648  int k = 0; // first query position not yet covered by a checked segment
649  const int kGap = -1;
650  for (int i=0;i < ds.GetNumseg(); i++) {
651  TSignedSeqPos q_index = starts[i*ds.GetDim()];
652  TSignedSeqPos s_index = starts[i*ds.GetDim()+1];
653 
654  // verify that segments not present in denseg
655  // are marked as not aligned in MSA
656  while (k < q_index) {
657  BOOST_REQUIRE_EQUAL(cd_msa->msa[msa_index][k].is_aligned,
658  (Uint1)false);
659  k++;
660  }
661 
662  if (s_index == kGap) {
663 
664  // verify that deletions in subject are marked as
665  // not aligned in MSA
666  for (TSeqPos pos = 0; pos < lengths[i]; pos++) {
667 
668  BOOST_REQUIRE_EQUAL(
669  cd_msa->msa[msa_index][q_index + pos].is_aligned,
670  (Uint1)false);
671  }
672  } else if (q_index == kGap) {
673  s_index += lengths[i]; // NOTE(review): no visible effect; s_index is re-read from starts[] each iteration
674  continue; // gap in query: nothing to check in MSA, k is left unchanged
675  } else {
676  for (TSeqPos pos = 0; pos < lengths[i]; pos++) {
677 
678  // verify that aligned segments in denseg are
679  // marked as aligned in MSA
680  BOOST_REQUIRE_EQUAL(
681  cd_msa->msa[msa_index][q_index + pos].is_aligned,
682  (Uint1)true);
683 
684  // verify profile data in msa
686  cd_msa->msa[msa_index][q_index + pos].data;
687 
688  BOOST_REQUIRE(data);
689 
690  // verify that number of independent observations
691  // is correct;
692  // we expect a small difference due to converting
693  // real numbers to integers
694  BOOST_REQUIRE(abs((Int4)(data->iobsr * kScale)
695  - (Int4)obsr[s_index + pos]) < 2);
696 
697  // verify that residue frequencies are correct
698  for (int j=0;j < kAlphabetSize;j++) {
699 
700  // residue frequencies in MSA may have slightly
701  // different values than in the database,
702  // so we are only checking if frequency
703  // is/is not equal to zero
704  BOOST_REQUIRE_EQUAL(
705  (int)(data->wfreqs[j] * kScale)== 0,
706  (int)freqs[(s_index + pos)
707  * kAlphabetSize + j] == 0);
708 
709  }
710  }
711  }
712  k = q_index + lengths[i]; // advance past this segment on the query
713 
714  }
715 
716  }
717 
718  } catch (const exception& e) {
719  cerr << e.what() << endl;
720  BOOST_REQUIRE(false); // any exception fails the test
721  } catch (...) {
722  cerr << "Unknown exception" << endl;
723  BOOST_REQUIRE(false);
724  }
725 }
726 
727 
728 // Verify that an overlapping duplicate CD is removed from alignment
// and that a non-overlapping duplicate CD is kept
729 BOOST_AUTO_TEST_CASE(TestDuplicateCdHits)
730 {
731  // test overlapping duplicate hit
734 
735  // verify that initially the pssm input object has 2 hits
736  int pre_num_hits = CPssmCddInputTest::GetNumHits(*pssm_input);
737  BOOST_REQUIRE_EQUAL(pre_num_hits, 2);
738 
739  // invoke removing duplicate hits
741 
742  // verify that one hit was removed
743  int post_num_hits = CPssmCddInputTest::GetNumHits(*pssm_input);
744  BOOST_REQUIRE_EQUAL(post_num_hits, 1);
745 
746 
747  // test non-overlapping duplicate hit
750 
751  // verify that initially the pssm input object has 2 hits
752  pre_num_hits = CPssmCddInputTest::GetNumHits(*pssm_input);
753  BOOST_REQUIRE_EQUAL(pre_num_hits, 2);
754 
755  // invoke removing duplicate hits
757 
758  // verify that no hit was removed
759  post_num_hits = CPssmCddInputTest::GetNumHits(*pssm_input);
760  BOOST_REQUIRE_EQUAL(post_num_hits, 2);
761 }
762 
763 
764 
765 //---------------------------------------------------
766 
767 
768 // Tests creating PSSM from CD alignment using CCddInputData as strategy
769 // Mostly verify resulting PSSM
770 
// Run the full PSSM engine on CDD hits with a diagnostics request and verify
// that the requested data and the PSSM scores come back with expected sizes
771 BOOST_AUTO_TEST_CASE(TestFullPssmEngineRunWithDiagnosticsRequest) {
772 
773  const string seqalign("data/cdd-129295.asn");
774  const string rpsdb("data/deltatest");
775 
776  unique_ptr<CObjectIStream> in
778 
780  *in >> *sas;
781 
782  CRef<CSeq_id> qid(new CSeq_id("gi|129295"));
783 
784  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(*qid));
785  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
786 
787  CPSIBlastOptions opts;
788  PSIBlastOptionsNew(&opts);
789 
790  PSIDiagnosticsRequest request;
791  memset((void*) &request, 0, sizeof(request)); // start with every diagnostic switched off
792  request.information_content = true;
793  request.weighted_residue_frequencies = true;
794  request.frequency_ratios = true;
795  request.independent_observations = true;
796 
797  const string kTitle("Test defline");
798 
799  CRef<IPssmInputCdd> pssm_strategy(new CCddInputData(
800  seq.data.get() + 1, // skip the leading sentinel byte
801  seq.length - 2, // length without the two sentinel bytes
802  sas,
803  *opts,
804  rpsdb,
805  "BLOSUM80",
806  11,
807  1,
808  &request,
809  kTitle));
810 
811  CRef<CPssmEngine> pssm_engine(new CPssmEngine(pssm_strategy));
812  CRef<CPssmWithParameters> pssm = pssm_engine->Run();
813 
814  // verify query length
815  CRef<CBioseq> bioseq = pssm_strategy->GetQueryForPssm();
816  BOOST_REQUIRE_EQUAL(bioseq->GetLength(), seq.length-2);
817 
818  // TO DO: verify query sequence
819 
// verify that the query title equals the title passed to the strategy
820  string query_descr;
821  if (bioseq->IsSetDescr()) {
822  const CBioseq::TDescr::Tdata& data = bioseq->GetDescr().Get();
824  if((*iter)->IsTitle()) {
825  query_descr += (*iter)->GetTitle();
826  }
827  }
828  }
829  BOOST_REQUIRE_EQUAL(query_descr, kTitle);
830 
831 
832  const size_t kNumElements =
833  pssm_strategy->GetQueryLength() * BLASTAA_SIZE;
834 
835 
836  // verify that weighted residue frequencies came back
839  BOOST_REQUIRE_EQUAL(kNumElements, wres_freqs.size());
840 
841  // verify that frequency ratios came back
842  const CPssmIntermediateData::TFreqRatios& freq_ratios =
844  BOOST_REQUIRE_EQUAL(kNumElements, freq_ratios.size());
845 
846  // verify that numbers of independent observations came back
849  BOOST_REQUIRE_EQUAL(seq.length-2, obsr.size());
850 
851  // verify that pssm scores came back
852  const CPssmFinalData::TScores& scores =
853  pssm->GetPssm().GetFinalData().GetScores();
854  BOOST_REQUIRE_EQUAL(kNumElements, scores.size());
855 
856 
857  // TO DO: What if an unsupported diagnostic is requested?
858 
859  // currently unsupported diagnostics:
860  // residue_frequencies
861  // gapless_columns_weights
862  // sigma
863  // interval_sizes
864  // num_matching_seqs
865 }
866 
867 
868 // Verify that PSSM scores corresponding to gaps in a subject are similar to
869 // scores in the standard scoring matrix (BLOSUM62)
870 BOOST_AUTO_TEST_CASE(TestInternalGapsInSubject) {
871 
872  // create a fake Seq-align-set with valid query and subject
873  CRef<CSeq_id> query_id(new CSeq_id("gi|129295"));
874 
875  // subject domain must be present in test database
876  CRef<CSeq_id> subject_id(new CSeq_id("gnl|CDD|29117"));
877 
878  string rpsdb = "data/deltatest";
879  const string kMatrix = "BLOSUM62";
880 
881  // location and length of the gap
882  const int kGapStart = 2;
883  const int kGapLen = 160;
884 
885  CRef<CSeq_align> seq_align(new CSeq_align());
886  seq_align->SetDim(2);
887 
888  // create a fake alignment with internal gap in the subject:
889  // Query: 1 QQQ...QQQQQ
890  // Sbjct: 1 S--...---SS
891  CDense_seg& denseg = seq_align->SetSegs().SetDenseg();
892  denseg.SetDim(2);
893  denseg.SetNumseg(3);
894  CDense_seg::TIds& ids = denseg.SetIds();
895  ids.push_back(query_id);
896  ids.push_back(subject_id);
897  CDense_seg::TStarts& starts = denseg.SetStarts();
898  CDense_seg::TLens& lens = denseg.SetLens();
// starts are pushed as (query, subject) pairs, one pair per segment
899  starts.push_back(1);
900  starts.push_back(1);
901  starts.push_back(kGapStart);
902  starts.push_back(-1); // -1 marks the gap in the subject row
903  starts.push_back(kGapStart + kGapLen + 1);
904  starts.push_back(kGapStart + kGapLen + 1);
905  lens.push_back(1);
906  lens.push_back(kGapLen);
907  lens.push_back(2);
908 
909  // make sure that denseg is valid
910  denseg.Validate(true);
911 
912  seq_align->SetNamedScore(CSeq_align::eScore_EValue, 0.001);
913 
914  CRef<CSeq_align_set> seq_align_set(new CSeq_align_set());
915  seq_align_set->Set().push_back(seq_align);
916 
917  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(*query_id));
918  CRef<CScope> scope = q->scope;
919 
920  // create PSSM engine strategy
921  CPSIBlastOptions opts;
922  PSIBlastOptionsNew(&opts);
923 
924  // retrieve the query sequence, but skip the sentinel bytes
925  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
926 
927  CRef<IPssmInputCdd> pssm_input(new CCddInputData(seq.data.get() + 1,
928  seq.length - 2,
929  seq_align_set, *opts,
930  rpsdb, kMatrix));
931 
932  // compute PSSM
933  CPssmEngine pssm_engine(pssm_input);
934  CRef<CPssmWithParameters> pssm = pssm_engine.Run();
935 
936  unique_ptr< CNcbiMatrix<int> > pssm_scores(
938 
939  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
940  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
941  stringstream ss;
942  BOOST_REQUIRE_EQUAL((size_t)pssm->GetPssm().GetNumColumns(),
943  (size_t)pssm_scores->GetCols());
944  BOOST_REQUIRE_EQUAL((size_t)pssm->GetPssm().GetNumRows(),
945  (size_t)pssm_scores->GetRows());
946 
947  BOOST_REQUIRE(kGapStart + kGapLen < pssm->GetPssm().GetNumColumns());
948 
949  // Residues U, *, O, and J are not scored in the standard matrices
950  const int kResiduesUOJstar = 24;
951 
952  // Verify that columns corresponding to gaps have similar PSSM scores as
953  // in BLOSUM62 columns corresponding to query residues.
954  // The scores may differ by 1.
955  for (int i=kGapStart; i < kGapStart + kGapLen; i++) {
956  for (int j = 0; j < kResiduesUOJstar; j++) {
957 
958  // gaps get value of BLAST_SCORE_MIN
959  if (j == kGapResidue) {
960  ss.str("");
961  ss << "Position " << i << " residue "
962  << TestUtil::GetResidue(j) << " differ on PSSM";
963  BOOST_REQUIRE_MESSAGE(BLAST_SCORE_MIN == (*pssm_scores)(j, i),
964  ss.str());
965 
966  } else {
967  // get score from standard scoring matrix
968  int bl_score =
969  (int)NCBISM_GetScore(score_matrix,
970  pssm_input->GetQuery()[i], j);
971 
972  ss.str("");
973  ss << "Position " << i << " residue "
974  << TestUtil::GetResidue(j) << " differ on PSSM: "
975  << "expected=" << NStr::IntToString(bl_score)
976  << " actual=" << NStr::IntToString((*pssm_scores)(j, i));
977 
// PSSM score must be within +/-1 of the matrix score
978  BOOST_REQUIRE_MESSAGE (bl_score - (*pssm_scores)(j, i) <= 1
979  && bl_score - (*pssm_scores)(j, i) >= -1,
980  ss.str());
981  }
982  }
983  }
984 }
985 
986 // Verify that PSSM can be computed for PSSM engine strategy object with
987 // no CDD hits. PSSM scores are in that case similar to standard scoring matrix
988 BOOST_AUTO_TEST_CASE(TestNoDomainHits)
989 {
990  string rpsdb = "data/deltatest";
991  const string kMatrix = "BLOSUM62";
992 
993  CRef<CSeq_id> query_id(new CSeq_id("gi|129295"));
994 
995  // Create empty Seq-align-set
996  CRef<CSeq_align_set> seq_align_set(new CSeq_align_set());
997 
998  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(*query_id));
999  CRef<CScope> scope = q->scope;
1000 
1001  // create PSSM engine strategy
1002  CPSIBlastOptions opts;
1003  PSIBlastOptionsNew(&opts);
1004 
1005  // retrieve the query sequence, but skip the sentinel bytes
1006  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
1007 
1008  CRef<IPssmInputCdd> pssm_input(new CCddInputData(seq.data.get() + 1,
1009  seq.length - 2,
1010  seq_align_set, *opts,
1011  rpsdb, kMatrix));
1012 
1013  // compute PSSM
1014  CPssmEngine pssm_engine(pssm_input);
1015  CRef<CPssmWithParameters> pssm = pssm_engine.Run();
1016 
1017  unique_ptr< CNcbiMatrix<int> > pssm_scores(
1019 
1020  // Get BLOSUM62 scoring matrix
1021  const SNCBIPackedScoreMatrix* score_matrix = &NCBISM_Blosum62;
1022  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA[(int)'-'];
1023  stringstream ss;
1024  BOOST_REQUIRE_EQUAL((size_t)pssm->GetPssm().GetNumColumns(),
1025  (size_t)pssm_scores->GetCols());
1026  BOOST_REQUIRE_EQUAL((size_t)pssm->GetPssm().GetNumRows(),
1027  (size_t)pssm_scores->GetRows());
1028 
1029  // Residues U, *, O, and J are not scored in the standard matrices
1030  const int kResiduesUOJstar = 24;
1031 
1032  // Verify that PSSM has scores as
1033  // in BLOSUM62 columns corresponding to query residues.
1034  // The scores may differ by 1.
1035  for (int i=0; i < pssm->GetPssm().GetNumColumns(); i++) {
1036  for (int j = 0; j < kResiduesUOJstar; j++) {
1037 
1038  // The gap residue gets value of BLAST_SCORE_MIN
1039  if (j == kGapResidue) {
1040  ss.str("");
1041  ss << "Position " << i << " residue "
1042  << TestUtil::GetResidue(j) << " differ on PSSM";
1043  BOOST_REQUIRE_MESSAGE(BLAST_SCORE_MIN == (*pssm_scores)(j, i),
1044  ss.str());
1045 
1046  } else {
// get score from standard scoring matrix
1047  int bl_score =
1048  (int)NCBISM_GetScore(score_matrix,
1049  pssm_input->GetQuery()[i], j);
1050 
1051  ss.str("");
1052  ss << "Position " << i << " residue "
1053  << TestUtil::GetResidue(j) << " differ on PSSM: "
1054  << "expected=" << NStr::IntToString(bl_score)
1055  << " actual=" << NStr::IntToString((*pssm_scores)(j, i));
1056 
// PSSM score must be within +/-1 of the matrix score
1057  BOOST_REQUIRE_MESSAGE (bl_score - (*pssm_scores)(j, i) <= 1
1058  && bl_score - (*pssm_scores)(j, i) >= -1,
1059  ss.str());
1060 
1061  }
1062  }
1063  }
1064 }
1065 
1067 
1068 //-----------------------------------------------------------------
1069 // Implementation for utility classes
1070 
1071 // Implementation for functions declared above
1072 
// Build a two-row Seq-align holding a single dense segment that starts at
// qfrom on the query and sfrom on the subject and spans len positions.
// The alignment is given an e-value score of 0.001.
1073 CRef<CSeq_align> CPssmCddInputTest::x_CreateAlignment(CRef<CSeq_id> query_id,
1074  CRef<CSeq_id> subject_id,
1075  int qfrom, int sfrom, int len)
1076 {
1077  CRef<CSeq_align> seq_align(new CSeq_align());
1078  seq_align->SetDim(2);
1079 
1080  CDense_seg& denseg = seq_align->SetSegs().SetDenseg();
1081  denseg.SetDim(2);
1082  denseg.SetNumseg(1);
1083  CDense_seg::TIds& ids = denseg.SetIds();
1084  ids.push_back(query_id); // row 0: query
1085  ids.push_back(subject_id); // row 1: subject
1086  CDense_seg::TStarts& starts = denseg.SetStarts();
1087  CDense_seg::TLens& lens = denseg.SetLens();
1088  starts.push_back(qfrom);
1089  starts.push_back(sfrom);
1090  lens.push_back(len);
1091 
1092  // make sure that denseg is valid
1093  denseg.Validate(true);
1094 
1095  seq_align->SetNamedScore(CSeq_align::eScore_EValue, 0.001);
1096 
1097  return seq_align;
1098 }
1099 
1100 
1102  CRef<CSeq_id> query_id)
1103 {
1104  CRef<CSeq_id> subject_id(new CSeq_id(kSubjectId));
1105 
1106  CRef<CSeq_align_set> seq_align_set(new CSeq_align_set());
1107  seq_align_set->Set().push_back(x_CreateAlignment(query_id, subject_id,
1108  0, 0, 20));
1109 
1110  seq_align_set->Set().push_back(x_CreateAlignment(query_id, subject_id,
1111  0, 0, 20));
1112  return seq_align_set;
1113 }
1114 
1115 
1117  CRef<CSeq_id> query_id)
1118 {
1119  CRef<CSeq_id> subject_id(new CSeq_id(kSubjectId));
1120 
1121  CRef<CSeq_align_set> seq_align_set(new CSeq_align_set());
1122  seq_align_set->Set().push_back(x_CreateAlignment(query_id, subject_id,
1123  0, 0, 20));
1124 
1125  seq_align_set->Set().push_back(x_CreateAlignment(query_id, subject_id,
1126  20, 10, 50));
1127  return seq_align_set;
1128 }
1129 
1130 
1132 {
1133  CRef<CSeq_id> query_id(new CSeq_id(kQueryId));
1134  const string rpsdb = "data/deltatest";
1135  const string kMatrix = "BLOSUM62";
1136 
1137  CRef<CSeq_align_set> seq_align_set;
1138 
1139  switch (type) {
1140 
1141  case eDuplicateOverlappingHit:
1142  seq_align_set = x_CreateDuplicateOverlappingHit(query_id);
1143  break;
1144 
1145  case eDuplicateNonOverlappingHit:
1146  seq_align_set = x_CreateDuplicateNonOverlappingHit(query_id);
1147  break;
1148  }
1149 
1150  unique_ptr<SSeqLoc> q(CTestObjMgr::Instance().CreateSSeqLoc(*query_id));
1151  CRef<CScope> scope = q->scope;
1152 
1153  // create PSSM engine strategy
1154  CPSIBlastOptions opts;
1155  PSIBlastOptionsNew(&opts);
1156 
1157  // retrieve the query sequence, but skip the sentinel bytes
1158  SBlastSequence seq(GetSequence(*q->seqloc, eBlastEncodingProtein, q->scope));
1159 
1160  CRef<CCddInputData> pssm_input(new CCddInputData(seq.data.get() + 1,
1161  seq.length - 2,
1162  seq_align_set, *opts,
1163  rpsdb, kMatrix));
1164 
1165  pssm_input->x_ProcessAlignments(-1.0, 10.0);
1166 
1167  return pssm_input;
1168 }
1169 
1170 
1172 {
1173  return input.m_MsaData.size();
1174 }
1175 
1177 {
1179 }
1180 
1182 {
1184 }
1185 
1187  const CSeq_id& subject)
1188 {
1189  int retval = -1;
1190 
1191  for (int i=0;i < (int)input.m_Hits.size();i++) {
1192  if (input.m_Hits[i]->m_SubjectId->Match(subject)) {
1193  retval = i;
1194  break;
1195  }
1196  }
1197 
1198  return retval;
1199 }
1200 
1202 {
1203  return input.m_Hits.size();
1204 }
1205 
1207 {
1208  input.x_RemoveMultipleCdHits();
1209 }
1210 
1211 
1213 {
1214  // query with the same residue: A
1215  m_Query.resize(kQueryLength, 1);
1216 
1217  m_Dimensions.query_length = kQueryLength;
1218  m_Dimensions.num_seqs = 0;
1219 
1220  m_CdMsa.msa = NULL;
1221  m_CdMsa.dimensions = &m_Dimensions;
1222  m_CdMsa.query = &m_Query[0];
1223 
1224  PSIBlastOptionsNew(&m_Options);
1225 }
1226 
1227 
1229 {
1230  // domain residue frequencies
1231  int freqs[] = {0, 2, 2, 1, 5, 0, 0, 0, 0, 0,
1232  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1233  0, 0, 0, 0, 0, 0, 0, 0};
1234 
1235  // domain independent observations per column times 10
1236  int obsr[] = {21, 34, 56, 21, 21, 21};
1237 
1238  m_Freqs.resize(28);
1239  double sum = 0.0;
1240  for (int i=0;i < 28;i++) {
1241  m_Freqs[i] = (double)freqs[i] / 10.0;
1242  sum += m_Freqs[i];
1243  }
1244  BOOST_REQUIRE_CLOSE(sum, 1.0, 1e-5);
1245 
1246 
1248  data.wfreqs = NULL;
1249  data.iobsr = 0.0;
1250  m_MsaData.resize(kQueryLength, data);
1251 
1252  // this is to prevent reallocation in child classes
1253  m_Msa.reserve(10);
1254  m_Msa.push_back(new PSICdMsaCell[kQueryLength]);
1255  m_CdMsa.msa = &m_Msa[0];
1256 
1257  for (int i=0;i < kQueryLength;i++) {
1258  m_Msa[0][i].is_aligned = true;
1259  m_Msa[0][i].data = &m_MsaData[i];
1260  m_Msa[0][i].data->wfreqs = &m_Freqs[0];
1261  m_Msa[0][i].data->iobsr = (double)obsr[i] / 10.0;
1262  }
1263 
1264  m_Dimensions.num_seqs = 1;
1265 }
1266 
1268 {
1269  ITERATE (vector<PSICdMsaCell*>, it, m_Msa) {
1270  delete [] *it;
1271  }
1272 }
1273 
1275 {
1276  m_Msa.resize(2);
1277  m_Msa[1] = new PSICdMsaCell[kQueryLength];
1278 
1279  int obsr[] = {22, 41, 76, 21, 200, 21};
1280 
1281  for (int i=0;i < kQueryLength;i++) {
1282  m_Msa[1][i].is_aligned = true;
1283  m_Msa[1][i].data = &m_MsaData[i];
1284  m_Msa[1][i].data->wfreqs = &m_Freqs[0];
1285  m_Msa[1][i].data->iobsr = (double)obsr[i] / 10.0;
1286  }
1287 
1288  m_Dimensions.num_seqs = 2;
1289  m_CdMsa.msa = &m_Msa[0];
1290 }
1291 
1292 
1293 
1294 // Test computing frequency ratios and PSSM scores
1296  const PSIBlastOptions* opts,
1297  AutoPtr<_PSISequenceWeights>& seq_weights)
1298 {
1299 
1300  // compute and verify frequency ratios
1301 
1302  AutoPtr<_PSIInternalPssmData> internal_pssm(
1304  (Uint4)sbp->alphabet_size));
1305 
1306  // pre conditions
1307  BOOST_REQUIRE_EQUAL(internal_pssm->ncols, cd_msa->dimensions->query_length);
1308  BOOST_REQUIRE_EQUAL(internal_pssm->nrows, (unsigned int)sbp->alphabet_size);
1309 
1310 
1311  // verify that the function returns success
1312  BOOST_REQUIRE(_PSIComputeFreqRatiosFromCDs(cd_msa, seq_weights.get(),
1313  sbp.Get(), opts->pseudo_count,
1314  internal_pssm.get()) == 0);
1315 
1316  // post conditions
1317  if (cd_msa->dimensions->num_seqs > 0) {
1318  for (int i=0;i < (int)cd_msa->dimensions->query_length;i++) {
1319  for (int j=0;j < (int)sbp->alphabet_size;j++) {
1320 
1321  // verify that frequency ratios for residues with non-zero
1322  // background frequencies are non-zero
1323  BOOST_REQUIRE_EQUAL(internal_pssm->freq_ratios[i][j] < 1e-5,
1324  seq_weights->std_prob[j] < 1e-5);
1325  }
1326  }
1327  }
1328 
1329 
1330  // compute and verify PSSM scores
1331 
1332  // verify that the function returns success
1333  BOOST_REQUIRE(_PSIConvertFreqRatiosToPSSM(
1334  internal_pssm.get(), cd_msa->query,
1335  sbp.Get(), seq_weights->std_prob) == 0);
1336 
1337  const Uint4 kXResidue = AMINOACID_TO_NCBISTDAA[(int)'X'];
1338  const Uint4 kStarResidue = AMINOACID_TO_NCBISTDAA[(int)'*'];
1339 
1340  // post conditions
1341  if (cd_msa->dimensions->num_seqs > 0) {
1342  for (int i=0;i < (int)cd_msa->dimensions->query_length;i++) {
1343  for (Uint4 j=0;j < (Uint4)sbp->alphabet_size;j++) {
1344 
1345  // skip 'X' and '*' residues
1346  if (j == kXResidue || j == kStarResidue) {
1347  continue;
1348  }
1349 
1350  // get the true frequency ratio
1351  double q_over_p_estimate = internal_pssm->freq_ratios[i][j]
1352  / seq_weights->std_prob[j];
1353 
1354  // verify that non-zero frequency ratios result in scores larger
1355  // than the minimum score
1356  BOOST_REQUIRE_EQUAL(q_over_p_estimate > 1e-5,
1357  internal_pssm->scaled_pssm[i][j]
1358  > BLAST_SCORE_MIN);
1359 
1360  // verify that frequency ratios > 1 result in scores > 0
1361  BOOST_REQUIRE_EQUAL(q_over_p_estimate > 1.0,
1362  internal_pssm->scaled_pssm[i][j] >= 0);
1363 
1364  // verify that frequency ratios < 1 result in scores < 0
1365  BOOST_REQUIRE_EQUAL(q_over_p_estimate < 1.0
1366  && q_over_p_estimate > 1e-5,
1367  internal_pssm->scaled_pssm[i][j] <= 0
1368  && internal_pssm->scaled_pssm[i][j]
1369  > BLAST_SCORE_MIN);
1370  }
1371  }
1372  }
1373 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const string kScale
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
Definitions which are dependent on the NCBI C++ Object Manager.
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Definition: blast_stat.h:121
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
AutoPtr –.
Definition: ncbimisc.hpp:401
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Defines BLAST error codes (user errors included)
Wrapper class to manage the BlastRPSInfo structure, as currently there aren't any allocation or deall...
Definition: rps_aux.hpp:68
Wrapper class for BlastScoreBlk .
Definition: blast_aux.hpp:333
Strategy for pre-processing RPS-BLAST matches for PSSM computation.
const CSeq_id & GetSeq_id(TDim row) const
Definition: Dense_seg.cpp:154
void Validate(bool full_test=false) const
Definition: Dense_seg.cpp:274
Wrapper class for PSIBlastOptions .
Definition: blast_aux.hpp:330
Class for testing methods of CCddInputData class.
static int GetProfilesDataScale(void)
Get the scale factor for residue frequencies and independent observations stored in CDD.
static int GetSubjectMsaIndex(const CCddInputData &input, const CSeq_id &subject)
Find index of a CD in MSA by Seq_id.
static CRef< CSeq_align_set > x_CreateDuplicateNonOverlappingHit(CRef< CSeq_id > query_id)
Create two alignments with the same CD that do not overlap.
static CRef< CSeq_align_set > x_CreateDuplicateOverlappingHit(CRef< CSeq_id > query_id)
Create two alignments with the same CD that overlap.
static int GetAlphabetSize(void)
Get alphabet size used by CCddInputData.
static CRef< CCddInputData > CreatePssmInput(EType type)
Create CCddInputData test object with given CD alignment.
static void RemoveMultipleCdHits(CCddInputData &input)
Call CCddInputData method that removes multiple CD hits.
static size_t GetNumHits(const CCddInputData &input)
Get number of CDD hits in CCddInputData object.
static size_t GetMsaSize(const CCddInputData &input)
Get number of CDs in the internal MSA.
EType
Type of test multiple alignment of CDs.
@ eDuplicateOverlappingHit
Duplicate CDD hit.
@ eDuplicateNonOverlappingHit
Duplicate CDD hit that does not intersect query range.
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Accesses CPssmEngine private method.
Computes a PSSM as specified in PSI-BLAST.
PSSM computation strategy with gaps in query.
PSSM computation strategy with domains with negative residue frequencies.
Simple PSSM computation strategy with one CD.
virtual unsigned int GetQueryLength(void)
Get the query's length.
virtual unsigned char * GetQuery(void)
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
vector< PSICdMsaCell * > m_Msa
virtual PSICdMsa * GetData(void)
Get CD data for PSSM computation.
vector< unsigned char > m_Query
virtual const PSIBlastOptions * GetOptions(void)
Get CDD-related PSI-BLAST options.
virtual void Process(void)
Pre-process CDs used for PSSM computation.
Simple PSSM computation strategy with one CD.
vector< PSICdMsaCellData > m_MsaData
Simple PSSM computation strategy with two CDs.
PSSM computation strategy with domains with frequencies that do not sum to 1.
PSSM computation strategy with domains with zero observations.
CRef –.
Definition: ncbiobj.hpp:618
CSeqDB.
Definition: seqdb.hpp:161
@ eProtein
Definition: seqdb.hpp:174
void SetNamedScore(const string &id, int score)
Definition: Seq_align.cpp:636
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
Interface for strategy to pre-process multiple alignment of conserved domains matches as input data f...
char data[12]
Definition: iconv.c:80
TSeqPos length
Length of the buffer above (not necessarily sequence length!)
Definition: blast_setup.hpp:65
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
static CNcbiMatrix< int > * GetScores(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
BlastScoreBlk * Get() const
Definition: blast_aux.hpp:333
TAutoUint1Ptr data
Sequence data.
Definition: blast_setup.hpp:64
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
static const int kAlphabetSize
void Reset(BlastScoreBlk *p=NULL)
Definition: blast_aux.hpp:333
Uint4 TFreqs
Type used for residue frequencies stored in CDD.
static const int kRpsScaleFactor
Scale of residue frequencies and number of independent observations stored in CDD.
AutoPtr< Uint1, CDeleter< Uint1 > > TAutoUint1Ptr
Declares TAutoUint1Ptr (for Uint1 arrays allocated with malloc/calloc)
Definition: blast_aux.hpp:98
Uint4 TObsr
Type used for number of independent observations stored in CDD.
SBlastSequence GetSequence(const objects::CSeq_loc &sl, EBlastEncoding encoding, objects::CScope *scope, objects::ENa_strand strand=objects::eNa_strand_plus, ESentinelType sentinel=eSentinels, std::string *warnings=NULL)
Retrieves a sequence using the object manager.
@ eBlastEncodingProtein
NCBIstdaa.
@ fDeltaBlast
Flags set for DELTA-BLAST.
Definition: rps_aux.hpp:93
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:469
#define NULL
Definition: ncbistd.hpp:225
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumIndeptObsr & GetNumIndeptObsr(void) const
Get the NumIndeptObsr member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
Definition: Pssm_.hpp:610
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
Definition: Pssm_.hpp:657
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
Definition: Pssm_.hpp:793
const TPssm & GetPssm(void) const
Get the Pssm member data.
Tdata & Set(void)
Assign a value to data member.
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
const Tdata & Get(void) const
Get the member data.
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static int input()
int i
int len
char GetResidue(unsigned int res)
Returns character representation of a residue from ncbistdaa.
Magic spell ;-) needed for some weird compilers... very empiric.
#define abs(a)
Definition: ncbi_heapmgr.c:130
std::istream & in(std::istream &in_, double &x_)
Utilities to develop and debug unit tests for BLAST.
Declarations of auxiliary functions/classes for PSI-BLAST.
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
C++ API for the PSI-BLAST PSSM engine.
Defines interface for a sequence alignment processor that can populate a multiple alignment data stru...
BlastScoreBlk * InitializeBlastScoreBlk(const unsigned char *query, Uint4 query_size)
Utilities to develop and debug unit tests that deal with PSSM computation.
static const string kSubjectId
Domain subject id used for tests (present in test CDD)
static const string kQueryId
Query id used for tests.
static void s_TestCreatePssmFromFreqs(const PSICdMsa *cd_msa, CBlastScoreBlk &sbp, const PSIBlastOptions *opts, AutoPtr< _PSISequenceWeights > &seq_weights)
BOOST_AUTO_TEST_CASE(TestCreatePssmFromSingleCd)
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
TNCBIScore NCBISM_GetScore(const SNCBIPackedScoreMatrix *sm, int aa1, int aa2)
Look up an entry in a packed score matrix.
Definition: raw_scoremat.c:67
Declares auxiliary classes to manage RPS-BLAST related C-structures.
header of RPS blast '.rps' file
Definition: blast_rps.h:62
Int4 num_profiles
number of PSSMs in the file
Definition: blast_rps.h:64
Int4 start_offsets[1]
start of an Int4 array that gives the starting byte offset of each RPS DB sequence.
Definition: blast_rps.h:65
Int2 alphabet_size
size of alphabet.
Definition: blast_stat.h:181
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Int4 pseudo_count
Pseudocount constant.
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
Definition: blast_psi.h:113
double iobsr
Effective number of independent observations in a CD column.
Definition: blast_psi.h:118
double * wfreqs
Frequencies for each residue in CD column.
Definition: blast_psi.h:115
Alignment cell that represents one column of CD aligned to a position in the query.
Definition: blast_psi.h:124
Uint1 is_aligned
Does this cell represent column aligned to a CD.
Definition: blast_psi.h:125
PSICdMsaCellData * data
Data needed for PSSM computation.
Definition: blast_psi.h:128
Data structure representing multiple alignment of CDs and query sequence along with data needed for P...
Definition: blast_psi.h:134
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
Definition: blast_psi.h:136
unsigned char * query
Query sequence as Ncbistdaa.
Definition: blast_psi.h:135
PSICdMsaCell ** msa
Multiple alignment of CDs.
Definition: blast_psi.h:138
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Boolean information_content
request information content
Definition: blast_psi.h:182
Boolean frequency_ratios
request frequency ratios
Definition: blast_psi.h:187
Boolean independent_observations
request number of independent observations
Definition: blast_psi.h:194
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Definition: blast_psi.h:185
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Structure to store sequence data and its length for use in the CORE of BLAST (it's a malloc'ed array ...
Definition: blast_setup.hpp:62
static string subject
Definition: type.c:6
#define _ASSERT
Utility stuff for more convenient using of Boost.Test library.
static const string kTitle
CTraceGlyph inline method implementation.
Modified on Wed Apr 17 13:10:19 2024 by modify_doxy.py rev. 669887