NCBI C++ ToolKit
msa2pssm_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: msa2pssm_unit_test.cpp 100942 2023-10-03 17:36:50Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 * File Description:
29 * Unit tests for functionality to convert CLUSTALW-style MSAs to PSSMs
30 *
31 * ===========================================================================
32 */
#include <ncbi_pch.hpp>
#include <algo/blast/api/blast_aux.hpp>
#include <algo/blast/api/blast_exception.hpp>
#include <algo/blast/api/msa_pssm_input.hpp>

#include <corelib/test_boost.hpp>
40 
41 using namespace std;
42 using namespace ncbi;
43 using namespace ncbi::objects;
44 using namespace ncbi::blast;
45 
46 BOOST_AUTO_TEST_SUITE(msa2pssm)
47 
48 BOOST_AUTO_TEST_CASE(QueryNotFoundInMsa)
49 {
50  ifstream in("data/msa.clustalw.txt");
51 
52  const unsigned int kQuerySize = 10;
53  const unsigned char kQuery[] = { 3, 9, 14, 20, 6, 23, 1, 7, 16, 5 };
54 
55  CPSIBlastOptions opts;
56  PSIBlastOptionsNew(&opts);
57  unique_ptr<CPsiBlastInputClustalW> pssm_input;
58  BOOST_REQUIRE_THROW(pssm_input.reset(new CPsiBlastInputClustalW(in, *opts,
59  0, 0,
60  kQuery,
61  kQuerySize,
62  0, 0)),
64 }
65 
66 BOOST_AUTO_TEST_CASE(AllUpperCaseMsa)
67 {
68  ifstream in("data/msa.clustalw.txt");
69 
70  CPSIBlastOptions opts;
71  PSIBlastOptionsNew(&opts);
72  CPsiBlastInputClustalW pssm_input(in, *opts);
73  pssm_input.Process();
74 
75  PSIMsa* msa = pssm_input.GetData();
76  BOOST_REQUIRE_EQUAL((Uint4)151, msa->dimensions->query_length);
77  BOOST_REQUIRE_EQUAL((Uint4)24, msa->dimensions->num_seqs);
78 
79  // Ensure there are no gaps in the query
80  for (TSeqPos i = 0; i < msa->dimensions->query_length; i++) {
81  CNcbiOstrstream os;
82  os << "Query has gap in position " << i;
83  BOOST_REQUIRE_MESSAGE(msa->data[0][i].letter !=
84  AMINOACID_TO_NCBISTDAA[(int)'-'],
85  (string)CNcbiOstrstreamToString(os));
86  }
87 }
88 
89 BOOST_AUTO_TEST_CASE(AllUpperCaseMsa_WithQuery)
90 {
91  ifstream in("data/msa.clustalw.txt");
92 
93  const string kQuerySeq("IVLARIDDRFIHGQILTRWIKVHAADRIIVVSDDIAQDEMRKTLILSVAPSNVKASAVSVSKMAKAFHSPRYEGVTAMLLFENPSDIVSLIEAGVPIKTVNVGGMRFENHRRQITKSVSVTEQDIKAFETLSDKGVKLELRQLPSDASEDF");
94  TAutoUint1ArrayPtr query(new Uint1[kQuerySeq.size()]);
95  int i = 0;
96  ITERATE(string, res, kQuerySeq) {
97  query.get()[i] = AMINOACID_TO_NCBISTDAA[(int)*res];
98  i++;
99  }
100 
101  CPSIBlastOptions opts;
102  PSIBlastOptionsNew(&opts);
103  CPsiBlastInputClustalW pssm_input(in, *opts, 0, 0, query.get(), kQuerySeq.size(), 0, 0);
104  pssm_input.Process();
105 
106  PSIMsa* msa = pssm_input.GetData();
107  BOOST_REQUIRE_EQUAL((Uint4)151, msa->dimensions->query_length);
108  BOOST_REQUIRE_EQUAL((Uint4)24, msa->dimensions->num_seqs);
109 
110  // Ensure there are no gaps in the query
111  for (TSeqPos i = 0; i < pssm_input.GetQueryLength(); i++) {
112  CNcbiOstrstream os;
113  os << "Query has gap in position " << i;
114  BOOST_REQUIRE_MESSAGE(msa->data[0][i].letter !=
115  AMINOACID_TO_NCBISTDAA[(int)'-'],
116  (string)CNcbiOstrstreamToString(os));
117  }
118 }
119 
120 BOOST_AUTO_TEST_CASE(MsaWithLowerCaseResidues)
121 {
122  ifstream in("data/sample_msa.txt");
123 
124  CPSIBlastOptions opts;
125  PSIBlastOptionsNew(&opts);
126  CPsiBlastInputClustalW pssm_input(in, *opts);
127  BOOST_REQUIRE_EQUAL(string(BLAST_DEFAULT_MATRIX),
128  string(pssm_input.GetMatrixName()));
129  pssm_input.Process();
130 
131  PSIMsa* msa = pssm_input.GetData();
132  BOOST_REQUIRE_EQUAL((Uint4)176, msa->dimensions->query_length);
133  BOOST_REQUIRE_EQUAL((Uint4)13, msa->dimensions->num_seqs);
134 
135  // Check the aligned query regions
136  TSeqRange unused_range(23, 93);
137  for (TSeqPos i = 0; i < unused_range.GetFrom(); i++) {
138  CNcbiOstrstream os;
139  os << "Query is not aligned at position " << i;
140  BOOST_REQUIRE_MESSAGE(msa->data[0][i].is_aligned == TRUE,
141  (string)CNcbiOstrstreamToString(os));
142  }
143  for (TSeqPos i = unused_range.GetFrom(); i < unused_range.GetTo(); i++)
144  {
145  CNcbiOstrstream os;
146  os << "Query is aligned at position " << i;
147  BOOST_REQUIRE_MESSAGE(msa->data[0][i].is_aligned == FALSE,
148  (string)CNcbiOstrstreamToString(os));
149  }
150  for (TSeqPos i = unused_range.GetToOpen(); i < pssm_input.GetQueryLength();
151  i++) {
152  CNcbiOstrstream os;
153  os << "Query is not aligned at position " << i;
154  BOOST_REQUIRE_MESSAGE(msa->data[0][i].is_aligned == TRUE,
155  (string)CNcbiOstrstreamToString(os));
156  }
157  // Ensure there are no gaps in the query
158  for (TSeqPos i = 0; i < pssm_input.GetQueryLength(); i++) {
159  CNcbiOstrstream os;
160  os << "Query has gap in position " << i;
161  BOOST_REQUIRE_MESSAGE(msa->data[0][i].letter !=
162  AMINOACID_TO_NCBISTDAA[(int)'-'],
163  (string)CNcbiOstrstreamToString(os));
164  }
165 }
166 
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
#define BLAST_DEFAULT_MATRIX
Default matrix name: BLOSUM62.
Definition: blast_options.h:77
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Defines BLAST error codes (user errors included)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
Wrapper class for PSIBlastOptions .
Definition: blast_aux.hpp:330
This class is a concrete strategy for IPssmInputData which converts the CLUSTALW-style output contain...
Ensure direct dependencies on enough of the core xncbi library to satisfy shared libraries that depen...
static const char * kQuery
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
unsigned int GetQueryLength()
Get the query's length.
void Process()
The work to process the alignment is done here.
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
position_type GetToOpen(void) const
Definition: range.hpp:138
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
BOOST_AUTO_TEST_CASE(QueryNotFoundInMsa)
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST from a multiple sequence alignmen...
Magic spell ;-) needed for some weird compilers... very empiric.
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
std::istream & in(std::istream &in_, double &x_)
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
PSIMsaDimensions * dimensions
dimensions of the msa
Definition: blast_psi.h:76
static string query
Utility stuff for more convenient using of Boost.Test library.
Modified on Sun Apr 21 03:45:15 2024 by modify_doxy.py rev. 669887