NCBI C++ ToolKit
kmer_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: kmer_unit_test.cpp 100474 2023-08-04 15:13:04Z boratyng $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Greg Boratyn
27 *
28 * File Description:
29 * Unit tests for k-mer counts computing classes
30 *
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include <corelib/ncbi_system.hpp>
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seq/Seq_data.hpp>
42 #include "cobalt_test_util.hpp"
43 
44 // This macro should be defined before the inclusion of test_boost.hpp in all
45 // "*.cpp" files of an executable except one. This mirrors how function main()
46 // of a non-Boost.Test executable is defined in only one *.cpp file - other
47 // files should not include it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not defined,
48 // then test_boost.hpp will define such a "main()" function for the tests.
49 //
50 // Usually if your unit tests contain only one *.cpp file you should not
51 // care about this macro at all.
52 //
53 #define NCBI_BOOST_NO_AUTO_TEST_MAIN
54 
55 
56 // This header must be included before all Boost.Test headers if there are any
57 #include <corelib/test_boost.hpp>
58 
59 #ifndef SKIP_DOXYGEN_PROCESSING
60 
62 USING_SCOPE(cobalt);
64 
65 
66 BOOST_AUTO_TEST_SUITE(kmer_counts)
67 
// Builds a protein CBioseq from a string of amino-acid letters.
//
// Each character of `sequence` is translated through AMINOACID_TO_NCBISTDAA
// and stored as Ncbistdaa sequence data. The instance length is set to
// sequence.length(), the molecule type to CSeq_inst::eMol_aa, and the id list
// is replaced by a single local (CSeq_id::e_Local) id built from `id`.
//
// sequence: residue letters used as indices into AMINOACID_TO_NCBISTDAA
// id:       integer used for the local Seq-id
// Returns the newly created bioseq.
68 static CRef<CBioseq> s_CreateBioseq(const string& sequence, int id)
69 {
70  CRef<CBioseq> bioseq(new CBioseq());
71  CSeq_data& seq_data = bioseq->SetInst().SetSeq_data();
72  vector<char>& data = seq_data.SetNcbistdaa().Set();
73  data.resize(sequence.length());
// NOTE(review): the translation table is indexed directly with the character
// value cast to int; this assumes every character is a plain ASCII letter
// covered by the table -- confirm the table bounds before passing other input.
74  for (size_t i=0;i < sequence.length();i++) {
75  data[i] = (char)AMINOACID_TO_NCBISTDAA[(int)sequence[i]];
76  }
77  bioseq->SetInst().SetLength((unsigned int)sequence.length());
78  bioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
79 
// Drop any existing ids so that the bioseq carries exactly one local id.
80  bioseq->SetId().clear();
81  bioseq->SetId().push_back(CRef<CSeq_id>(new CSeq_id(CSeq_id::e_Local, id)));
82 
83  return bioseq;
84 }
85 
// Computes k-mer counts for a sequence given as a string of residue letters.
//
// The string is turned into a bioseq with s_CreateBioseq() (local id 1), the
// bioseq is added to a freshly created scope, and `counts` is reset from a
// Seq-loc whose id is taken from the resulting bioseq handle.
//
// TKmerCounts: k-mer counts class; must provide Reset(seqloc, scope)
// seq:         residue letters of the sequence
// counts:      object that receives the counts (overwritten via Reset)
86 template <class TKmerCounts>
87 static void s_CreateKmerCounts(const string& seq, TKmerCounts& counts)
88 {
89  CRef<CBioseq> bioseq = s_CreateBioseq(seq, 1);
// NOTE(review): `objmgr` is not declared anywhere in this listing (embedded
// line 90 is missing); presumably it is obtained from
// CObjectManager::GetInstance() -- confirm against the repository file.
91  CRef<CScope> scope(new CScope(*objmgr));
92  CBioseq_Handle handle = scope->AddBioseq(*bioseq);
93 
// NOTE(review): `seqloc` is not declared in this listing either (embedded
// line 94 is missing); presumably a CSeq_loc covering the whole sequence --
// confirm against the repository file.
95  seqloc.SetId(*handle.GetSeqId());
96 
97  counts.Reset(seqloc, *scope);
98 }
99 
100 
// Exercises CSparseKmerCounts on small synthetic sequences: number of k-mers,
// common k-mer counting, handling of X residues, error cases, and the
// compressed-alphabet translation table.
//
// NOTE(review): this listing is incomplete. Embedded lines 103, 112-113, 121,
// 142, 145, 157, 159, 163, 165, 169-170 and 179-181 are missing, so several
// parameter-setting calls and the tails of some macro invocations are not
// visible here. Restore them from the repository file before compiling.
101 BOOST_AUTO_TEST_CASE(TestSparseKmerCounts)
102 {
104 
105  const string seq1 = "AAAAAAAAA";
106  const string seq2 = "BBBBBBBBBB";
107 
108  CSparseKmerCounts counts, counts2;
// Sweep over two parameters; presumably i is the k-mer length and j the
// alphabet size, set on the missing lines 112-113 -- TODO confirm.
109  for (int i=3;i < 6;i++) {
110  for (int j=15;j <= 20;j++) {
111 
114 
115  s_CreateKmerCounts(seq1, counts);
116  s_CreateKmerCounts(seq2, counts2);
117 
118  // Number of k-mers is equal to seq_len - k-mer_len + 1
119  // for sequences that do not contain X
// NOTE(review): the continuation of the next statement (embedded line 121)
// is missing from this listing.
120  BOOST_CHECK_EQUAL(counts.GetNumCounts(), seq1.length()
122 
123  // Sequence compared to itself, yields all k-mers common
124  BOOST_CHECK_EQUAL(CSparseKmerCounts::CountCommonKmers(counts,
125  counts), counts.GetNumCounts());
126 
127  // Each common k-mer is counted once in this case
// (third argument `false` turns off counting of repetitions)
128  BOOST_CHECK_EQUAL((int)CSparseKmerCounts::CountCommonKmers(counts,
129  counts, false), 1);
130 
131  // For sequences that do not share subsequences, zero common k-mers
132  BOOST_CHECK_EQUAL((int)CSparseKmerCounts::CountCommonKmers(counts,
133  counts2), 0);
134 
135  }
136  }
137 
// seq_x alternates A and X; seq_xonly consists of X only.
138  const string seq_x = "AXAXAXAXAXAXAX";
139  const string seq_xonly = "XXXXXXXXXXXXX";
140 
141  // Make sure that k-mers that contain X are not counted
143  for (int i=3;i < 6;i++) {
144 
146  CSparseKmerCounts counts_x, counts_xonly;
147  s_CreateKmerCounts(seq_x, counts_x);
148  s_CreateKmerCounts(seq_xonly, counts_xonly);
149 
150  BOOST_CHECK_EQUAL((int)counts_x.GetNumCounts(), 0);
151  BOOST_CHECK_EQUAL((int)counts_xonly.GetNumCounts(), 0);
152  }
153 
154  const string seq_short = "ABC";
155 
156  // Sequence shorter than k-mer length causes exception
// NOTE(review): the expected-exception argument (embedded line 159) is
// missing; presumably CKmerCountsException as in the check further below.
158  BOOST_CHECK_THROW(s_CreateKmerCounts(seq_short, counts),
160 
161  // Letter codes are assumed to belong to [1, alphabet_size]
162  // Letter out of alphabet in sequence causes exception
// NOTE(review): the expected-exception argument (embedded line 165) is
// missing here as well.
164  BOOST_CHECK_THROW(s_CreateKmerCounts(seq_short + "AAK", counts),
166 
167  // Not specifying translation table for compressed alphabets causes
168  // exception
171  BOOST_CHECK_THROW(s_CreateKmerCounts(seq1, counts), CKmerCountsException);
172 
173  // K-mers containing compressed letters are counted as the same k-mers
// The table maps letter codes 1 and 2 to the same compressed letter (1).
174  vector<Uint1>& table = CSparseKmerCounts::SetTransTable();
175  table.resize(3);
176  table[0] = 0;
177  table[1] = 1;
178  table[2] = 1;
// With both sequences compressed to the same letter, all k-mers of seq1 are
// expected to be common with seq2.
182  s_CreateKmerCounts(seq1, counts);
183  s_CreateKmerCounts(seq2, counts2);
184  BOOST_CHECK_EQUAL(CSparseKmerCounts::CountCommonKmers(counts, counts2),
185  counts.GetNumCounts());
186 
187 }
188 
189 
// Exercises TKmerMethods<CSparseKmerCounts>: computing counts for sequences
// read from data/small.fa and building distance matrices with the global and
// local "fraction of common k-mers" measures. The same sequence is placed at
// positions 0 and 1, so the distance dmat(0, 1) is expected to be zero for
// every alphabet tested.
//
// NOTE(review): this listing is incomplete. Embedded lines 196, 210 and 217
// are missing (see the notes below); restore them from the repository file
// before compiling.
//
// NOTE(review): BOOST_CHECK_CLOSE takes a relative (percentage) tolerance, so
// against an expected value of 0.0 it only passes for an exact zero;
// BOOST_CHECK_SMALL may state the intent more directly -- confirm.
190 BOOST_AUTO_TEST_CASE(TestKmerMethods)
191 {
192  typedef TKmerMethods<CSparseKmerCounts> TKMethods;
193  const int kKmerLen = 4;
// kAlphabetSize is not used on any visible line; presumably it is passed to
// SetParams on the missing line 217 -- TODO confirm.
194  const int kAlphabetSize = 28;
195 
// NOTE(review): `objmgr` is not declared in this listing (embedded line 196
// is missing); presumably obtained from CObjectManager::GetInstance().
197  CRef<CScope> scope(new CScope(*objmgr));
198  vector< CRef<CSeq_loc> > seqs;
199  vector<CSparseKmerCounts> counts_vect;
200  TKMethods::TDistMatrix dmat;
201 
// Load the test sequences; the rest of the test requires a zero status,
// at least one sequence, and a non-empty scope.
202  int status = ReadFastaQueries("data/small.fa", seqs, scope);
203  BOOST_REQUIRE_EQUAL(status, 0);
204  BOOST_REQUIRE(seqs.size() > 0);
205  BOOST_REQUIRE(!scope.Empty());
206 
207  // Supplying empty list of sequences results in exception
// NOTE(review): the expected-exception argument (embedded line 210) is
// missing from this listing.
208  BOOST_CHECK_THROW(TKMethods::ComputeCounts(vector< CRef<CSeq_loc> >(),
209  *scope, counts_vect),
211 
212  // Distance between a sequence and itself is zero
// Keep two entries that both refer to the first sequence.
213  seqs.resize(2);
214  seqs[1] = seqs[0];
215 
216  // for regular alphabet
// NOTE(review): embedded line 217 is missing; presumably a SetParams call
// selecting the regular alphabet -- confirm.
218 
219  TKMethods::ComputeCounts(seqs, *scope, counts_vect);
220  BOOST_REQUIRE_EQUAL(counts_vect.size(), seqs.size());
221 
222  TKMethods::ComputeDistMatrix(counts_vect,
223  TKMethods::eFractionCommonKmersGlobal, dmat);
224 
225  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
226  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
227 
228  TKMethods::ComputeDistMatrix(counts_vect,
229  TKMethods::eFractionCommonKmersLocal, dmat);
230 
231  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
232  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
233 
234 
235  // for compressed alphabet SE-B15
236  TKMethods::SetParams(kKmerLen, TKMethods::eSE_B15);
237 
238  TKMethods::ComputeCounts(seqs, *scope, counts_vect);
239  BOOST_REQUIRE_EQUAL(counts_vect.size(), seqs.size());
240 
241  TKMethods::ComputeDistMatrix(counts_vect,
242  TKMethods::eFractionCommonKmersGlobal, dmat);
243 
244  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
245  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
246 
247  TKMethods::ComputeDistMatrix(counts_vect,
248  TKMethods::eFractionCommonKmersLocal, dmat);
249 
250  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
251  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
252 
253  // for compressed alphabet SE-V10
254  TKMethods::SetParams(kKmerLen, TKMethods::eSE_V10);
255 
256  TKMethods::ComputeCounts(seqs, *scope, counts_vect);
257  BOOST_REQUIRE_EQUAL(counts_vect.size(), seqs.size());
258 
259  TKMethods::ComputeDistMatrix(counts_vect,
260  TKMethods::eFractionCommonKmersGlobal, dmat);
261 
262  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
263  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
264 
265  TKMethods::ComputeDistMatrix(counts_vect,
266  TKMethods::eFractionCommonKmersLocal, dmat);
267 
268  BOOST_REQUIRE_EQUAL(dmat.GetRows(), seqs.size());
269  BOOST_CHECK_CLOSE(dmat(0, 1), 0.0, 1e-6);
270 }
271 
273 
274 #endif /* SKIP_DOXYGEN_PROCESSING */
#define static
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
CBioseq_Handle –.
Exception class for Kmer counts.
Definition: kmercounts.hpp:406
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Kmer counts for alignment free sequence similarity computation implemented as a sparse vector.
Definition: kmercounts.hpp:61
static unsigned int GetKmerLength(void)
Get default kmer length.
Definition: kmercounts.hpp:115
static void SetUseCompressed(bool use_comp)
Set default option for using compressed alphabet.
Definition: kmercounts.hpp:159
static void SetAlphabetSize(unsigned size)
Set Default alphabet size.
Definition: kmercounts.hpp:148
unsigned int GetNumCounts(void) const
Get number of all k-mers found in the sequence.
Definition: kmercounts.hpp:110
static unsigned int CountCommonKmers(const CSparseKmerCounts &v1, const CSparseKmerCounts &v2, bool repetitions=true)
Compute number of common kmers between two count vectors.
Definition: kmercounts.cpp:409
static void SetKmerLength(unsigned len)
Set default k-mer length.
Definition: kmercounts.hpp:142
static vector< Uint1 > & SetTransTable(void)
Set default compressed alphabet letter translation table.
Definition: kmercounts.hpp:154
Interface for computing and manipulating k-mer counts vectors that allows for different implementatio...
Definition: kmercounts.hpp:425
int ReadFastaQueries(const string &filename, vector< CRef< objects::CSeq_loc > > &seqs, CRef< objects::CScope > &scope, bool parse_deflines, objects::CSeqIdGenerator *id_generator)
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
TPrim & Set(void)
Definition: serialbase.hpp:351
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
virtual void SetParams()
Called at the beginning of Run, before creating thread pool.
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Whole
whole sequence
Definition: Seq_loc_.hpp:100
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TNcbistdaa & SetNcbistdaa(void)
Select the variant.
Definition: Seq_data_.hpp:697
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
USING_SCOPE(cobalt)
BOOST_AUTO_TEST_CASE(TestSparseKmerCounts)
static CRef< CBioseq > s_CreateBioseq(const string &sequence, int id)
USING_NCBI_SCOPE
static void s_CreateKmerCounts(const string &seq, TKmerCounts &counts)
int i
The Object manager core.
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
Utility stuff for more convenient using of Boost.Test library.
#define const
Definition: zconf.h:230
Modified on Fri Dec 01 04:47:42 2023 by modify_doxy.py rev. 669887