NCBI C++ ToolKit
seqalignfilter_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqalignfilter_unit_test.cpp 98787 2023-01-05 20:04:06Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vahram Avagyan
27  *
28  * File Description:
29  * CSeqAlignFilter unit test.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
40 
43 
46 
47 #include <serial/serial.hpp>
48 #include <serial/objistr.hpp>
49 #include <serial/objostr.hpp>
50 #include <serial/iterator.hpp>
51 #include <sstream>
52 #undef NCBI_BOOST_NO_AUTO_TEST_MAIN
53 #include <corelib/test_boost.hpp>
54 #include <common/test_data_path.h>
55 
56 #ifndef SKIP_DOXYGEN_PROCESSING
57 
60 USING_SCOPE(align_format);
61 
62 template<class ASNOBJ>
63 void s_Stringify(const ASNOBJ & a, string & s)
64 {
65  CNcbiOstrstream oss;
66  oss << MSerial_AsnText << a;
67  s = CNcbiOstrstreamToString(oss);
68 }
69 
70 template<class ASNOBJ>
71 void s_Unstringify(const string & s, ASNOBJ & a)
72 {
73  istringstream iss;
74  iss.str(s);
75  iss >> MSerial_AsnText >> a;
76 }
77 
78 template<class ASNOBJ>
79 CRef<ASNOBJ> s_Duplicate(const ASNOBJ & a)
80 {
81  CRef<ASNOBJ> newobj(new ASNOBJ);
82 
83  string s;
84  s_Stringify(a, s);
85  s_Unstringify(s, *newobj);
86 
87  return newobj;
88 }
89 
90 /////////////////////////////////////////////////////////////////////////////
91 // List-based static helper functions
92 
93 static void s_GetUseThisGiEntries(CRef<CSeq_align> sa, list<TGi>& list_gis)
94 {
95  list_gis.clear();
96 
97  CSeq_align::TScore& score_entries = sa->SetScore();
98  CSeq_align::TScore::iterator iter_score = score_entries.begin();
99  while (iter_score != score_entries.end())
100  {
101  CRef<CScore> score_entry = *iter_score++;
102  if (score_entry->CanGetId() && score_entry->GetId().IsStr())
103  {
104  string str_id = score_entry->GetId().GetStr();
105  if (str_id == "use_this_gi")
106  {
107  bool bIsLegalGiEntry = score_entry->CanGetValue() && score_entry->GetValue().IsInt();
108  BOOST_REQUIRE(bIsLegalGiEntry);
109  Uint4 gi_v = (Uint4) (score_entry->GetValue().GetInt());
110  list_gis.push_back(GI_FROM(Uint4, gi_v));
111  }
112  }
113  }
114 }
115 
117 {
118  CConstRef<CSeq_id> id(&(sa->GetSeq_id(1)));
119 
120  BOOST_REQUIRE(id->IsGi());
121  return id->GetGi();
122 }
123 
124 static void s_GetFullGiList(CRef<CSeq_align> sa, list<TGi>& list_gis)
125 {
126  s_GetUseThisGiEntries(sa, list_gis);
127  list_gis.push_back(s_GetAlignedSeqGi(sa));
128 }
129 
130 static bool s_IsGiInList(TGi gi, list<TGi>& list_gis)
131 {
132  return find(list_gis.begin(), list_gis.end(), gi) != list_gis.end();
133 }
134 
135 static bool s_IsListSubset(list<TGi>& list_all, list<TGi>& list_sub)
136 {
137  bool is_missing = false;
138 
139  list<TGi>::iterator it;
140  for (it = list_sub.begin(); it != list_sub.end() && !is_missing; it++)
141  {
142  is_missing = !s_IsGiInList(*it, list_all);
143  }
144 
145  return !is_missing;
146 }
147 
148 static bool s_AreListsEqual(list<TGi>& list1, list<TGi>& list2)
149 {
150  return s_IsListSubset(list1, list2) && s_IsListSubset(list2, list1);
151 }
152 
153 /////////////////////////////////////////////////////////////////////////////
154 // Vector-based static helper functions
155 
156 static bool s_IsGiInVector(TGi gi, vector<TGi>& vec_gis)
157 {
158  return binary_search(vec_gis.begin(), vec_gis.end(), gi);
159 }
160 
161 static bool s_GetFilteredGiList(CRef<CSeq_align> sa, vector<TGi>& vec_all_gis,
162  list<TGi>& list_sa_filtered)
163 {
164  list<TGi> list_sa_full;
165  s_GetFullGiList(sa, list_sa_full);
166 
167  for (list<TGi>::iterator it = list_sa_full.begin();
168  it != list_sa_full.end(); it++)
169  {
170  if (s_IsGiInVector(*it, vec_all_gis))
171  {
172  list_sa_filtered.push_back(*it);
173  }
174  }
175 
176  return !list_sa_filtered.empty();
177 }
178 
179 /////////////////////////////////////////////////////////////////////////////
180 // Functions to test filtering results for individual seqaligns
181 
182 static void
184  CRef<CSeq_align> sa_new,
185  list<TGi>& list_orig_filtered,
186  list<TGi>& list_new_filtered)
187 {
188  list<TGi> list_new;
189  s_GetFullGiList(sa_new, list_new);
190 
191  BOOST_REQUIRE(s_AreListsEqual(list_new, list_new_filtered)); // new list is indeed filtered
192  BOOST_REQUIRE(s_IsListSubset(list_new, list_orig_filtered)); // all original gi's who survived filtering
193  // are included in the new list
194 }
195 
197 {
198  int oid1 = -1, oid2 = -1;
199  db->GiToOid(gi1, oid1);
200  db->GiToOid(gi2, oid2);
201 
202  BOOST_REQUIRE(oid1 > 0);
203  BOOST_REQUIRE(oid2 > 0);
204  BOOST_REQUIRE(oid1 == oid2);
205 }
206 
207 /////////////////////////////////////////////////////////////////////////////
208 // Pre-processing and testing individual seqaligns
209 
211  vector<TGi>& vec_all_gis)
212 {
213  list<TGi> list_orig_filtered;
214  list<TGi> list_new, list_new_filtered;
215 
216  s_GetFilteredGiList(sa_orig, vec_all_gis, list_orig_filtered);
217  s_GetFilteredGiList(sa_new, vec_all_gis, list_new_filtered);
218 
219  s_Check_GiListConsistency(sa_orig, sa_new,
220  list_orig_filtered, list_new_filtered);
221 }
222 
224 {
225  TGi main_gi = s_GetAlignedSeqGi(sa_new);
226 
227  list<TGi> list_extra_gis;
228  s_GetUseThisGiEntries(sa_new, list_extra_gis);
229 
230  for (list<TGi>::iterator it_extra_gi = list_extra_gis.begin();
231  it_extra_gi != list_extra_gis.end(); it_extra_gi++)
232  {
233  s_Check_GiEquivalenceInDB(main_gi, *it_extra_gi, db);
234  }
235 }
236 
237 /////////////////////////////////////////////////////////////////////////////
238 // Other pre-processing
239 
240 static void s_LoadSeqAlignsFromFile(CSeq_align_set& aln_all, const string& fname)
241 {
242  unique_ptr<CObjectIStream> asn_in(CObjectIStream::Open(fname, eSerial_AsnText));
243  *asn_in >> aln_all;
244 }
245 
246 /////////////////////////////////////////////////////////////////////////////
247 // Actual test cases
248 
249 BOOST_AUTO_TEST_SUITE(seqalignfilter)
250 
251 BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)
252 {
253  string fname_in_rel = "blast/algo/unit_tests/blast_format/data/in_test.txt";
254  string fname_out = "data/out_test.txt";
255  string fname_gis = "data/gilist_test.txt";
256  // convert all pathes to platform specific form
257  std::replace( fname_in_rel.begin(), fname_in_rel.end(), '/', CDirEntry::GetPathSeparator() );
258  std::replace( fname_out.begin(), fname_out.end(), '/', CDirEntry::GetPathSeparator() );
259  std::replace( fname_gis.begin(), fname_gis.end(), '/', CDirEntry::GetPathSeparator() );
260 
261  string fname_in = CFile::ConcatPath( NCBI_GetTestDataPath(), fname_in_rel );
262 
263  CSeq_align_set aln_all;
264  s_LoadSeqAlignsFromFile(aln_all, fname_in);
265 
266  CSeqAlignFilter filter;
267  filter.FilterSeqaligns(fname_in, fname_out, fname_gis);
268 
269  CSeq_align_set aln_filtered;
270  s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
271 
272  list<TGi> list_gis;
273  filter.ReadGiList(fname_gis, list_gis);
274 
275  ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
276  {
277  TGi gi = s_GetAlignedSeqGi(*iter);
278  if (s_IsGiInList(gi, list_gis))
279  {
280  bool found_gi = false;
281  ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
282  {
283  TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
284  if (gi == gi_filtered)
285  {
286  found_gi = true;
287  break;
288  }
289  }
290  BOOST_REQUIRE(found_gi);
291  }
292  }
293 }
294 
295 BOOST_AUTO_TEST_CASE(s_TestDBBasedFiltering)
296 {
297  string fname_in_rel = "blast/algo/unit_tests/blast_format/data/in_test.txt";
298  string fname_out = "data/out_test.txt";
299  string fname_gis = "data/gilist_test.txt";
300  // convert all pathes to platform specific form
301  std::replace( fname_in_rel.begin(), fname_in_rel.end(), '/', CDirEntry::GetPathSeparator() );
302  std::replace( fname_out.begin(), fname_out.end(), '/', CDirEntry::GetPathSeparator() );
303  std::replace( fname_gis.begin(), fname_gis.end(), '/', CDirEntry::GetPathSeparator() );
304 
305  string fname_in = CFile::ConcatPath( NCBI_GetTestDataPath(), fname_in_rel );
306 
307  string db_name = "nt";
308  bool use_prot = false;
309 
310  CSeqAlignFilter filter;
311  CRef<CSeqDB> db;
312 
313  BOOST_REQUIRE_NO_THROW(db = filter.PrepareSeqDB(db_name, use_prot, fname_gis););
314  BOOST_REQUIRE_NO_THROW(filter.FilterSeqalignsExt(fname_in, fname_out, db););
315 
316  // check the results
317 
318  CSeq_align_set aln_all;
319  s_LoadSeqAlignsFromFile(aln_all, fname_in);
320 
321  CSeq_align_set aln_filtered;
322  s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
323 
324  vector<TGi> vec_gis; // sorted vector of all available gi's
325  filter.ReadGiVector(fname_gis, vec_gis, true);
326 
327  ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
328  {
329  TGi gi = s_GetAlignedSeqGi(*iter);
330  ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
331  {
332  TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
333  if (gi == gi_filtered)
334  {
335  // main gi's coincide - check the concistency of all the gi's
336  s_DoConsistencyCheck(*iter, *iter_filtered, vec_gis);
337  // check the equivalence of all gi's in the filtered seqalign
338  s_DoEquivalenceCheck(*iter_filtered, db);
339  }
340  }
341  }
342 }
344 #endif /* SKIP_DOXYGEN_PROCESSING */
User-defined methods of the data storage class.
Defines the alignment filtering class.
Declarations of auxiliary functions using IBlastSeqInfoSrc to retrieve ids and related sequence infor...
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CRef –.
Definition: ncbiobj.hpp:618
CSeqAlignFilter.
void FilterSeqalignsExt(const string &fname_in_seqaligns, const string &fname_out_seqaligns, CRef< CSeqDB > db)
Filter Seqaligns - extended file-based version.
CRef< CSeqDB > PrepareSeqDB(const string &fname_db, bool is_prot, const string &fname_gis_to_filter)
Load a SeqDB database with the given gi-list.
void ReadGiList(const string &fname, list< TGi > &list_gis, bool sorted=false)
Read a gi list from a file and, optionally, sort it.
void ReadGiVector(const string &fname, vector< TGi > &vec_gis, bool sorted=false)
Read a gi vector from a file and, optionally, sort it.
void FilterSeqaligns(const string &fname_in_seqaligns, const string &fname_out_seqaligns, const string &fname_gis_to_filter)
Filter Seqaligns - file-based version.
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdb.cpp:808
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
static string ConcatPath(const string &first, const string &second)
Concatenate two parts of the path for the current OS.
Definition: ncbifile.cpp:776
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
list< CRef< CSeq_align > > Tdata
const Tdata & Get(void) const
Get the member data.
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
unsigned int a
Definition: ncbi_localip.c:102
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
USING_SCOPE(objects)
static bool s_AreListsEqual(list< TGi > &list1, list< TGi > &list2)
static bool s_GetFilteredGiList(CRef< CSeq_align > sa, vector< TGi > &vec_all_gis, list< TGi > &list_sa_filtered)
static void s_Check_GiEquivalenceInDB(TGi gi1, TGi gi2, CRef< CSeqDB > db)
static void s_LoadSeqAlignsFromFile(CSeq_align_set &aln_all, const string &fname)
static void s_GetUseThisGiEntries(CRef< CSeq_align > sa, list< TGi > &list_gis)
void s_Unstringify(const string &s, ASNOBJ &a)
void s_Stringify(const ASNOBJ &a, string &s)
static bool s_IsGiInVector(TGi gi, vector< TGi > &vec_gis)
CRef< ASNOBJ > s_Duplicate(const ASNOBJ &a)
static bool s_IsGiInList(TGi gi, list< TGi > &list_gis)
static void s_DoEquivalenceCheck(CRef< CSeq_align > sa_new, CRef< CSeqDB > db)
static void s_Check_GiListConsistency(CRef< CSeq_align >, CRef< CSeq_align > sa_new, list< TGi > &list_orig_filtered, list< TGi > &list_new_filtered)
static TGi s_GetAlignedSeqGi(CRef< CSeq_align > sa)
BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)
static void s_DoConsistencyCheck(CRef< CSeq_align > sa_orig, CRef< CSeq_align > sa_new, vector< TGi > &vec_all_gis)
static void s_GetFullGiList(CRef< CSeq_align > sa, list< TGi > &list_gis)
static bool s_IsListSubset(list< TGi > &list_all, list< TGi > &list_sub)
Defines BLAST database access classes.
Defines a concrete strategy for the IBlastSeqInfoSrc interface for sequence identifiers retrieval fro...
Utility stuff for more convenient using of Boost.Test library.
Defines location of test data folder at NCBI.
static const char * NCBI_GetTestDataPath(void)
Get the directory where test data is stored at NCBI.
Modified on Thu Dec 07 10:11:11 2023 by modify_doxy.py rev. 669887