NCBI C++ ToolKit
blastsrainput_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastsrainput_unit_test.cpp 100964 2023-10-05 15:37:09Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  * Test module for BLAST SRA input library
29  */
30 
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbi_cookies.hpp>
33 #include <corelib/ncbi_message.hpp>
36 #include <corelib/test_boost.hpp>
37 
40 #include <objects/seq/Bioseq.hpp>
43 
45 
47 #include <unordered_map>
48 
49 
50 using namespace std;
51 using namespace ncbi;
52 using namespace ncbi::objects;
53 using namespace ncbi::blast;
54 
56 {
57 public:
59 protected:
61  const TPluginManagerParamTree*) const
62  { return nullptr; }
63 };
64 
66 {
67  // Avoid underlinkage under Apple Developer Tools 15
68  CHttpCookie empty_cookie;
69  CEndpointKey fake_endpoint("1.2.3.4:5", 0);
70  CFakeDataLoaderFactory fake_dl_factory;
71  CMessage_Basic empty_message(kEmptyStr, eDiag_Trace);
72  CPluginManager_DllResolver dummy_resolver;
73 }
74 
76 
77 // Retrieve segment flags for short read sequences
79 {
80  int retval = 0;
81 
82  if (!bioseq.IsSetDescr()) {
83  return 0;
84  }
85 
86  for (auto desc : bioseq.GetDescr().Get()) {
87  if (desc->Which() == CSeqdesc::e_User) {
88 
89  if (!desc->GetUser().IsSetType() ||
90  !desc->GetUser().GetType().IsStr() ||
91  desc->GetUser().GetType().GetStr() != "Mapping") {
92  continue;
93  }
94 
95  BOOST_REQUIRE(desc->GetUser().HasField("has_pair"));
96  const CUser_field& field = desc->GetUser().GetField("has_pair");
97  BOOST_REQUIRE(field.GetData().IsInt());
98 
99  retval = field.GetData().GetInt();
100  }
101  }
102 
103  return retval;
104 }
105 
106 
107 // Check that flags for paired reads are set correctly
108 BOOST_AUTO_TEST_CASE(FlagsForPairedReads)
109 {
110  const bool kCheckForPairs = true;
111  vector<string> accessions = {"SRR4423739"};
112 
113  CSraInputSource input_source(accessions, kCheckForPairs);
114  CBlastInputOMF input(&input_source, 2);
115 
116  unordered_map<string, int> ref_flags = {
117  {"gnl|SRA|SRR4423739.1.1", eFirstSegment},
118  {"gnl|SRA|SRR4423739.1.2", eLastSegment}
119  };
120 
121  CRef<CBioseq_set> queries(new CBioseq_set);
122  input.GetNextSeqBatch(*queries);
123 
124  BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 2u);
125 
126  size_t count = 0;
127  for (auto it : queries->GetSeq_set()) {
128  string id = it->GetSeq().GetFirstId()->AsFastaString();
129  int flags = s_GetSegmentFlags(it->GetSeq());
130  int expected = ref_flags.at(id);
131 
132  BOOST_REQUIRE_MESSAGE(flags == expected, (string)"Segment flag for " +
133  id + " is different from expected " +
134  NStr::IntToString(flags) + " != " +
136  count++;
137  }
138 
139  BOOST_REQUIRE_EQUAL(ref_flags.size(), count);
140 }
141 
142 
143 BOOST_AUTO_TEST_CASE(FlagsForSingleReads)
144 {
145  const bool kCheckForPairs = false;
146  vector<string> accessions = {"SRR4423739"};
147  CSraInputSource input_source(accessions, kCheckForPairs);
148  CBlastInputOMF input(&input_source, 300);
149  CRef<CBioseq_set> queries(new CBioseq_set);
150  input.GetNextSeqBatch(*queries);
151 
152  BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 2u);
153 
154  size_t count = 0;
155  for (auto it : queries->GetSeq_set()) {
156  string id = it->GetSeq().GetFirstId()->AsFastaString();
157  int flags = s_GetSegmentFlags(it->GetSeq());
158  int expected = 0;
159 
160  BOOST_REQUIRE_MESSAGE(flags == expected, (string)"Segment flag for " +
161  id + " is different from expected " +
162  NStr::IntToString(flags) + " != " +
164  count++;
165  }
166 
167  BOOST_REQUIRE_EQUAL(2u, count);
168 }
169 
170 // Test that all input SRA accessions are read
171 BOOST_AUTO_TEST_CASE(MultipleAccessions)
172 {
173  const int kBatchSize = 30000000;
174  vector<string> accessions = {"SRR3720856", "SRR5196091"};
175 
176  CSraInputSource input_source(accessions);
177  CBlastInputOMF input(&input_source, kBatchSize);
178 
179  CRef<CBioseq_set> queries;
180  queries.Reset(new CBioseq_set);
181  input.GetNextSeqBatch(*queries);
182 
183  // the first query must be from the first SRA accession
184  const CSeq_id* fid = queries->GetSeq_set().front()->GetSeq().GetFirstId();
185  BOOST_REQUIRE(fid->GetSeqIdString().find(accessions.front()) != string::npos);
186 
187  // the last query must be from the last SRA accession
188  const CSeq_id* lid = queries->GetSeq_set().back()->GetSeq().GetFirstId();
189  BOOST_REQUIRE(lid->GetSeqIdString().find(accessions.back()) != string::npos);
190 }
191 
192 BOOST_AUTO_TEST_CASE(MultipleAccessionsForceSingle)
193 {
194  const int kBatchSize = 30000000;
195  const bool kCheckForPairs = false;
196  vector<string> accessions = {"SRR3720856", "SRR5196091"};
197 
198  CSraInputSource input_source(accessions, kCheckForPairs);
199  CBlastInputOMF input(&input_source, kBatchSize);
200  CRef<CBioseq_set> queries;
201  queries.Reset(new CBioseq_set);
202  input.GetNextSeqBatch(*queries);
203 
204  // the first query must be from the first SRA accession
205  const CSeq_id* fid = queries->GetSeq_set().front()->GetSeq().GetFirstId();
206  BOOST_REQUIRE(fid->GetSeqIdString().find(accessions.front()) != string::npos);
207 
208  // the last query must be from the last SRA accession
209  const CSeq_id* lid = queries->GetSeq_set().back()->GetSeq().GetFirstId();
210  BOOST_REQUIRE(lid->GetSeqIdString().find(accessions.back()) != string::npos);
211 }
212 
213 
215 
#define static
@ eFirstSegment
The first sequence of a pair with both sequences read and accepted.
@ eLastSegment
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
BOOST_AUTO_TEST_CASE(FlagsForPairedReads)
static int s_GetSegmentFlags(const CBioseq &bioseq)
NCBITEST_AUTO_INIT()
Lightweight representation of just a host and a port.
CDataLoader * CreateAndRegister(CObjectManager &, const TPluginManagerParamTree *) const
CHttpCookie::
Default IMessage implementation: text and severity only.
CObjectManager –.
Service class for DLLs resolution.
Class for reading sequences from SRA repository or SRA file.
definition of a Culling tree
Definition: ncbi_tree.hpp:100
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
Definition: User_field.cpp:211
static uch flags
static const char * expected[]
Definition: bcp.c:42
@ eDiag_Trace
Trace message.
Definition: ncbidiag.hpp:657
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define kEmptyStr
Definition: ncbistr.hpp:123
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5086
const TData & GetData(void) const
Get the Data member data.
bool IsInt(void) const
Check if variant Int is selected.
TInt GetInt(void) const
Get the variant data.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
static int input()
Magic spell ;-) needed for some weird compilers... very empiric.
IMessage/IMessageListener interfaces and basic implementations.
#define count
Plugin manager (using class factory paradigm).
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
Utility stuff for more convenient using of Boost.Test library.
#define const
Definition: zconf.h:232
Modified on Wed Sep 04 15:06:54 2024 by modify_doxy.py rev. 669887