NCBI C++ ToolKit
unit_test_internal_stops.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_internal_stops.cpp 93111 2021-03-05 15:18:46Z chetvern $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mike DiCuccio
27  *
28  * File Description:
29  *
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/test_boost.hpp>
39 
42 #include <objmgr/scope.hpp>
43 
44 #include <serial/serial.hpp>
45 #include <serial/objistr.hpp>
46 #include <serial/objostr.hpp>
47 
49 
52 
54 {
55  // Here we make descriptions of command line parameters that we are
56  // going to use.
57 
58 }
59 
61 {
64 }
65 
67 {
69  CScope scope(*om);
70  scope.AddDefaults();
71 
72 string buf = " \
73 Seq-align ::= { \
74  type disc, \
75  dim 2, \
76  segs spliced { \
77  product-id gi 148225248, \
78  genomic-id gi 224514980, \
79  genomic-strand plus, \
80  product-type protein, \
81  exons { \
82  { \
83  product-start protpos { \
84  amin 22, \
85  frame 1 \
86  }, \
87  product-end protpos { \
88  amin 277, \
89  frame 3 \
90  }, \
91  genomic-start 30641728, \
92  genomic-end 30642468, \
93  parts { \
94  diag 69, \
95  product-ins 3, \
96  diag 30, \
97  product-ins 4, \
98  diag 494, \
99  product-ins 2, \
100  diag 85, \
101  product-ins 18, \
102  diag 63 \
103  }, \
104  partial TRUE \
105  } \
106  }, \
107  product-length 278, \
108  modifiers { \
109  stop-codon-found TRUE \
110  } \
111  } \
112 } \
113 ";
114 
115  CNcbiIstrstream istrs(buf);
116 
117  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
118 
119  CSeq_align align;
120  *istr >> align;
121 
122  BOOST_CHECK_NO_THROW(align.Validate(true));
123 
124  CInternalStopFinder int_stop_finder(scope);
125 
126  set<TSeqPos> stops = int_stop_finder.FindStops(align);
127 
128  BOOST_CHECK_EQUAL( stops.size(), 2U );
129 }
130 
131 /*
132 BOOST_AUTO_TEST_CASE(TestMRNA)
133 {
134  CRef<CObjectManager> om = CObjectManager::GetInstance();
135  CScope scope(*om);
136  scope.AddDefaults();
137 
138 string buf = " \
139 Seq-align ::= { \
140  type disc, \
141  dim 2, \
142  segs spliced { \
143  product-id gi 178405, \
144  genomic-id gi 224514917, \
145  product-strand plus, \
146  genomic-strand minus, \
147  product-type transcript, \
148  exons { \
149  { \
150  product-start nucpos 0, \
151  product-end nucpos 2083, \
152  genomic-start 78159147, \
153  genomic-end 78161233, \
154  parts { \
155  match 159, \
156  genomic-ins 1, \
157  match 25, \
158  genomic-ins 1, \
159  match 47, \
160  product-ins 1, \
161  match 361, \
162  genomic-ins 2, \
163  match 1491 \
164  } \
165  } \
166  }, \
167  product-length 2084 \
168  } \
169 } \
170 ";
171 
172  CNcbiIstrstream istrs(buf.c_str());
173 
174  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
175 
176  CSeq_align align;
177  *istr >> align;
178 
179  BOOST_CHECK_NO_THROW(align.Validate(true));
180 
181  CInternalStopFinder int_stop_finder(scope);
182 
183  set<TSeqPos> stops = int_stop_finder.FindStops(align);
184 
185  BOOST_CHECK_EQUAL( stops.size(), 9U );
186 }
187 */
188 
189 
190 BOOST_AUTO_TEST_CASE(TestStartStopProteinWithPadding)
191 {
193  CScope scope(*om);
194  scope.AddDefaults();
195 
196 string buf = " \
197 Seq-align ::= { \
198  type disc, \
199  dim 2, \
200  segs spliced { \
201  product-id gi 148225248, \
202  genomic-id gi 224514980, \
203  genomic-strand plus, \
204  product-type protein, \
205  exons { \
206  { \
207  product-start protpos { \
208  amin 22, \
209  frame 1 \
210  }, \
211  product-end protpos { \
212  amin 277, \
213  frame 3 \
214  }, \
215  genomic-start 30641728, \
216  genomic-end 30642468, \
217  parts { \
218  diag 69, \
219  product-ins 3, \
220  diag 30, \
221  product-ins 4, \
222  diag 494, \
223  product-ins 2, \
224  diag 85, \
225  product-ins 18, \
226  diag 63 \
227  }, \
228  partial TRUE \
229  } \
230  }, \
231  product-length 278, \
232  modifiers { \
233  stop-codon-found TRUE \
234  } \
235  } \
236 } \
237 ";
238 
239  CNcbiIstrstream istrs(buf);
240 
241  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
242 
243  CSeq_align align;
244  *istr >> align;
245 
246  BOOST_CHECK_NO_THROW(align.Validate(true));
247 
248  CInternalStopFinder int_stop_finder(scope);
249 
250  pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,3);
251 
252  BOOST_CHECK_EQUAL( starts_stops.first.size(), 10U );
253  BOOST_CHECK_EQUAL( starts_stops.second.size(), 3U );
254 }
255 
256 BOOST_AUTO_TEST_CASE(TestStartAcrossTheOrigin)
257 {
259  CScope scope(*om);
260  scope.AddDefaults();
261 
262 string buf = " \
263 Seq-align ::= { \
264  type disc, \
265  dim 2, \
266  segs spliced { \
267  product-id gi 488735231, \
268  genomic-id gi 6382081, \
269  genomic-strand plus, \
270  product-type protein, \
271  exons { \
272  { \
273  product-start protpos { \
274  amin 2, \
275  frame 3 \
276  }, \
277  product-end protpos { \
278  amin 430, \
279  frame 3 \
280  }, \
281  genomic-start 1, \
282  genomic-end 1285, \
283  parts { \
284  diag 1285 \
285  }, \
286  partial FALSE \
287  } \
288  }, \
289  product-length 431, \
290  modifiers { \
291  stop-codon-found TRUE \
292  } \
293  } \
294 } \
295 ";
296 
297  CNcbiIstrstream istrs(buf);
298 
299  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
300 
301  CSeq_align align;
302  *istr >> align;
303 
304  BOOST_CHECK_NO_THROW(align.Validate(true));
305 
306  CInternalStopFinder int_stop_finder(scope);
307 
308  pair<set<TSeqPos>, set<TSeqPos> > starts_stops = int_stop_finder.FindStartsStops(align,11);
309 
310  BOOST_CHECK_EQUAL( *starts_stops.first.rbegin(), 30740U );
311 }
312 
313 BOOST_AUTO_TEST_CASE(TestStopInGap)
314 {
316  CScope scope(*om);
317  scope.AddDefaults();
318 
319 string buf = " \
320 Seq-align ::= { \
321  type disc, \
322  dim 2, \
323  segs spliced { \
324  product-id gi 487809918, \
325  genomic-id gi 341576043, \
326  genomic-strand minus, \
327  product-type protein, \
328  exons { \
329  { \
330  product-start protpos { \
331  amin 10, \
332  frame 1 \
333  }, \
334  product-end protpos { \
335  amin 165, \
336  frame 3 \
337  }, \
338  genomic-start 84235, \
339  genomic-end 84618, \
340  parts { \
341  diag 6, \
342  genomic-ins 6, \
343  diag 48, \
344  product-ins 3, \
345  diag 12, \
346  product-ins 15, \
347  diag 18, \
348  product-ins 72, \
349  diag 294 \
350  }, \
351  partial TRUE \
352  } \
353  }, \
354  product-length 166, \
355  modifiers { \
356  start-codon-found TRUE, \
357  stop-codon-found TRUE \
358  } \
359  } \
360 } \
361 ";
362 
363  CNcbiIstrstream istrs(buf);
364 
365  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
366 
367  CSeq_align align;
368  *istr >> align;
369 
370  BOOST_CHECK_NO_THROW(align.Validate(true));
371 
372  CInternalStopFinder int_stop_finder(scope);
373 
374  CFeatureGenerator fg(scope);
376  CConstRef<CSeq_align> clean_alignment = fg.CleanAlignment(align);
377  set<TSeqPos> stops = int_stop_finder.FindStops(*clean_alignment);
378 
379  BOOST_CHECK_EQUAL( *stops.begin(), 84609U );
380 }
381 
382 BOOST_AUTO_TEST_CASE(TestAltStarts)
383 {
385  CScope scope(*om);
386  scope.AddDefaults();
387 
388 string buf = " \
389 Seq-align ::= { \
390  type disc, \
391  dim 2, \
392  segs spliced { \
393  product-id gi 148225248, \
394  genomic-id gi 224514980, \
395  genomic-strand plus, \
396  product-type protein, \
397  exons { \
398  { \
399  product-start protpos { \
400  amin 22, \
401  frame 1 \
402  }, \
403  product-end protpos { \
404  amin 277, \
405  frame 3 \
406  }, \
407  genomic-start 30641728, \
408  genomic-end 30642468, \
409  parts { \
410  diag 69, \
411  product-ins 3, \
412  diag 30, \
413  product-ins 4, \
414  diag 494, \
415  product-ins 2, \
416  diag 85, \
417  product-ins 18, \
418  diag 63 \
419  }, \
420  partial TRUE \
421  } \
422  }, \
423  product-length 278, \
424  modifiers { \
425  stop-codon-found TRUE \
426  } \
427  } \
428 } \
429 ";
430 
431  CNcbiIstrstream istrs(buf);
432 
433  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
434 
435  CSeq_align align;
436  *istr >> align;
437 
438  BOOST_CHECK_NO_THROW(align.Validate(true));
439 
440  CInternalStopFinder int_stop_finder(scope);
441 
443  TStarts starts = int_stop_finder.FindStartStopRanges(align,3).first;
444 
445  int atg_starts = 0;
446  ITERATE(TStarts, s, starts) {
447  if (s->second == "ATG")
448  ++atg_starts;
449  }
450 
451  BOOST_CHECK_EQUAL( atg_starts, 3 );
452 }
453 
454 BOOST_AUTO_TEST_CASE(TestAtCircularEnd)
455 {
457  CScope scope(*om);
458  scope.AddDefaults();
459 
460 string buf = " \
461 Seq-align ::= { \
462  type disc, \
463  dim 2, \
464  segs spliced { \
465  product-id gi 490246827, \
466  genomic-id gi 364515570, \
467  genomic-strand minus, \
468  product-type protein, \
469  exons { \
470  { \
471  product-start protpos { \
472  amin 0, \
473  frame 1 \
474  }, \
475  product-end protpos { \
476  amin 628, \
477  frame 3 \
478  }, \
479  genomic-start 5332055, \
480  genomic-end 5333941, \
481  parts { \
482  match 3, \
483  diag 1884 \
484  } \
485  } \
486  }, \
487  product-length 629, \
488  modifiers { \
489  start-codon-found TRUE, \
490  stop-codon-found TRUE \
491  } \
492  } \
493 } \
494 ";
495 
496  CNcbiIstrstream istrs(buf);
497 
498  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
499 
500  CSeq_align align;
501  *istr >> align;
502 
503  BOOST_CHECK_NO_THROW(align.Validate(true));
504 
505  CInternalStopFinder int_stop_finder(scope);
506 
507  for (int pad=33; pad > 30; --pad) {
508  auto starts_stops_ranges = int_stop_finder.FindStartStopRanges(align, pad);
509 
510  BOOST_CHECK( starts_stops_ranges.first.find(TSeqRange(5333941,5333939)) != starts_stops_ranges.first.end() );
511  BOOST_CHECK_EQUAL( starts_stops_ranges.first[TSeqRange(5333941,5333939)], "ATG" );
512  }
513 }
514 
515 BOOST_AUTO_TEST_CASE(TestReportGaps)
516 {
518  CScope scope(*om);
519  scope.AddDefaults();
520 
521 string buf = " \
522 Seq-align ::= { \
523  type disc, \
524  dim 2, \
525  segs spliced { \
526  product-id gi 487427171, \
527  genomic-id gi 357958168, \
528  genomic-strand plus, \
529  product-type protein, \
530  exons { \
531  { \
532  product-start protpos { \
533  amin 3, \
534  frame 1 \
535  }, \
536  product-end protpos { \
537  amin 96, \
538  frame 3 \
539  }, \
540  genomic-start 31, \
541  genomic-end 312, \
542  parts { \
543  diag 282 \
544  }, \
545  partial TRUE \
546  } \
547  }, \
548  product-length 97 \
549  } \
550 } \
551 ";
552 
553  CNcbiIstrstream istrs(buf);
554 
555  unique_ptr<CObjectIStream> istr(CObjectIStream::Open(eSerial_AsnText, istrs));
556 
557  CSeq_align align;
558  *istr >> align;
559 
560  BOOST_CHECK_NO_THROW(align.Validate(true));
561 
562  CInternalStopFinder int_stop_finder(scope);
563 
565  auto starts_stops = int_stop_finder.FindStartStopRanges(align, 1100, &gaps);
566 
567  BOOST_CHECK_EQUAL( gaps.size(), 3U );
568  auto it = gaps.begin();
569  BOOST_CHECK_EQUAL( it->GetFrom(), -10 );
570  BOOST_CHECK_EQUAL( it->GetTo(), -1 );
571  ++it;
572  BOOST_CHECK_EQUAL( it->GetFrom(), 26 );
573  BOOST_CHECK_EQUAL( it->GetTo(), 28 );
574  ++it;
575  BOOST_CHECK_EQUAL( it->GetFrom(), 1405 );
576  BOOST_CHECK_EQUAL( it->GetTo(), 1414 );
577 }
static void pad(Char *s)
Definition: bzip2.c:908
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
set< TSeqPos > FindStops(const CSeq_align &align)
pair< map< TSeqRange, string >, set< TSeqRange > > FindStartStopRanges(const CSeq_align &align, int padding=0, set< TSignedSeqRange > *gaps=nullptr)
pair< set< TSeqPos >, set< TSeqPos > > FindStartsStops(const CSeq_align &align, int padding=0)
CScope –.
Definition: scope.hpp:92
void Validate(bool full_test=false) const
Definition: Seq_align.cpp:649
Definition: map.hpp:338
Definition: set.hpp:45
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
map< TSeqRange, string > TStarts
char * buf
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
The Object manager core.
CRef< objects::CObjectManager > om
Utility stuff for more convenient using of Boost.Test library.
USING_SCOPE(objects)
NCBITEST_INIT_CMDLINE(arg_desc)
BOOST_AUTO_TEST_CASE(TestProtein)
Modified on Fri Sep 20 14:57:30 2024 by modify_doxy.py rev. 669887