NCBI C++ ToolKit
unit_test_orf.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: unit_test_orf.cpp 93283 2021-03-26 14:48:04Z chetvern $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mike DiCuccio
27  *
28  * File Description:
29  *   Unit tests for COrf: ORF finding (COrf::FindOrfs) and CDS annotation (COrf::MakeCDSAnnot).
30  * ===========================================================================
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/test_boost.hpp>
39 
40 
43 #include <objmgr/scope.hpp>
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/feat_ci.hpp>
46 
47 #include <serial/serial.hpp>
48 #include <serial/objistr.hpp>
49 #include <serial/objostr.hpp>
50 
53 #include <objects/seq/Bioseq.hpp>
57 
58 #include <algo/sequence/orf.hpp>
59 #include <objmgr/util/sequence.hpp>
60 
63 
64 
// NOTE(review): only the braces of this block survive in this listing; its header
// (listing line 65) and body (listing lines 67-68) are absent. Presumably this is the
// NCBITEST_AUTO_INIT() block named in the index at the end of the page -- confirm
// against the repository copy.
66 {
69 }
70 
71 
// Command-line setup for the test application: declares a mandatory "in" key and an
// optional "out" key on arg_desc.
// NOTE(review): the header (listing line 72) and the last argument of each call
// (listing lines 76 and 80) are absent from this listing. Presumably the header is
// NCBITEST_INIT_CMDLINE(arg_desc) and the missing arguments are the eInputFile /
// eOutputFile argument types listed in the index at the end of the page -- confirm.
73 {
// Mandatory key: consumed by TestUsingArg through args["in"].AsInputFile().
74  arg_desc->AddKey("in", "input",
75  "Seq-annot of ORFs",
77 
// Optional key: when given, TestUsingArg writes the annotations it generates there.
78  arg_desc->AddOptionalKey("out", "output",
79  "Output seq-annot of ORFs",
81 }
82 
// Data-driven test: reads reference Seq-annots from the "in" argument one at a time,
// re-runs COrf::FindOrfs + COrf::MakeCDSAnnot on the sequence each annot refers to,
// and checks that the regenerated annot equals the reference.
// NOTE(review): listing line 85 is absent here; presumably it sets up `om`, which is
// dereferenced on the next line -- confirm against the repository copy.
83 BOOST_AUTO_TEST_CASE(TestUsingArg)
84 {
86  CScope scope(*om);
87  scope.AddDefaults();
88 
89  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
90  CNcbiIstream& istr = args["in"].AsInputFile();
// ostr stays NULL when the optional "out" argument was not supplied.
91  CNcbiOstream* ostr = args["out"] ? &args["out"].AsOutputFile() : NULL;
92 
// One iteration per Seq-annot in the input stream; the loop ends on CEofException.
93  while(true) {
94 
95  CRef<CSeq_annot> ref_annot(new CSeq_annot);
96 
97  try {
98  istr >> MSerial_AsnText >> *ref_annot;
99  } catch (CEofException&) {
100  break;
101  }
102 
// The comment of the first annot descriptor carries the test parameters.
// NOTE(review): front() is called without an emptiness check, so every input annot
// is assumed to have at least one descriptor.
103  const CAnnot_descr& desc = ref_annot->GetDesc();
104  string comment = desc.Get().front()->GetComment();
105 
// Split the comment on spaces into non-empty tokens. The token "longest" switches on
// longest_orfs and is removed; the remaining tokens are passed as allowable starts.
106  vector<string> allowable_starts;
107  bool longest_orfs = false;
108  NStr::Split(comment, " ", allowable_starts, NStr::fSplit_Tokenize);
109  auto longest = find(allowable_starts.begin(), allowable_starts.end(), "longest");
110  if (longest != allowable_starts.end()) {
111  longest_orfs = true;
112  allowable_starts.erase(longest);
113  }
114 
// The sequence under test is the one referenced by the location of the first feature.
115  const CSeq_id& seq_id = *ref_annot->GetData().GetFtable().front()->GetLocation().GetId();
116  CBioseq_Handle bsh = scope.GetBioseqHandle(seq_id);
117 
// Use the genetic code from the sequence's org-ref when set, otherwise fall back to 1.
118  const COrg_ref &org = sequence::GetOrg_ref(bsh);
119  int gcode = org.IsSetGcode() ? org.GetGcode() : 1;
120 
121 
122  COrf::TLocVec loc_vec;
// Inner scope limits the lifetime of the sequence vector to the ORF search.
123  {{
124  CSeqVector sv(bsh);
125  size_t min_length = 300; //=100 codons options in gbench
126  size_t max_seq_gap = 30;
127  COrf::FindOrfs(sv, loc_vec, min_length, gcode, allowable_starts, longest_orfs, max_seq_gap);
128  }}
129 
// MakeCDSAnnot takes a non-const CSeq_id*, hence the const_cast on the reference id.
130  CRef<CSeq_annot> my_annot = COrf::MakeCDSAnnot(loc_vec, gcode, const_cast<CSeq_id*>(&seq_id));
131 
// Copy the reference descriptor so that the Equals() check below also covers it.
132  my_annot->SetDesc().Assign(desc);
133 
134  if(ostr) {
135  *ostr << MSerial_AsnText << *my_annot;
136  }
137 
138  BOOST_CHECK(my_annot->Equals(*ref_annot));
139  }
140 }
141 
142 
// Checks that COrf::FindOrfs does not throw on a sequence made of short stretches of
// real bases separated by runs of N, and that it reports exactly two ORFs for it.
143 BOOST_AUTO_TEST_CASE(tiny_islands)
144 {
145  string seq =
146  "ANNNNNNNNNNNNANNNNNNNNNNNNAANNNNNNNNNNNNAA"
147  "ANNNNNNNNNNNNATGTGANNNNNNNNNNNNATGTGAANNNNNNNNNNNNAA";
// Single-brace list with one start codon; the double-brace form only works by accident
// for one element (with two it would select string's iterator-pair constructor).
148  vector<string> allowable_starts = {"ATG"};
149  COrf::TLocVec loc_vec;
// Arguments after loc_vec: min_length_bp = 0, genetic_code = 1, allowable_starts,
// longest_orfs = false, max_seq_gap = 10.
150  BOOST_CHECK_NO_THROW(
151  COrf::FindOrfs(seq, loc_vec, 0, 1, allowable_starts, false, 10)
152  );
// Unsigned literal matches the unsigned type of size(), so the comparison performed
// inside BOOST_CHECK_EQUAL does not mix signed and unsigned operands.
153  BOOST_CHECK_EQUAL(loc_vec.size(), 2u);
154 }
155 
// NOTE(review): the BOOST_AUTO_TEST_CASE header of this test (listing line 156) is
// absent from this listing, so the test's name is not visible here.
// The test runs COrf::FindOrfs on a sequence that starts with N and verifies that the
// call does not throw, that every returned location has start < stop, and that at
// least one location is returned.
157 {
158  // the very last N caused lookup beyond sequence
159  // the very first N would become the last on minus strand,
160  // can be extended beyond sequence and
161  // become negative when converted back
162  string seq =
163  "NTCACCTTTTCGCCCCTCGGCGACTTACTTTGAGAGGCCAAAGTAAGCAAAGCCTTTTGCTCCGGTTCC";
// Left empty: no explicit start codons are passed to FindOrfs.
164  vector<string> allowable_starts;
165  COrf::TLocVec loc_vec;
// Arguments after loc_vec: min_length_bp = 60, genetic_code = 11, allowable_starts,
// longest_orfs = true, max_seq_gap = 10000.
166  BOOST_CHECK_NO_THROW(COrf::FindOrfs(seq, loc_vec, 60, 11, allowable_starts,
167  true, // longest_orfs
168  10000));
// Positional (numerical) extremes must be ordered for every location found.
169  for (auto& loc: loc_vec) {
170  BOOST_CHECK(loc->GetStart(eExtreme_Positional) <
171  loc->GetStop(eExtreme_Positional));
172  }
// Guards against the loop above passing vacuously on an empty result.
173  BOOST_CHECK(loc_vec.size() > 0);
174 }
// NOTE(review): the BOOST_AUTO_TEST_CASE header of this test (listing line 175) is
// absent from this listing, so the test's name is not visible here.
// The test runs COrf::FindOrfs on a short sequence containing two N bases and expects
// exactly one returned location that is not partial at its biological stop.
176 {
177  // the first N used to be ignored in codon state calculation
178  string seq =
179  "ATGCTGANAA"
180  "ATGTTGNAAA"
181  "ATGCCCTGA";
182  vector<string> allowable_starts = {"ATG"};
183  COrf::TLocVec loc_vec;
// Arguments after loc_vec: min_length_bp = 3, genetic_code = 11, allowable_starts,
// longest_orfs = true, max_seq_gap = 10000.
184  BOOST_CHECK_NO_THROW(COrf::FindOrfs(seq, loc_vec, 3, 11, allowable_starts,
185  true, // longest_orfs
186  10000));
// Diagnostic output only: prints the biological start..stop of each location to cerr.
187  for (auto& loc: loc_vec) {
188  cerr << loc->GetStart(eExtreme_Biological) << ".." <<
189  loc->GetStop(eExtreme_Biological) << endl;
190  }
// Count the locations whose biological stop is not partial; exactly one is expected.
191  BOOST_CHECK_EQUAL(count_if(loc_vec.begin(),loc_vec.end(),
192  [](const CRef<CSeq_loc>& a){
193  return !a->IsPartialStop(eExtreme_Biological);
194  }),
195  1);
196 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
CAnnot_descr –.
Definition: Annot_descr.hpp:66
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
vector< CRef< objects::CSeq_loc > > TLocVec
Definition: orf.hpp:55
static CRef< objects::CSeq_annot > MakeCDSAnnot(const TLocVec &orfs, int genetic_code=1, objects::CSeq_id *id=NULL)
This version returns an annot full of CDS features.
Definition: orf.cpp:438
static void FindOrfs(const string &seq, TLocVec &results, unsigned int min_length_bp=3, int genetic_code=1, const vector< string > &allowable_starts=vector< string >(), bool longest_orfs=true, size_t max_seq_gap=k_default_max_seq_gap)
Find ORFs in both orientations.
Definition: orf.cpp:336
bool IsSetGcode(void) const
Definition: Org_ref.cpp:129
int GetGcode(void) const
Definition: Org_ref.cpp:134
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
#define NULL
Definition: ncbistd.hpp:225
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
const Tdata & Get(void) const
Get the member data.
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Seq_annot_.hpp:852
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
unsigned int a
Definition: ncbi_localip.c:102
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
The Object manager core.
CRef< objects::CObjectManager > om
Utility stuff for more convenient using of Boost.Test library.
USING_SCOPE(objects)
BOOST_AUTO_TEST_CASE(TestUsingArg)
NCBITEST_INIT_CMDLINE(arg_desc)
NCBITEST_AUTO_INIT()
USING_NCBI_SCOPE
Modified on Fri Sep 20 14:58:28 2024 by modify_doxy.py rev. 669887