NCBI C++ ToolKit
cpgdemo.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cpgdemo.cpp 91964 2020-12-17 12:53:46Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Philip Johnson
27 *
28 * File Description: cpgdemo -- demo program for cpg island finder
29 *
30 * ===========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/sequence/cpg.hpp>
35 
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39 
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 
47 
48 class CCpGDemoApp : public CNcbiApplication {
49 public:
51  virtual void Init(void);
52  virtual int Run(void);
53 };
54 
55 /*---------------------------------------------------------------------------*/
56 
58 {
59  unique_ptr<CArgDescriptions> argDescr(new CArgDescriptions);
60  argDescr->AddDefaultKey("gc", "gcContent", "calibrated organism %GC "
61  "content (ie. human: 50, rat: 48)",
63  argDescr->AddDefaultKey("cpg", "obsexp",
64  "observed / expected CpG percentage",
66  argDescr->AddDefaultKey("win", "window_size",
67  "width of sliding window",
69  argDescr->AddDefaultKey("len", "min_length",
70  "minimum length of an island",
72  argDescr->AddOptionalKey("m", "merge_isles",
73  "merge adjacent islands within the specified "
74  "distance of each other",
76 
77 
78 
79  argDescr->AddOptionalKey("a", "accession",
80  "Single accession", CArgDescriptions::eString);
81  argDescr->AddOptionalKey("i", "infile",
82  "List of accessions",
84  argDescr->AddExtra(0,99, "fasta files", CArgDescriptions::eInputFile);
85 
86 
87  argDescr->SetUsageContext(GetArguments().GetProgramBasename(),
88  "Scans sequences for CpG islands; uses algorithm based upon Takai & Jones, 2002. Output sent to stdout.\n", false);
89  SetupArgDescriptions(argDescr.release());
90 }
91 
92 //---------------------------------------------------------------------------
93 // PRE : open output stream, populated CpG island struct
94 // POST: <cpg start> <cpg stop> <%G + C> <obs/exp CpG>
96 {
97  unsigned int len = i.m_Stop - i.m_Start + 1;
98  o << i.m_Start << "\t" << i.m_Stop << "\t" <<
99  (double) (i.m_C + i.m_G) / len << "\t" <<
100  (double) i.m_CG * len / (i.m_C * i.m_G);
101  return o;
102 };
103 
104 //---------------------------------------------------------------------------
105 // PRE : accession to scan, scope in which to resolve accession, CArgs
106 // containing cpg island-scanning parameters
107 // POST: 0 if successful, any islands found in the given accession according
108 // to the arguments have been sent to cout
109 int ScanForCpGs(const string& acc, CScope &scope, const CArgs& args)
110 {
111  CRef<CSeq_id> seq_id;
112  try {
113  seq_id.Reset(new CSeq_id(acc));
114  } catch (CSeqIdException& e) {
115  cerr << "Invalid seq-id: '" << acc << "': " << e.what() << endl;
116  return 1;
117  }
118 
119  CBioseq_Handle bioseq_handle = scope.GetBioseqHandle(*seq_id);
120 
121  if (!bioseq_handle) {
122  cerr << "Bioseq load FAILED." << endl;
123  return 2;
124  }
125 
126  CSeqVector sv =
128  string seqString;
129  seqString.reserve(sv.size());
130  sv.GetSeqData(0, sv.size(), seqString);
131 
132  CCpGIslands cpgIsles(seqString.data(), seqString.length(),
133  args["win"].AsInteger(), args["len"].AsInteger(),
134  args["gc"].AsInteger(), args["cpg"].AsInteger());
135  if (args["m"]) {
136  cpgIsles.MergeIslesWithin(args["m"].AsInteger());
137  }
138 
139  ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
140  cout << acc << "\t" << *i << endl;
141  }
142 
143  return 0;
144 }
145 
146 //---------------------------------------------------------------------------
147 // PRE : fasta file to scan, scope in which to resolve accession, CArgs
148 // containing cpg island-scanning parameters
149 // POST: 0 if successful, any islands found in the given accession according
150 // to the arguments have been sent to cout
151 int ScanForCpGs(istream &infile, const CArgs &args)
152 {
153  string localID;
154 
155  infile >> localID;
156  //skip rest of line
157  infile.ignore(numeric_limits<int>::max(), '\n');
158 
159  while (infile) {
160  if (localID[0] != '>') {
161  ERR_POST(Critical << "FASTA file garbled around '" <<localID<<"'");
162  return 1;
163  }
164 
165  //read in nucleotides
166  string seqString, lineBuff;
167  while (!infile.eof() && infile.peek() != '>') {
168  getline(infile, lineBuff);
169  if (seqString.size() + lineBuff.size() > seqString.capacity())
170  seqString.reserve(seqString.capacity() * 2);
171  seqString += lineBuff;
172  }
173 
174  //scan
175  CCpGIslands cpgIsles(seqString.data(), seqString.length(),
176  args["win"].AsInteger(), args["len"].AsInteger(),
177  args["gc"].AsInteger(), args["cpg"].AsInteger());
178  if (args["m"]) {
179  cpgIsles.MergeIslesWithin(args["m"].AsInteger());
180  }
181 
182  //output
183  ITERATE(CCpGIslands::TIsles, i, cpgIsles.GetIsles()) {
184  cout << localID << "\t" << *i << endl;
185  }
186 
187  infile >> localID;
188  infile.ignore(numeric_limits<int>::max(), '\n');
189  }
190 
191  return 0;
192 }
193 
194 
195 /*---------------------------------------------------------------------------*/
196 
198 {
199  const CArgs& args = GetArgs();
202 
203  CScope scope(*objMgr);
204  scope.AddDefaults();
205  int retCode = 0;
206 
207  cout.precision(2);
208  cout.setf(ios::fixed, ios::floatfield);
209  cout << "# CpG islands. Win:" << args["win"].AsInteger()
210  << "; Min len:" << args["len"].AsInteger() << "; Min %GC:"
211  << args["gc"].AsDouble() << "; Min obs/exp CpG: "
212  << args["cpg"].AsDouble();
213  if (args["m"]) {
214  cout << "; Merge islands within: " << args["m"].AsInteger();
215  }
216  cout << endl;
217  cout << "# label\tisle_start\tisle_stop\t%GC\tobs/exp CpG" << endl;
218 
219  if (args["a"]) {
220  retCode = ScanForCpGs(args["a"].AsString(), scope, args);
221  }
222 
223  if (args["i"]) {
224  istream &infile = args["i"].AsInputFile();
225  string acc;
226 
227  infile >> acc;
228  while (infile.good()) {
229  cerr << "Processing " << acc << endl;
230  if (ScanForCpGs(acc, scope, args) != 0) {
231  retCode = 3;
232  }
233  infile >> acc;
234  }
235  }
236 
237  for (unsigned int i = 1; i <= args.GetNExtra(); ++i) {
238  if (!ScanForCpGs(args[i].AsInputFile(), args)) {
239  retCode = 3;
240  }
241  }
242 
243  return retCode;
244 }
245 
246 //---------------------------------------------------------------------------
247 int main(int argc, char** argv)
248 {
249  CCpGDemoApp theApp;
250  return theApp.AppMain(argc, argv, NULL, eDS_Default, 0);
251 }
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
CCpGDemoApp(void)
Definition: cpgdemo.cpp:50
virtual int Run(void)
Run the application.
Definition: cpgdemo.cpp:197
virtual void Init(void)
Initialize the application.
Definition: cpgdemo.cpp:57
list< SCpGIsland > TIsles
Definition: cpg.hpp:57
void MergeIslesWithin(unsigned int range)
Definition: cpg.cpp:204
const TIsles & GetIsles(void) const
Definition: cpg.hpp:95
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
CScope –.
Definition: scope.hpp:92
CSeqIdException –.
Definition: Seq_id.hpp:1001
CSeqVector –.
Definition: seq_vector.hpp:65
USING_SCOPE(objects)
int ScanForCpGs(const string &acc, CScope &scope, const CArgs &args)
Definition: cpgdemo.cpp:109
int main(int argc, char **argv)
Definition: cpgdemo.cpp:247
CNcbiOstream & operator<<(CNcbiOstream &o, SCpGIsland i)
Definition: cpgdemo.cpp:95
USING_NCBI_SCOPE
Definition: cpgdemo.cpp:45
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void DisableArgDescriptions(TDisableArgDesc disable=fDisableStdArgs)
Definition: ncbiapp.cpp:1319
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
Definition: ncbiargs.hpp:422
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
void Critical(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1203
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
int i
int len
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
T max(T x_, T y_)
The Object manager core.
static FILE * infile
Definition: pcre2test.c:950
Modified on Fri Sep 20 14:57:25 2024 by modify_doxy.py rev. 669887