NCBI C++ ToolKit
segmasker.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: segmasker.cpp 92134 2020-12-22 16:50:26Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file segmasker.cpp
31  * SEG filtering application
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 
37 // Objects includes
39 
40 // Filtering applications IO
50 
51 // Object manager includes
53 #include <objmgr/bioseq_handle.hpp>
54 
55 #include <algo/segmask/segmask.hpp>
56 
57 #ifndef SKIP_DOXYGEN_PROCESSING
60 #endif /* SKIP_DOXYGEN_PROCESSING */
61 
62 /////////////////////////////////////////////////////////////////////////////
63 // SegMaskerApplication::
64 
65 
67 {
68 public:
69  /// Application constructor
72  version->SetVersionInfo(1, 0, 0);
74  }
75 
76 private:
77  /** @inheritDoc */
78  virtual void Init(void);
79  /** @inheritDoc */
80  virtual int Run(void);
81  /** @inheritDoc */
82  virtual void Exit(void);
83 
84  /// Retrieves the sequence reader interface for the application
86  /// Retrieves the output writer interface for the application
88 
89  /// Contains the description of this application
90  static const char * const USAGE_LINE;
91 };
92 
93 /////////////////////////////////////////////////////////////////////////////
94 // Init: define the command-line arguments accepted by the application
95 
96 const char * const SegMaskerApplication::USAGE_LINE
97  = "Low complexity region masker based on the SEG algorithm";
98 
100 {
102 
103  // Create command-line argument descriptions class
104  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
105 
106  // Specify USAGE context
107  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
108  USAGE_LINE);
109 
110  arg_desc->SetCurrentGroup("Input/output options");
111  arg_desc->AddDefaultKey(kInput, "input_file_name",
112  "input file name",
114  arg_desc->AddDefaultKey(kOutput, "output_file_name",
115  "output file name",
117  arg_desc->AddDefaultKey(kInputFormat, "input_format",
118  "controls the format of the masker input",
120  CArgAllow_Strings* strings_allowed = new CArgAllow_Strings();
121  for (size_t i = 0; i < kNumInputFormats; i++) {
122  strings_allowed->Allow(kInputFormats[i]);
123  }
124  arg_desc->SetConstraint(kInputFormat, strings_allowed);
125  arg_desc->AddFlag ( "parse_seqids",
126  "Parse Seq-ids in FASTA input", true );
127 
128  arg_desc->AddDefaultKey(kOutputFormat, "output_format",
129  "controls the format of the masker output",
131  strings_allowed = new CArgAllow_Strings();
132  for (size_t i = 0; i < kNumOutputFormats; i++) {
133  strings_allowed->Allow(kOutputFormats[i]);
134  }
135  arg_desc->SetConstraint(kOutputFormat, strings_allowed);
136 
137  arg_desc->SetCurrentGroup("SEG algorithm options");
138  arg_desc->AddDefaultKey("window", "integer_value", "SEG window",
141  arg_desc->AddDefaultKey("locut", "float_value", "SEG locut",
144  arg_desc->AddDefaultKey("hicut", "float_value", "SEG hicut",
147 
148  // Setup arg.descriptions for this application
149  SetupArgDescriptions(arg_desc.release());
150 }
151 
154 {
155  const CArgs& args = GetArgs();
156  const string& format(args[kInputFormat].AsString());
157  CMaskReader* retval = NULL;
158 
159  if (format == "fasta") {
160  CNcbiIstream& input = args[kInput].AsInputFile();
161  retval = new CMaskFastaReader(input, false, args["parse_seqids"]);
162  } else if (format == "blastdb") {
163  retval = new CMaskBDBReader(args[kInput].AsString(), false);
164  } else {
165  _ASSERT("Unknown input format" == 0);
166  }
167  return retval;
168 }
169 
172 {
173  const CArgs& args = GetArgs();
174  const string& format(args[kOutputFormat].AsString());
175  CMaskWriter* retval = NULL;
176 
177  if (format == "interval") {
178  CNcbiOstream& output = args[kOutput].AsOutputFile();
179  retval = new CMaskWriterInt(output);
180  } else if (format == "fasta") {
181  CNcbiOstream& output = args[kOutput].AsOutputFile();
182  retval = new CMaskWriterFasta(output);
183  } else if (NStr::StartsWith(format, "seqloc_asn1_binary")) {
184  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
185  retval = new CMaskWriterSeqLoc(output, format);
186  } else if (NStr::StartsWith(format, "seqloc_")) {
187  CNcbiOstream& output = args[kOutput].AsOutputFile();
188  retval = new CMaskWriterSeqLoc(output, format);
189  } else if (NStr::StartsWith(format, "maskinfo_asn1_bin")) {
190  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
191  retval =
195  } else if (NStr::StartsWith(format, "maskinfo_")) {
196  CNcbiOstream& output = args[kOutput].AsOutputFile();
197  retval =
201  } else {
202  throw runtime_error("Unknown output format");
203  }
204  return retval;
205 }
206 
207 /////////////////////////////////////////////////////////////////////////////
208 // Run: read each input sequence, compute its SEG masks, and write them out
209 
210 
212 {
213  int retval = 0;
214  const CArgs& args = GetArgs();
215 
216  try {
217 
219 
220  CSegMasker masker(args["window"].AsInteger(),
221  args["locut"].AsDouble(),
222  args["hicut"].AsDouble());
223 
224  CRef<CSeq_entry> seq_entry;
225  unique_ptr<CMaskReader> reader(x_GetReader());
226  unique_ptr<CMaskWriter> writer(x_GetWriter());
227 
228  while ( (seq_entry = reader->GetNextSequence()).NotEmpty() ) {
229 
230  // Allow skipping of oid
231  if(seq_entry->Which() == CSeq_entry::e_not_set)
232  continue;
233 
234  CScope scope(*objmgr);
235  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*seq_entry);
236  CBioseq_Handle bioseq_handle = seh.GetSeq();
237  CSeqVector sequence_data =
239  unique_ptr<CSegMasker::TMaskList> masks(masker(sequence_data));
240  writer->Print(bioseq_handle, *masks, GetArgs()["parse_seqids"]);
241  // writer->Print(bioseq_handle, *masks);
242 
243  }
244 
245  } catch (const CException& e) {
246  cerr << e.what() << endl;
247  retval = 1;
248  }
249 
250  return retval;
251 }
252 
253 
254 /////////////////////////////////////////////////////////////////////////////
255 // Cleanup
256 
257 
259 {
260  SetDiagStream(0);
261 }
262 
263 
264 /////////////////////////////////////////////////////////////////////////////
265 // MAIN
266 
267 
268 #ifndef SKIP_DOXYGEN_PROCESSING
269 int main(int argc, const char* argv[])
270 {
271  // Build a temporary SegMaskerApplication and pass the command line to AppMain; its result is returned as the process exit status
272  return SegMaskerApplication().AppMain(argc, argv);
273 }
274 #endif /* SKIP_DOXYGEN_PROCESSING */
275 
const double kSegLocut
Locut parameter for SEG.
Definition: blast_seg.c:46
const int kSegWindow
Window that SEG examines at once.
Definition: blast_seg.c:45
const double kSegHicut
Hicut parameter for SEG.
Definition: blast_seg.c:47
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
Class for reading sequences from BLAST databases.
Class for reading sequences from fasta files.
Virtual base class for all input readers.
Definition: mask_reader.hpp:50
Output filter to print masked sequence locations as Blast-db-mask-info objects.
Output filter to write masked data in fasta format.
Output filter to print masked sequences as sets of intervals.
Output filter to print masked sequence locations as NCBI Seq-loc objects.
A base class for winmasker output writers.
Definition: mask_writer.hpp:52
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
This class encapsulates the SEG filtering algorithm.
Definition: segmask.hpp:47
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
virtual void Init(void)
@inheritDoc
Definition: segmasker.cpp:99
SegMaskerApplication()
Application constructor.
Definition: segmasker.cpp:70
virtual int Run(void)
@inheritDoc
Definition: segmasker.cpp:211
CMaskReader * x_GetReader()
Retrieves the sequence reader interface for the application.
Definition: segmasker.cpp:153
CMaskWriter * x_GetWriter()
Retrieves the output writer interface for the application.
Definition: segmasker.cpp:171
static const char *const USAGE_LINE
Contains the description of this application.
Definition: segmasker.cpp:90
virtual void Exit(void)
@inheritDoc
Definition: segmasker.cpp:258
static SQLCHAR output[256]
Definition: print.c:5
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1174
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1312
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
Definition: ncbiargs.cpp:4598
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
@ fBinary
Open file in binary mode.
Definition: ncbiargs.hpp:263
#define NULL
Definition: ncbistd.hpp:225
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
TSeq GetSeq(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
#define CVersion
@ eBlast_filter_program_seg
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_entry_.hpp:228
@ e_not_set
No variant selected.
Definition: Seq_entry_.hpp:88
static int input()
int i
Contains the command line options common to filtering algorithms.
const size_t kNumInputFormats
Number of elements in kInputFormats.
const char * kOutputFormats[]
Output formats allowed, the first one is the default.
const size_t kNumOutputFormats
Number of elements in kOutputFormats.
const std::string kOutput
Command line flag to specify the output.
const std::string kOutputFormat
Command line flag to specify the output format.
const char * kInputFormats[]
Input formats allowed, the first one is the default.
const std::string kInput
Command line flag to specify the input.
const std::string kInputFormat
Command line flag to specify the input format.
string BuildAlgorithmParametersString(const CArgs &args)
Builds an algorithm options string for the filtering applications (segmasker, dustmasker) by examinin...
static int version
Definition: mdb_load.c:29
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
static Format format
Definition: njn_ioutil.cpp:53
The Object manager core.
USING_SCOPE(objects)
int main(int argc, const char *argv[])
Definition: segmasker.cpp:269
USING_NCBI_SCOPE
Definition: segmasker.cpp:58
#define _ASSERT
Modified on Thu May 02 14:32:02 2024 by modify_doxy.py rev. 669887