59 "file with unit counts",
62 arg_desc.
AddKey(
"ustat",
"unit_counts",
63 "file with unit counts",
68 "(not optional if used with -mk_counts or -convert options)",
75 "check for duplicate sequences",
79 "indicates that -input represents a file containing "
80 "a list of names of fasta files to process, one name "
84 "memory available for mk_counts option in megabytes",
87 "add metadata to the counts file",
90 "number of bases in a unit",
93 "total size of the genome",
102 "window score above which it is allowed to extend masking",
105 "window score threshold used to trigger masking",
108 "alternative high score for a unit if the"
109 "original unit score is more than highscore",
112 "alternative low score for a unit if the"
113 "original unit score is lower than lowscore",
125 arg_desc.
AddFlag (
"parse_seqids",
126 "Parse Seq-ids in FASTA input",
true );
128 "controls the format of the masker output (for masking stage only)",
138 "maximum useful unit score",
141 "maximum useful unit score as percentage",
144 "window score threshold used to trigger masking as percentage",
147 "window score above which it is allowed to extend masking as percentage",
150 "minimum useful unit score",
153 "minimum useful unit score as percentage",
168 "controls the format of the masker input",
171 "file containing the list of ids to exclude from processing",
174 "file containing the list of ids to process",
177 "match ids as strings",
183 strings_allowed->
Allow(
"seqids");
188 "controls the format of the output file containing the unit counts "
189 "(for counts generation and conversion only)",
196 ->Allow(
"obinary" ) );
198 "target size of the output file containing the unit counts",
203 "combine window masking with dusting",
206 "dust minimum level",
211 arg_desc.
AddFlag(
"mk_counts",
"generate frequency counts for a database" );
212 arg_desc.
AddFlag(
"convert",
"convert counts between different formats" );
264 if(args[
"mk_counts"])
266 else if(args[
"convert"])
268 else if(args[
"ustat"])
272 "one of '-mk_counts', '-convert' or '-ustat <stat_file>' "
273 "must be specified" );
288 if (
format ==
"interval") {
291 }
else if (
format ==
"fasta") {
313 throw runtime_error(
"Unknown output format");
320 : app_type(s_DetermineAppType(args,
type)),
321 is( app_type >= eGenerateMasks && args[
kInputFormat].AsString() !=
"blastdb"
323 ( !(args[
kInput].AsString() ==
"-")
326 lstat_name( app_type >= eGenerateMasks ? args[
"ustat"].AsString() :
"" ),
327 t_low_pct( app_type != eConvertCounts && args[
"t_low_pct"] ? args[
"t_low_pct"].AsDouble() : -1.0 ),
328 t_extend_pct( app_type != eConvertCounts && args[
"t_extend_pct"] ? args[
"t_extend_pct"].AsDouble() : -1.0 ),
329 t_thres_pct( app_type != eConvertCounts && args[
"t_thres_pct"] ? args[
"t_thres_pct"].AsDouble() : -1.0 ),
330 t_high_pct( app_type != eConvertCounts && args[
"t_high_pct"] ? args[
"t_high_pct"].AsDouble() : -1.0 ),
331 textend( app_type >= eGenerateMasks && args[
"t_extend"] ? args[
"t_extend"].AsInteger() : 0 ),
332 cutoff_score( app_type >= eGenerateMasks && args[
"t_thres"] ? args[
"t_thres"].AsInteger() : 0 ),
333 max_score( app_type != eConvertCounts && args[
"t_high"] ? args[
"t_high"].AsInteger() : 0 ),
334 min_score( app_type != eConvertCounts && args[
"t_low"] ? args[
"t_low"].AsInteger() : 0 ),
335 window_size( app_type >= eGenerateMasks && args[
"window"] ? args[
"window"].AsInteger() : 0 ),
337 merge_cutoff_score( 50 ),
338 abs_merge_cutoff_dist( 8 ),
339 mean_merge_cutoff_dist( 50 ),
346 merge_unit_step( 1 ),
347 fa_list( app_type == eComputeCounts && determine_input ? args[
"fa_list"].AsBoolean() :
false ),
348 mem( app_type == eComputeCounts ? args[
"mem"].AsInteger() : 0 ),
349 unit_size( app_type == eComputeCounts && args[
"unit"] ? args[
"unit"].AsInteger() : 0 ),
350 genome_size( app_type == eComputeCounts && args[
"genome_size"] ? args[
"genome_size"].AsInt8() : 0 ),
351 input( determine_input ? args[
kInput].AsString() :
""),
353 th(
"90,99,99.5,99.8" ),
355 dust_level( app_type == eGenerateMasksWithDuster ? args[
"dust_level"].AsInteger() : 0 ),
357 checkdup( app_type == eComputeCounts ? args[
"checkdup"].AsBoolean() :
false ),
358 sformat( app_type < eGenerateMasks ? args[
"sformat"].AsString() :
"" ),
359 smem( app_type < eGenerateMasks ? args[
"smem"].AsInteger() : 0 ),
360 ids( 0 ), exclude_ids( 0 ),
361 use_ba( app_type != eConvertCounts ),
362 text_match( app_type != eConvertCounts && args[
"text_match"].AsBoolean() )
364 if (args.
Exist(
"meta") && args[
"meta"]) {
367 _TRACE(
"Entering CWinMaskConfig::CWinMaskConfig()" );
381 args[
kInput].AsString() );
384 if(determine_input &&
iformatstr !=
"seqids"){
393 eReaderAllocFail,
"" );
399 set_max_score = args[
"set_t_high"] ? args[
"set_t_high"].AsInteger()
401 set_min_score = args[
"set_t_low"] ? args[
"set_t_low"].AsInteger()
405 string ids_file_name( args[
"ids"].AsString() );
406 string exclude_ids_file_name( args[
"exclude_ids"].AsString() );
408 if( !ids_file_name.empty()
409 && !exclude_ids_file_name.empty() )
412 "only one of -ids or -exclude_ids can be specified" );
415 if( !ids_file_name.empty() ) {
423 "-text_match false can be used only with "
430 if( !exclude_ids_file_name.empty() ) {
438 "-text_match false can be used only with "
445 _TRACE(
"Leaving CWinMaskConfig::CWinMaskConfig" );
461 "User options caused reader not to be created; can't get reader" );
476 string::size_type stop( line.find_first_of(
" \t" ) );
477 string::size_type start( line[0] ==
'>' ? 1 : 0 );
478 string id_str = line.substr( start, stop - start );
491 return "can not open input stream";
495 return "can not allocate fasta sequence reader";
499 return "inconsistent program options";
Class for reading sequences from BLAST databases.
Class for reading sequences from fasta files.
Virtual base class for all input readers.
Output filter to print masked sequence locations as Blast-db-mask-info objects.
Output filter to write masked data in fasta format.
Output filter to print masked sequences as sets of intervals.
Output filter to print masked sequence locations as NCBI Seq-loc objects.
A base class for winmasker output writers.
Winmasker configuration errors.
virtual const char * GetErrCodeString() const override
Get the description of an error.
@ eInconsistentOptions
Option validation failure.
@ eInputOpenFail
Cannot open input file.
@ eReaderAllocFail
Memory allocation for input reader object failed.
string iformatstr
input format
static void FillIdList(const string &file_name, CIdSet &id_list)
Read the list of sequence ids from a given file.
CMaskWriter * writer
output writer object
CMaskReader * reader
input reader object
@ eGenerateMasksWithDuster
CWinMaskConfig(const CArgs &args, EAppType type=eAny, bool determine_input=true)
Object constructor.
CIstreamProxy is
input file resource manager
CMaskWriter * x_GetWriter(const CArgs &args)
Create the CMaskWriter instance for this class.
string output
output file name (may be empty to indicate stdout)
CIdSet * exclude_ids
set of ids to exclude from processing
CWinMaskUtil::CIdSet_TextMatch CIdSet_TextMatch
EAppType app_type
type of application to run
CMaskReader & Reader()
Get the input reader object.
Uint4 set_max_score
score to use for high scoring units
string metadata
metadata associated with counts file
~CWinMaskConfig()
Destructor.
CIdSet * ids
set of ids to process
bool text_match
identify seq ids by string matching
Uint4 set_min_score
score to use for low scoring units
static void AddWinMaskArgs(CArgDescriptions &arg_desc, EAppType type=eAny, bool determine_input=true)
CWinMaskUtil::CIdSet_SeqId CIdSet_SeqId
static EAppType s_DetermineAppType(const CArgs &args, EAppType user_specified_type)
Base class for sets of seq_id representations used with -ids and -exclude_ids options.
virtual void insert(const string &id_str)=0
Add a string to the id set.
static SQLCHAR output[256]
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void SetConstraint(const string &name, const CArgAllow *constraint, EConstraintNegate negate=eConstraint)
Set additional user defined constraint on argument value.
bool Exist(const string &name) const
Check existence of argument description.
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
void SetCurrentGroup(const string &group)
Set current arguments group name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eExcludes
One argument excludes another.
@ eInputFile
Name of file (must exist and be readable)
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
@ fBinary
Open file in binary mode.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
@ eBlast_filter_program_windowmasker
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
Contains the command line options common to filtering algorithms.
const size_t kNumInputFormats
Number of elements in kInputFormats.
const char * kOutputFormats[]
Output formats allowed, the first one is the default.
const size_t kNumOutputFormats
Number of elements in kOutputFormats.
const std::string kOutput
Command line flag to specify the output.
const std::string kOutputFormat
Command line flag to specify the output format.
const char * kInputFormats[]
Input formats allowed, the first one is the default.
const std::string kInput
Command line flag to specify the input.
const std::string kInputFormat
Command line flag to specify the input format.
string BuildAlgorithmParametersString(const CArgs &args)
Builds an algorithm options string for the filtering applications (segmasker, dustmasker) by examining...
NCBI C++ auxiliary debug macros.