51 #include <algo/winmask/win_mask_reader.hpp>
52 #include <algo/winmask/win_mask_fasta_reader.hpp>
53 #include <algo/winmask/win_mask_writer.hpp>
55 #include <algo/winmask/win_mask_seq_title.hpp>
58 #include <algo/winmask/dust_masker.hpp>
73 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
77 arg_desc->AddDefaultKey(
"lstat",
"length_statistics_file",
78 "relative unit frequencies "
79 "(required if -mk_counts is false)",
81 arg_desc->AddDefaultKey(
"input",
"input_file_name",
83 "(not optional if used with -mk_counts option)",
85 arg_desc->AddDefaultKey(
"output",
"output_file_name",
88 arg_desc->AddDefaultKey(
"checkdup",
"check_duplicates",
89 "check for duplicate sequences",
91 arg_desc->AddDefaultKey(
"window",
"window_size",
"window size",
93 arg_desc->AddDefaultKey(
"wstep",
"window_step",
"window step",
95 arg_desc->AddDefaultKey(
"ustep",
"unit_step",
"unit step",
97 arg_desc->AddDefaultKey(
"xdrop",
"X_drop",
98 "value of X-drop parameter",
100 arg_desc->AddDefaultKey(
"score",
"score_threshold",
101 "window score threshold",
103 arg_desc->AddDefaultKey(
"highscore",
"max_score",
104 "maximum useful unit score",
106 arg_desc->AddOptionalKey(
"lowscore",
"min_score",
107 "minimum useful unit score",
109 arg_desc->AddOptionalKey(
"sethighscore",
"score_value",
110 "alternative high score for a unit if the"
111 "original unit score is more than highscore",
113 arg_desc->AddOptionalKey(
"setlowscore",
"score_value",
114 "alternative low score for a unit if the"
115 "original unit score is lower than lowscore",
117 arg_desc->AddDefaultKey(
"ambig",
"ambiguity_handler",
118 "the way to handle ambiguity characters",
120 arg_desc->AddDefaultKey(
"oformat",
"output_format",
121 "controls the format of the masker output",
123 arg_desc->AddDefaultKey(
"mpass",
"merge_pass_flag",
124 "true if separate merging pass is needed",
126 arg_desc->AddDefaultKey(
"discontig",
"discontiguous_units",
127 "true if using discontiguous units",
129 arg_desc->AddDefaultKey(
"mscore",
"merge_cutoff_score",
130 "minimum average unit score triggering a merge",
132 arg_desc->AddDefaultKey(
"mabs",
"distance",
133 "absolute distance threshold for merging",
135 arg_desc->AddDefaultKey(
"mmean",
"distance",
136 "distance threshold for merging if average unit"
137 " score is high enough",
139 arg_desc->AddDefaultKey(
"mustep",
"merge_unit_step",
140 "unit step value used for interval merging",
142 arg_desc->AddDefaultKey(
"trigger",
"trigger_type",
143 "type of the event triggering masking",
145 arg_desc->AddDefaultKey(
"tmin_count",
"unit_count",
146 "number of units to count with min trigger",
148 arg_desc->AddDefaultKey(
"pattern",
"base_mask",
149 "which bases in a window to use as a discontinuous unit",
151 arg_desc->AddDefaultKey(
"dbg",
"debug_output",
152 "enable debug output",
154 arg_desc->AddDefaultKey(
"mk_counts",
"generate_counts",
155 "generate frequency counts for a database",
157 arg_desc->AddDefaultKey(
"fa_list",
"input_is_a_list",
158 "indicates that -input represents a file containing "
159 "a list of names of fasta files to process, one name "
160 " per line (can only be used with -mk_counts true)",
162 arg_desc->AddDefaultKey(
"mem",
"available_memory",
163 "memory available for mk_counts option in megabytes",
165 arg_desc->AddDefaultKey(
"unit",
"unit_length",
166 "number of bases in a unit",
168 arg_desc->AddDefaultKey(
"th",
"thresholds",
169 "4 percentage values used to determine "
170 "masking thresholds (4 floating point numbers "
171 "separated by commas)",
173 arg_desc->AddDefaultKey(
"dust",
"use_dust",
174 "combine window masking with dusting",
176 arg_desc->AddDefaultKey(
"dust_window",
"dust_window",
177 "window size for dusting",
179 arg_desc->AddDefaultKey(
"dust_level",
"dust_level",
180 "dust minimum level",
182 arg_desc->AddDefaultKey(
"dust_linker",
"dust_linker",
183 "link windows by this many basepairs",
185 arg_desc->AddDefaultKey(
"exclude_ids",
"exclude_id_list",
186 "file containing the list of ids to exclude from processing",
188 arg_desc->AddDefaultKey(
"ids",
"id_list",
189 "file containing the list of ids to process",
193 arg_desc->SetConstraint(
"window",
195 arg_desc->SetConstraint(
"wstep",
197 arg_desc->SetConstraint(
"ustep",
199 arg_desc->SetConstraint(
"xdrop",
201 arg_desc->SetConstraint(
"score",
203 arg_desc->SetConstraint(
"highscore",
205 arg_desc->SetConstraint(
"lowscore",
207 arg_desc->SetConstraint(
"sethighscore",
209 arg_desc->SetConstraint(
"setlowscore",
211 arg_desc->SetConstraint(
"mscore",
213 arg_desc->SetConstraint(
"mabs",
215 arg_desc->SetConstraint(
"mmean",
217 arg_desc->SetConstraint(
"mustep",
219 arg_desc->SetConstraint(
"ambig",
221 arg_desc->SetConstraint(
"oformat",
223 ->Allow(
"fasta" ) );
224 arg_desc->SetConstraint(
"trigger",
227 arg_desc->SetConstraint(
"tmin_count",
241 if(
GetArgs()[
"dbg"].AsBoolean() )
248 if( aConfig.MakeCounts() )
257 aConfig.HasMinScore(),
264 CWinMaskReader & theReader = aConfig.
Reader();
265 CWinMaskWriter & theWriter = aConfig.
Writer();
286 Uint4 total = 0, total_masked = 0;
287 CDustMasker * duster( 0 );
291 if( aConfig.UseDust() )
292 duster =
new CDustMasker( aConfig.
DustWindow(),
296 while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
307 bool process(
true );
314 if( ids.
find(
id ) != ids.
end() )
318 if( !exclude_ids.
empty() )
319 if( exclude_ids.
find(
id ) != exclude_ids.
end() )
332 unique_ptr< CSeqMasker::TMaskList > mask_info( theMasker(
data ) );
336 unique_ptr< CSeqMasker::TMaskList > dust_info( (*duster)(
data ) );
340 theWriter.Print( seh, bioseq, *mask_info );
342 for( CSeqMasker::TMaskList::const_iterator
i = mask_info->begin();
343 i != mask_info->end(); ++
i )
344 masked +=
i->second -
i->first + 1;
346 total_masked += masked;
347 _TRACE(
"Number of positions masked: " << masked );
352 _TRACE(
"Total number of positions: " << total );
353 _TRACE(
"Total number of positions masked: " << total_masked );
User-defined methods of the data storage class.
Main interface to window based masker functionality.
static void MergeMaskInfo(TMaskList *dest, const TMaskList *src)
Merge together two result lists.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Objects of this class contain winmasker configuration data.
string Th() const
Percentage thresholds.
Uint4 MeanMergeCutoffDist() const
Distance at which intervals are considered candidates for merging.
const CIdSet * ExcludeIds() const
The set of query ids to exclude from processing.
Uint1 TMin_Count() const
Number of units to count.
bool MergePass() const
Flag to run the interval merging passes.
bool FaList() const
Use a list of fasta files.
const CIdSet * Ids() const
The set of query ids to process.
bool CheckDup() const
Check for possibly duplicate sequences in the input.
Uint1 UnitStep() const
Unit step.
Uint4 SetMinScore() const
Get the alternative score for low scoring units.
string Input() const
Value of the -input parameter.
Uint1 UnitSize() const
n-mer size used for n-mer frequency counting.
Uint4 DustLinker() const
Dust linker (in bps).
const string Trigger() const
Type of the event triggering the masking.
Uint4 WindowStep() const
Window step.
string Output() const
Value of the -output parameter.
CMaskReader & Reader()
Get the input reader object.
Uint4 DustWindow() const
Dust window.
const string LStatName() const
Get the name of the length statistics file.
Uint4 Mem() const
Memory available for n-mer frequency counting.
bool Discontig() const
Whether discontiguous units are used.
Uint4 DustLevel() const
Dust level.
Uint4 MaxScore() const
Get the maximum unit score.
Uint4 SetMaxScore() const
Get the alternative score for high scoring units.
Uint4 Pattern() const
Pattern to form discontiguous units.
Uint4 MergeCutoffScore() const
Average unit score triggering the interval merging.
Uint4 MinScore() const
Get the minimum unit score.
Uint4 AbsMergeCutoffDist() const
Distance at which intervals are merged unconditionally.
Uint4 CutoffScore() const
Get the average unit score threshold.
CMaskWriter & Writer()
Get the output writer object.
Uint1 MergeUnitStep() const
Unit step to use for interval merging.
Uint1 WindowSize() const
Get the window size.
This class encapsulates the n-mer frequency counts generation functionality of winmasker.
virtual int Run(void)
Main routine of the window based masker.
static const char *const USAGE_LINE
Short description of the program.
virtual void Init(void)
Initialization.
const_iterator find(const key_type &key) const
const_iterator end() const
Operators to edit gaps in sequences.
unsigned int TSeqPos
Type for sequence locations and lengths.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
void SetDiagTrace(EDiagTrace how, EDiagTrace dflt=eDT_Default)
Set the diagnostic trace settings.
@ eDT_Enable
Enable messages of severity "eDiag_Trace".
const TPrim & Get(void) const
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
const TSeq & GetSeq(void) const
Get the variant data.
const TInst & GetInst(void) const
Get the Inst member data.
const TIupacna & GetIupacna(void) const
Get the variant data.
bool CanGetLength(void) const
Check if it is safe to call GetLength method.
TLength GetLength(void) const
Get the Length member data.
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
@ e_Iupacna
IUPAC 1 letter nuc acid code.
NCBI C++ auxiliary debug macros.
CRef< objects::CObjectManager > om