NCBI C++ ToolKit
win_mask_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_app.cpp 91954 2020-12-17 12:53:02Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * CWinMaskDemoApplication class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
37 #include <objects/seq/Bioseq.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/seq/Seq_data.hpp>
41 #include <objects/seq/IUPACna.hpp>
43 
44 #include <objmgr/bioseq_ci.hpp>
46 #include <objmgr/scope.hpp>
48 
49 #include "win_mask_app.hpp"
51 #include <algo/winmask/win_mask_reader.hpp>
52 #include <algo/winmask/win_mask_fasta_reader.hpp>
53 #include <algo/winmask/win_mask_writer.hpp>
55 #include <algo/winmask/win_mask_seq_title.hpp>
56 
58 #include <algo/winmask/dust_masker.hpp>
59 
62 
63 //-------------------------------------------------------------------------
// Short description of the program. It is passed as the usage text to
// CArgDescriptions::SetUsageContext() when the command line argument
// descriptions are built.
64 const char * const
65 CWinMaskDemoApplication::USAGE_LINE = "Window based sequence masker";
66 
67 //-------------------------------------------------------------------------
// Body of the argument-setup routine (presumably CWinMaskDemoApplication::Init;
// the signature line is not part of this listing). It creates a
// CArgDescriptions object, sets the usage context, registers every command
// line option together with its help text, attaches value constraints to a
// subset of the options, and finally passes the descriptions object to
// SetupArgDescriptions().
//
// NOTE(review): this listing skips some source line numbers (e.g. 68, 80, 84,
// 87, 92, ...). As a result the trailing arguments of many AddDefaultKey() /
// AddOptionalKey() calls below (value type, default value, closing
// parenthesis) are not visible here. Consult the original file before
// changing any of those calls.
69 {
70  unique_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
71 
72  // Set the program description
73  arg_desc->SetUsageContext( GetArguments().GetProgramBasename(),
74  USAGE_LINE );
75 
76  // Adding command line arguments descriptions
77  arg_desc->AddDefaultKey( "lstat", "length_statistics_file",
78  "relative unit frequencies "
79  "(required if -mk_counts is false)",
81  arg_desc->AddDefaultKey( "input", "input_file_name",
82  "input file name "
83  "(not optional if used with -mk_counts option)",
85  arg_desc->AddDefaultKey( "output", "output_file_name",
86  "output file name",
88  arg_desc->AddDefaultKey( "checkdup", "check_duplicates",
89  "check for duplicate sequences",
90  CArgDescriptions::eBoolean, "false" );
91  arg_desc->AddDefaultKey( "window", "window_size", "window size",
93  arg_desc->AddDefaultKey( "wstep", "window_step", "window step",
95  arg_desc->AddDefaultKey( "ustep", "unit_step", "unit step",
97  arg_desc->AddDefaultKey( "xdrop", "X_drop",
98  "value of X-drop parameter",
100  arg_desc->AddDefaultKey( "score", "score_threshold",
101  "window score threshold",
103  arg_desc->AddDefaultKey( "highscore", "max_score",
104  "maximum useful unit score",
106  arg_desc->AddOptionalKey( "lowscore", "min_score",
107  "minimum useful unit score",
// NOTE(review): in the two descriptions below the adjacent string literals
// are concatenated without a separating space, so the help text reads
// "...for a unit if theoriginal unit score...". The first literal likely
// needs a trailing space.
109  arg_desc->AddOptionalKey( "sethighscore", "score_value",
110  "alternative high score for a unit if the"
111  "original unit score is more than highscore",
113  arg_desc->AddOptionalKey( "setlowscore", "score_value",
114  "alternative low score for a unit if the"
115  "original unit score is lower than lowscore",
117  arg_desc->AddDefaultKey( "ambig", "ambiguity_handler",
118  "the way to handle ambiguity characters",
119  CArgDescriptions::eString, "break" );
120  arg_desc->AddDefaultKey( "oformat", "output_format",
121  "controls the format of the masker output",
122  CArgDescriptions::eString, "interval" );
123  arg_desc->AddDefaultKey( "mpass", "merge_pass_flag",
124  "true if separate merging pass is needed",
125  CArgDescriptions::eBoolean, "false" );
126  arg_desc->AddDefaultKey( "discontig", "discontiguous_units",
127  "true if using discontiguous units",
128  CArgDescriptions::eBoolean, "false" );
129  arg_desc->AddDefaultKey( "mscore", "merge_cutoff_score",
130  "minimum average unit score triggering a merge",
132  arg_desc->AddDefaultKey( "mabs", "distance",
133  "absolute distance threshold for merging",
135  arg_desc->AddDefaultKey( "mmean", "distance",
136  "distance threshold for merging if average unit"
137  " score is high enough",
139  arg_desc->AddDefaultKey( "mustep", "merge_unit_step",
140  "unit step value used for interval merging",
142  arg_desc->AddDefaultKey( "trigger", "trigger_type",
143  "type of the event triggering masking",
144  CArgDescriptions::eString, "mean" );
145  arg_desc->AddDefaultKey( "tmin_count", "unit_count",
146  "number of units to count with min trigger",
148  arg_desc->AddDefaultKey( "pattern", "base_mask",
149  "which bases in a window to use as a discontinuous unit",
151  arg_desc->AddDefaultKey( "dbg", "debug_output",
152  "enable debug output",
153  CArgDescriptions::eBoolean, "false" );
154  arg_desc->AddDefaultKey( "mk_counts", "generate_counts",
155  "generate frequency counts for a database",
156  CArgDescriptions::eBoolean, "false" );
// NOTE(review): the second literal below ends with a space and the third
// starts with one, so the concatenated help text contains a double space
// ("one name  per line").
157  arg_desc->AddDefaultKey( "fa_list", "input_is_a_list",
158  "indicates that -input represents a file containing "
159  "a list of names of fasta files to process, one name "
160  " per line (can only be used with -mk_counts true)",
161  CArgDescriptions::eBoolean, "false" );
162  arg_desc->AddDefaultKey( "mem", "available_memory",
163  "memory available for mk_counts option in megabytes",
164  CArgDescriptions::eInteger, "1536" );
165  arg_desc->AddDefaultKey( "unit", "unit_length",
166  "number of bases in a unit",
168  arg_desc->AddDefaultKey( "th", "thresholds",
169  "4 percentage values used to determine "
170  "masking thresholds (4 floating point numbers "
171  "separated by commas)",
172  CArgDescriptions::eString, "90,99,99.5,99.8" );
173  arg_desc->AddDefaultKey( "dust", "use_dust",
174  "combine window masking with dusting",
176  arg_desc->AddDefaultKey( "dust_window", "dust_window",
177  "window size for dusting",
179  arg_desc->AddDefaultKey( "dust_level", "dust_level",
180  "dust minimum level",
182  arg_desc->AddDefaultKey( "dust_linker", "dust_linker",
183  "link windows by this many basepairs",
185  arg_desc->AddDefaultKey( "exclude_ids", "exclude_id_list",
186  "file containing the list of ids to exclude from processing",
188  arg_desc->AddDefaultKey( "ids", "id_list",
189  "file containing the list of ids to process",
191 
// Numeric options are restricted with CArgAllow_Integers ranges; the string
// options "ambig", "oformat" and "trigger" are restricted to an explicit
// list of accepted values ("ambig" accepts only "break").
192  // Set some constraints on command line parameters
193  arg_desc->SetConstraint( "window",
194  new CArgAllow_Integers( 1, kMax_Int ) );
195  arg_desc->SetConstraint( "wstep",
196  new CArgAllow_Integers( 1, kMax_Int ) );
197  arg_desc->SetConstraint( "ustep",
198  new CArgAllow_Integers( 1, 256 ) );
199  arg_desc->SetConstraint( "xdrop",
200  new CArgAllow_Integers( 0, kMax_Int ) );
201  arg_desc->SetConstraint( "score",
202  new CArgAllow_Integers( 1, kMax_Int ) );
203  arg_desc->SetConstraint( "highscore",
204  new CArgAllow_Integers( 1, kMax_Int ) );
205  arg_desc->SetConstraint( "lowscore",
206  new CArgAllow_Integers( 1, kMax_Int ) );
207  arg_desc->SetConstraint( "sethighscore",
208  new CArgAllow_Integers( 1, kMax_Int ) );
209  arg_desc->SetConstraint( "setlowscore",
210  new CArgAllow_Integers( 1, kMax_Int ) );
211  arg_desc->SetConstraint( "mscore",
212  new CArgAllow_Integers( 0, kMax_Int ) );
213  arg_desc->SetConstraint( "mabs",
214  new CArgAllow_Integers( 0, kMax_Int ) );
215  arg_desc->SetConstraint( "mmean",
216  new CArgAllow_Integers( 0, kMax_Int ) );
217  arg_desc->SetConstraint( "mustep",
218  new CArgAllow_Integers( 0, 256 ) );
219  arg_desc->SetConstraint( "ambig",
220  (new CArgAllow_Strings())->Allow( "break" ) );
221  arg_desc->SetConstraint( "oformat",
222  (new CArgAllow_Strings())->Allow( "interval" )
223  ->Allow( "fasta" ) );
224  arg_desc->SetConstraint( "trigger",
225  (new CArgAllow_Strings())->Allow( "mean" )
226  ->Allow( "min" ) );
227  arg_desc->SetConstraint( "tmin_count",
228  new CArgAllow_Integers( 0, kMax_Int ) );
229  arg_desc->SetConstraint( "mem", new CArgAllow_Integers( 1, kMax_Int ) );
230  arg_desc->SetConstraint( "unit", new CArgAllow_Integers( 1, 16 ) );
231 
// release() gives up the unique_ptr's ownership, so arg_desc will not delete
// the object on scope exit. Presumably SetupArgDescriptions() takes over
// ownership of the pointer — confirm against its documentation.
232  // Parse the arguments according to descriptions.
233  SetupArgDescriptions(arg_desc.release());
234 }
235 
236 //-------------------------------------------------------------------------
// Body of the main routine (presumably CWinMaskDemoApplication::Run; the
// signature line is not part of this listing). It builds a CWinMaskConfig
// from the parsed arguments and validates it. If the configuration asks for
// counts generation, it runs a CWinMaskCountsGenerator and returns 0.
// Otherwise it reads sequences one at a time from the configured reader,
// computes mask intervals with CSeqMasker (optionally merged with
// CDustMasker results), writes them through the configured writer, and
// traces the totals. Returns 0 on every path visible here.
//
// NOTE(review): this listing skips source lines 239, 242 and 329, so the
// declaration of `om`, the statement controlled by the "dbg" check, and the
// first part of the call that fills `dest` are not visible below.
238 {
240 
241  if( GetArgs()["dbg"].AsBoolean() )
243 
244  // Read and validate configuration values.
245  CWinMaskConfig aConfig( GetArgs() );
246  aConfig.Validate();
247 
// Counts-generation mode: construct the generator from the configuration,
// invoke it, and return without doing any masking.
248  if( aConfig.MakeCounts() )
249  {
250  CWinMaskCountsGenerator cg( aConfig.Input(),
251  aConfig.Output(),
252  aConfig.Th(),
253  aConfig.Mem(),
254  aConfig.UnitSize(),
255  aConfig.MinScore(),
256  aConfig.MaxScore(),
257  aConfig.HasMinScore(),
258  aConfig.CheckDup(),
259  aConfig.FaList() );
260  cg();
261  return 0;
262  }
263 
// Masking mode: the reader and writer come from the configuration object,
// and the masker is constructed from the individual configuration values.
264  CWinMaskReader & theReader = aConfig.Reader();
265  CWinMaskWriter & theWriter = aConfig.Writer();
266  CSeqMasker theMasker( aConfig.LStatName(),
267  aConfig.WindowSize(),
268  aConfig.WindowStep(),
269  aConfig.UnitStep(),
270  aConfig.XDrop(),
271  aConfig.CutoffScore(),
272  aConfig.MaxScore(),
273  aConfig.MinScore(),
274  aConfig.SetMaxScore(),
275  aConfig.SetMinScore(),
276  aConfig.MergePass(),
277  aConfig.MergeCutoffScore(),
278  aConfig.AbsMergeCutoffDist(),
279  aConfig.MeanMergeCutoffDist(),
280  aConfig.MergeUnitStep(),
281  aConfig.Trigger(),
282  aConfig.TMin_Count(),
283  aConfig.Discontig(),
284  aConfig.Pattern() );
285  CRef< CSeq_entry > aSeqEntry( 0 );
// Running totals over all processed sequences; used only in the trace
// output at the end.
286  Uint4 total = 0, total_masked = 0;
// NOTE(review): duster is a raw pointer that is assigned from `new` below,
// and no matching `delete` appears in the visible body. This looks like a
// leak; holding it in a unique_ptr (already used in this function) would
// avoid it.
287  CDustMasker * duster( 0 );
288  set< string > ids( aConfig.Ids() );
289  set< string > exclude_ids( aConfig.ExcludeIds() );
290 
291  if( aConfig.UseDust() )
292  duster = new CDustMasker( aConfig.DustWindow(),
293  aConfig.DustLevel(),
294  aConfig.DustLinker() );
295 
// Process entries until the reader returns an empty reference.
296  while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
297  {
298  Uint4 masked = 0;
299  const CBioseq & bioseq = aSeqEntry->GetSeq();
300 
// Entries without instance data, a length, or sequence data are skipped.
301  if( bioseq.CanGetInst()
302  && bioseq.GetInst().CanGetLength()
303  && bioseq.GetInst().CanGetSeq_data() )
304  {
305  CRef<CScope> scope(new CScope(*om));
306  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry( *aSeqEntry );
307  bool process( true );
308  string id( CWinMaskSeqTitle::GetId( seh, bioseq ) );
309 
// A non-empty include list restricts processing to the ids it contains.
310  if( !ids.empty() )
311  {
312  process = false;
313 
314  if( ids.find( id ) != ids.end() )
315  process = true;
316  }
317 
// The exclude list is checked last, so it overrides the include list.
318  if( !exclude_ids.empty() )
319  if( exclude_ids.find( id ) != exclude_ids.end() )
320  process = false;
321 
322  if( process )
323  {
324  TSeqPos len = bioseq.GetInst().GetLength();
325  total += len;
326  _TRACE( "Sequence length " << len );
327  const CSeq_data & seqdata = bioseq.GetInst().GetSeq_data();
// NOTE(review): source line 329 (the start of the call whose trailing
// arguments are "0, len") is not shown. Presumably it converts seqdata into
// dest as IUPACna, since dest is read via GetIupacna() below — confirm.
328  CRef< CSeq_data > dest( new CSeq_data );
330  0, len );
331  const string & data = dest->GetIupacna().Get();
332  unique_ptr< CSeqMasker::TMaskList > mask_info( theMasker( data ) );
333 
334  if( duster != 0 ) // Dust and merge with mask_info
335  {
336  unique_ptr< CSeqMasker::TMaskList > dust_info( (*duster)( data ) );
337  CSeqMasker::MergeMaskInfo( mask_info.get(), dust_info.get() );
338  }
339 
340  theWriter.Print( seh, bioseq, *mask_info );
341 
// Each interval contributes (second - first + 1) to the masked count.
342  for( CSeqMasker::TMaskList::const_iterator i = mask_info->begin();
343  i != mask_info->end(); ++i )
344  masked += i->second - i->first + 1;
345 
346  total_masked += masked;
347  _TRACE( "Number of positions masked: " << masked );
348  }
349  }
350  }
351 
352  _TRACE( "Total number of positions: " << total );
353  _TRACE( "Total number of positions masked: " << total_masked );
354  return 0;
355 }
356 
User-defined methods of the data storage class.
USING_SCOPE(objects)
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CScope –.
Definition: scope.hpp:92
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
static void MergeMaskInfo(TMaskList *dest, const TMaskList *src)
Merge together two result lists.
Definition: seq_masker.cpp:508
CSeq_entry_Handle –.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Objects of this class contain winmasker configuration data.
string Th() const
Percentage thresholds.
Uint4 MeanMergeCutoffDist() const
Distance at which intervals are considered candidates for merging.
const CIdSet * ExcludeIds() const
The set of query ids to exclude from processing.
Uint1 TMin_Count() const
Number of units to count.
bool MergePass() const
Flag to run the interval merging passes.
bool FaList() const
Use a list of fasta files.
const CIdSet * Ids() const
The set of query ids to process.
bool CheckDup() const
Check for possibly duplicate sequences in the input.
Uint1 UnitStep() const
Unit step.
Uint4 SetMinScore() const
Get the alternative score for low scoring units.
string Input() const
Value of the -input parameter.
Uint1 UnitSize() const
n-mer size used for n-mer frequency counting.
Uint4 DustLinker() const
Dust linker (in bps).
const string Trigger() const
Type of the event triggering the masking.
Uint4 WindowStep() const
Window step.
string Output() const
Value of the -output parameter.
CMaskReader & Reader()
Get the input reader object.
Uint4 DustWindow() const
Dust window.
const string LStatName() const
Get the name of the length statistics file.
Uint4 Mem() const
Memory available for n-mer frequency counting.
bool Discontig() const
Whether discontiguous units are used.
Uint4 DustLevel() const
Dust level.
Uint4 MaxScore() const
Get the maximum unit score.
Uint4 SetMaxScore() const
Get the alternative score for high scoring units.
Uint4 Pattern() const
Pattern to form discontiguous units.
Uint4 MergeCutoffScore() const
Average unit score triggering the interval merging.
Uint4 MinScore() const
Get the minimum unit score.
Uint4 AbsMergeCutoffDist() const
Distance at which intervals are merged unconditionally.
Uint4 CutoffScore() const
Get the average unit score threshold.
CMaskWriter & Writer()
Get the output writer object.
Uint1 MergeUnitStep() const
Unit step to use for interval merging.
Uint1 WindowSize() const
Get the window size.
This class encapsulates the n-mer frequency counts generation functionality of winmasker.
virtual int Run(void)
Main routine of the window based masker.
static const char *const USAGE_LINE
Short description of the program.
virtual void Init(void)
Initialization.
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
Operators to edit gaps in sequences.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define _TRACE(message)
Definition: ncbidbg.hpp:122
void SetDiagTrace(EDiagTrace how, EDiagTrace dflt=eDT_Default)
Set the diagnostic trace settings.
Definition: ncbidiag.cpp:6226
@ eDT_Enable
Enable messages of severity "eDiag_Trace".
Definition: ncbidiag.hpp:1550
const TPrim & Get(void) const
Definition: serialbase.hpp:347
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
bool CanGetLength(void) const
Check if it is safe to call GetLength method.
Definition: Seq_inst_.hpp:646
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
Definition: Seq_inst_.hpp:811
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
int i
int len
NCBI C++ auxiliary debug macros.
The Object manager core.
CRef< objects::CObjectManager > om
Modified on Mon Mar 04 05:12:32 2024 by modify_doxy.py rev. 669887