NCBI C++ ToolKit
win_mask_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_app.cpp 98106 2022-09-29 01:34:59Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * CWinMaskApplication class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbidbg.hpp>
37 #include <objects/seq/Bioseq.hpp>
38 #include <objects/seq/Seq_inst.hpp>
39 #include <objects/seq/Seq_data.hpp>
41 #include <objects/seq/IUPACna.hpp>
43 
44 #include <objmgr/bioseq_ci.hpp>
46 #include <objmgr/scope.hpp>
49 
63 #include "win_mask_app.hpp"
65 
66 
69 
70 #define WIN_MASK_APP_VER_MAJOR 1
71 #define WIN_MASK_APP_VER_MINOR 0
72 #define WIN_MASK_APP_VER_PATCH 0
73 
74 //-------------------------------------------------------------------------
75 const char * const
76 CWinMaskApplication::USAGE_LINE = "Window based sequence masker";
77 
78 //-------------------------------------------------------------------------
81  version->SetVersionInfo( WIN_MASK_APP_VER_MAJOR,
84  version->AddComponentVersion( new CSeqMaskerVersion(
86  version->AddComponentVersion( new CSeqMaskerVersion(
88  version->AddComponentVersion( new CSeqMaskerVersion(
90  version->AddComponentVersion( new CSeqMaskerVersion(
92  version->AddComponentVersion( new CSeqMaskerVersion(
94  version->AddComponentVersion( new CSeqMaskerVersion(
97 }
98 
99 //-------------------------------------------------------------------------
101 {
103  unique_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
104 
105  // Set the program description
106  arg_desc->SetUsageContext( GetArguments().GetProgramBasename(),
107  USAGE_LINE );
108 
110 
111  // Parse the arguments according to descriptions.
112  SetupArgDescriptions(arg_desc.release());
113 }
114 
115 //-------------------------------------------------------------------------
117 {
119  CWinMaskConfig aConfig( GetArgs() );
120 
121  // Branch away immediately if the converter is called.
122  //
123  // if( GetArgs()["convert"].AsBoolean() ) {
124  if( aConfig.AppType() == CWinMaskConfig::eConvertCounts )
125  {
126  if( aConfig.Output() == "-" ) {
127  CWinMaskCountsConverter converter(
128  aConfig.Input(),
129  NcbiCout,
130  aConfig.SFormat(),
131  aConfig.GetMetaData() );
132  return converter();
133  }
134  else {
135  CWinMaskCountsConverter converter(
136  aConfig.Input(),
137  aConfig.Output(),
138  aConfig.SFormat(),
139  aConfig.GetMetaData() );
140  return converter();
141  }
142  }
143 
145  if(aConfig.InFmt() == "seqids")
148 
149  // Branch away if unit counts generation is requested: run the counts generator and exit.
150  if( aConfig.AppType() == CWinMaskConfig::eComputeCounts )
151  {
152  if( aConfig.Output() == "-" ) {
153  CWinMaskCountsGenerator cg( aConfig.Input(),
154  NcbiCout,
155  aConfig.InFmt(),
156  aConfig.SFormat(),
157  aConfig.Th(),
158  aConfig.Mem(),
159  aConfig.UnitSize(),
160  aConfig.GenomeSize(),
161  aConfig.MinScore(),
162  aConfig.MaxScore(),
163  aConfig.CheckDup(),
164  aConfig.FaList(),
165  aConfig.Ids(),
166  aConfig.ExcludeIds(),
167  aConfig.UseBA(),
168  aConfig.GetMetaData(),
169  aConfig.MinScorePct(),
170  aConfig.ExtendScorePct(),
171  aConfig.ThresScorePct(),
172  aConfig.MaxScorePct() );
173  cg();
174  }
175  else {
176  CWinMaskCountsGenerator cg( aConfig.Input(),
177  aConfig.Output(),
178  aConfig.InFmt(),
179  aConfig.SFormat(),
180  aConfig.Th(),
181  aConfig.Mem(),
182  aConfig.UnitSize(),
183  aConfig.GenomeSize(),
184  aConfig.MinScore(),
185  aConfig.MaxScore(),
186  aConfig.CheckDup(),
187  aConfig.FaList(),
188  aConfig.Ids(),
189  aConfig.ExcludeIds(),
190  aConfig.UseBA(),
191  aConfig.GetMetaData(),
192  aConfig.MinScorePct(),
193  aConfig.ExtendScorePct(),
194  aConfig.ThresScorePct(),
195  aConfig.MaxScorePct() );
196  cg();
197  }
198 
199  return 0;
200  }
201 
202  if(aConfig.InFmt() == "seqids"){
203  ERR_POST(Error << "windowmasker with seqids input not implemented yet");
204  return 1;
205  }
206 
207  CMaskReader & theReader = aConfig.Reader();
208  CMaskWriter & theWriter = aConfig.Writer();
209  CSeqMasker theMasker( aConfig.LStatName(),
210  aConfig.WindowSize(),
211  aConfig.WindowStep(),
212  aConfig.UnitStep(),
213  aConfig.Textend(),
214  aConfig.CutoffScore(),
215  aConfig.MaxScore(),
216  aConfig.MinScore(),
217  aConfig.SetMaxScore(),
218  aConfig.SetMinScore(),
219  aConfig.MergePass(),
220  aConfig.MergeCutoffScore(),
221  aConfig.AbsMergeCutoffDist(),
222  aConfig.MeanMergeCutoffDist(),
223  aConfig.MergeUnitStep(),
224  aConfig.Trigger(),
225  aConfig.TMin_Count(),
226  aConfig.Discontig(),
227  aConfig.Pattern(),
228  aConfig.UseBA(),
229  aConfig.MinScorePct(),
230  aConfig.ExtendScorePct(),
231  aConfig.ThresScorePct(),
232  aConfig.MaxScorePct() );
233  CRef< CSeq_entry > aSeqEntry( 0 );
234  Uint4 total = 0, total_masked = 0;
235  CSDustMasker * duster( 0 );
236  const CWinMaskConfig::CIdSet * ids( aConfig.Ids() );
237  const CWinMaskConfig::CIdSet * exclude_ids( aConfig.ExcludeIds() );
238 
240  duster = new CSDustMasker( aConfig.DustWindow(),
241  aConfig.DustLevel(),
242  aConfig.DustLinker() );
243 
244  while( (aSeqEntry = theReader.GetNextSequence()).NotEmpty() )
245  {
246  if( aSeqEntry->Which() == CSeq_entry::e_not_set ) continue;
247  CScope scope(*om);
248  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*aSeqEntry);
249  Uint4 masked = 0;
250  CBioseq_CI bs_iter(seh, CSeq_inst::eMol_na);
251  for ( ; bs_iter; ++bs_iter) {
252  CBioseq_Handle bsh = *bs_iter;
253  if (bsh.GetBioseqLength() == 0) {
254  continue;
255  }
256 
257  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
258  {
259  TSeqPos len = bsh.GetBioseqLength();
260  total += len;
261  _TRACE( "Sequence length " << len );
262  CSeqVector data =
264  unique_ptr< CSeqMasker::TMaskList > mask_info( theMasker( data ) );
266 
267  if( duster != 0 ) // Dust and merge with mask_info
268  {
269  unique_ptr< CSeqMasker::TMaskList > dust_info(
270  (*duster)( data, *mask_info.get() ) );
271  CSeqMasker::MergeMaskInfo( mask_info.get(), dust_info.get() );
272  }
273 
274  // theWriter.Print( bsh, *mask_info, aConfig.MatchId() );
275  theWriter.Print( bsh, *mask_info, GetArgs()["parse_seqids"] );
276 
277  for( CSeqMasker::TMaskList::const_iterator i = mask_info->begin();
278  i != mask_info->end(); ++i )
279  masked += i->second - i->first + 1;
280 
281  total_masked += masked;
282  _TRACE( "Number of positions masked: " << masked );
283  }
284  }
285  }
286 
287  _TRACE( "Total number of positions: " << total );
288  _TRACE( "Total number of positions masked: " << total_masked );
289  return 0;
290 }
291 
User-defined methods of the data storage class.
USING_SCOPE(objects)
#define WIN_MASK_APP_VER_MAJOR
#define WIN_MASK_APP_VER_PATCH
#define WIN_MASK_APP_VER_MINOR
unsigned dummy
Definition: block_cipher.h:0
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
Virtual base class for all input readers.
Definition: mask_reader.hpp:50
virtual CRef< objects::CSeq_entry > GetNextSequence()=0
Read the next sequence from the source stream.
A base class for winmasker output writers.
Definition: mask_writer.hpp:52
virtual void Print(objects::CBioseq_Handle &bsh, const TMaskList &mask, bool parsed_id=false)=0
Output masked sequence data.
CRef –.
Definition: ncbiobj.hpp:618
This class encapsulates the dusting functionality of winmask.
CScope –.
Definition: scope.hpp:92
static CSeqMaskerVersion FormatVersion
Format version.
static CSeqMaskerVersion FormatVersion
Format version.
static CSeqMaskerVersion FormatVersion
Format version.
static CSeqMaskerVersion FormatVersion
Format version.
static CSeqMaskerVersion StatAlgoVersion
Version of the statistics generation algorithm.
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
static void MergeMaskInfo(TMaskList *dest, const TMaskList *src)
Merge together two result lists.
Definition: seq_masker.cpp:508
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
Definition: seq_masker.hpp:74
static CSeqMaskerVersion AlgoVersion
Version of window masking algorithm.
Definition: seq_masker.hpp:57
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
static const char *const USAGE_LINE
Short description of the program.
CWinMaskApplication()
Application constructor.
virtual void Init(void)
Initialization.
virtual int Run(void)
Main routine of the window based masker.
Objects of this class contain winmasker configuration data.
string Th() const
Percentage thresholds.
double MinScorePct() const
Uint4 MeanMergeCutoffDist() const
Distance at which intervals are considered candidates for merging.
const CIdSet * ExcludeIds() const
The set of query ids to exclude from processing.
Uint1 TMin_Count() const
Number of units to count.
bool MergePass() const
Flag to run the interval merging passes.
bool FaList() const
Use a list of fasta files.
double MaxScorePct() const
const CIdSet * Ids() const
The set of query ids to process.
bool CheckDup() const
Check for possibly duplicate sequences in the input.
Uint1 UnitStep() const
Unit step.
Uint4 SetMinScore() const
Get the alternative score for low scoring units.
bool UseBA() const
Whether to use bit array optimization for optimized binary counts format.
string Input() const
Value of the -input parameter.
const string InFmt() const
Input file format.
Uint1 UnitSize() const
n-mer size used for n-mer frequency counting.
Uint4 DustLinker() const
Dust linker (in bps).
double ExtendScorePct() const
const string Trigger() const
Type of the event triggering the masking.
EAppType AppType() const
Type of application to run.
Uint4 WindowStep() const
Window step.
string Output() const
Value of the -output parameter.
CMaskReader & Reader()
Get the input reader object.
Uint4 DustWindow() const
Dust window.
const string LStatName() const
Get the name of the length statistics file.
string const GetMetaData() const
Get metadata string to be added to the counts file.
const string SFormat() const
Format in which the unit counts generator should generate its output.
Uint4 Mem() const
Memory available for n-mer frequency counting.
Uint4 Textend() const
Get the t_extend value.
bool Discontig() const
Whether discontiguous units are used.
Uint4 DustLevel() const
Dust level.
Uint4 MaxScore() const
Get the maximum unit score.
Uint4 SetMaxScore() const
Get the alternative score for high scoring units.
Uint4 Pattern() const
Pattern to form discontiguous units.
Uint8 GenomeSize() const
Total genome length.
Uint4 MergeCutoffScore() const
Average unit score triggering the interval merging.
Uint4 MinScore() const
Get the minimum unit score.
static void AddWinMaskArgs(CArgDescriptions &arg_desc, EAppType type=eAny, bool determine_input=true)
double ThresScorePct() const
Uint4 AbsMergeCutoffDist() const
Distance at which intervals are merged unconditionally.
Uint4 CutoffScore() const
Get the average unit score threshold.
CMaskWriter & Writer()
Get the output writer object.
Uint1 MergeUnitStep() const
Unit step to use for interval merging.
Uint1 WindowSize() const
Get the window size.
Class responsible for converting unit counts between different formats.
This class encapsulates the n-mer frequency counts generation functionality of winmasker.
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
Operators to edit gaps in sequences.
char data[12]
Definition: iconv.c:80
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1187
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1325
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
#define _TRACE(message)
Definition: ncbidbg.hpp:122
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6132
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence in the requested coding, e.g. Iupacna or Iupacaa for eCoding_Iupac.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiCout
Definition: ncbistre.hpp:543
#define CVersion
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_entry_.hpp:228
@ e_not_set
No variant selected.
Definition: Seq_entry_.hpp:88
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
int i
int len
Contains the command line options common to filtering algorithms.
const string version
version string
Definition: variables.hpp:66
NCBI C++ auxiliary debug macros.
The Object manager core.
CRef< objects::CObjectManager > om
Modified on Fri Sep 20 14:58:05 2024 by modify_doxy.py rev. 669887