NCBI C++ ToolKit
dust_mask_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dust_mask_app.cpp 93978 2021-06-11 12:28:04Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * CDustMaskApplication class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include "dust_mask_app.hpp"
36 
37 #include <memory>
38 
39 #include <corelib/ncbidbg.hpp>
40 #include <util/line_reader.hpp>
43 #include <objects/seq/Bioseq.hpp>
44 #include <objects/seq/Seq_inst.hpp>
45 #include <objects/seq/Seq_data.hpp>
47 #include <objects/seq/IUPACna.hpp>
50 
51 #include <objmgr/bioseq_ci.hpp>
53 #include <objmgr/scope.hpp>
55 #include <objmgr/util/sequence.hpp>
59 
60 // Filtering applications IO
71 
74 
75 //-------------------------------------------------------------------------
// One-line description of the program; it is handed to
// CArgDescriptions::SetUsageContext() when the argument descriptions are
// built further down in this file.
76 const char * const CDustMaskApplication::USAGE_LINE
77  = "Low complexity region masker based on Symmetric DUST algorithm";
78 
79 //-------------------------------------------------------------------------
// Builds the command-line argument descriptions and registers them with the
// application through SetupArgDescriptions().
//
// NOTE(review): this listing is incomplete. The function signature (listing
// line 80) is absent, and the final line of every AddDefaultKey() call
// (listing lines 88, 91, 94, 97, 101, 104, 107) is absent as well, so the
// remaining arguments of those calls cannot be read from here.
81 {
83  unique_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
84  arg_desc->SetUsageContext( GetArguments().GetProgramBasename(),
85  USAGE_LINE );
// Input / output locations.
86  arg_desc->AddDefaultKey( kInput, "input_file_name",
87  "input file name",
89  arg_desc->AddDefaultKey( kOutput, "output_file_name",
90  "output file name",
// DUST algorithm parameters: window, level and linker.
92  arg_desc->AddDefaultKey( "window", "window_size",
93  "DUST window length",
95  arg_desc->AddDefaultKey( "level", "level",
96  "DUST level (score threshold for subwindows)",
98  arg_desc->AddDefaultKey( "linker", "linker",
99  "DUST linker (how close masked intervals "
100  "should be to get merged together).",
// Input / output format selectors.
102  arg_desc->AddDefaultKey( kInputFormat, "input_format",
103  "input format (possible values: fasta, blastdb)",
105  arg_desc->AddDefaultKey( kOutputFormat, "output_format",
106  "output format",
108  arg_desc->AddFlag ( "parse_seqids",
109  "Parse Seq-ids in FASTA input", true );
110  arg_desc->AddFlag ( "hard_masking",
111  "Use hard masking for fasta outfmt", true );
// Restrict the output format to the shared kOutputFormats entries plus
// "acclist", which is added separately below.
// NOTE(review): strings_allowed is a raw `new`; presumably SetConstraint()
// takes ownership of it -- confirm against CArgDescriptions.
112  CArgAllow_Strings* strings_allowed = new CArgAllow_Strings();
113  for (size_t i = 0; i < kNumOutputFormats; i++) {
114  strings_allowed->Allow(kOutputFormats[i]);
115  }
116  strings_allowed->Allow("acclist");
117  arg_desc->SetConstraint( kOutputFormat, strings_allowed );
118
// release() gives up the unique_ptr's ownership; the raw pointer is passed
// on to SetupArgDescriptions().
119  SetupArgDescriptions( arg_desc.release() );
120 }
121 
// Creates the mask writer that matches the requested output format and
// returns it as a raw, heap-allocated pointer.
//
// Throws runtime_error when "hard_masking" is set for any format other than
// "fasta", and when the format string matches none of the branches below.
//
// NOTE(review): this listing is incomplete. The function signature (listing
// lines 122-123) is absent, and so are the right-hand sides of the two
// `retval =` assignments in the "maskinfo_" branches (listing lines 152-154
// and 158-160).
124 {
125  const CArgs& args = GetArgs();
126  const string& format(args[kOutputFormat].AsString());
127  CMaskWriter* retval = NULL;
128
// Hard masking is only accepted together with fasta output.
129  if (args["hard_masking"].AsBoolean() && (format != "fasta")) {
130  throw runtime_error("Hard masking can only be applied for fasta output");
131  }
132
133  if (format == "interval") {
134  CNcbiOstream& output = args[kOutput].AsOutputFile();
135  retval = new CMaskWriterInt(output);
136  } else if (format == "acclist") {
137  CNcbiOstream& output = args[kOutput].AsOutputFile();
138  retval = new CMaskWriterTabular(output);
139  } else if (format == "fasta") {
140  CNcbiOstream& output = args[kOutput].AsOutputFile();
141  bool hard_masking = args["hard_masking"].AsBoolean();
142  retval = new CMaskWriterFasta(output, hard_masking);
// The binary variants are tested before the generic "seqloc_" / "maskinfo_"
// prefixes so that they get an output stream opened with fBinary.
143  } else if (NStr::StartsWith(format, "seqloc_asn1_binary")) {
144  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
145  retval = new CMaskWriterSeqLoc(output, format);
146  } else if (NStr::StartsWith(format, "seqloc_")) {
147  CNcbiOstream& output = args[kOutput].AsOutputFile();
148  retval = new CMaskWriterSeqLoc(output, format);
149  } else if (NStr::StartsWith(format, "maskinfo_asn1_bin")) {
150  CNcbiOstream& output = args[kOutput].AsOutputFile(CArgValue::fBinary);
151  retval =
155  } else if (NStr::StartsWith(format, "maskinfo_")) {
156  CNcbiOstream& output = args[kOutput].AsOutputFile();
157  retval =
161  } else {
162
163  throw runtime_error("Unknown output format");
164  }
165  return retval;
166 }
167 
169 {
170  const CArgs & args( GetArgs() );
171  const string & format( args[kInputFormat].AsString() );
172 
173  if( format == "fasta" ) {
174  CNcbiIstream& input_stream = GetArgs()[kInput].AsInputFile();
175  return new CMaskFastaReader(
176  input_stream, true, args["parse_seqids"] );
177  }
178  else if( format == "blastdb" ) {
179  return new CMaskBDBReader( args[kInput].AsString() );
180  }
181  else {
182  throw runtime_error( "Unknown input format" );
183  }
184 
185  return 0;
186 }
187 
188 CSymDustMasker::TMaskList s_FindSegmentWithLongNs(const unsigned int MAX_Ns, objects::CSeqVector & seq)
189 {
190  //Always trim Ns at start and end of seq
192  unsigned int pos = 0;
193  unsigned int Ns = 0;
194  for(objects::CSeqVector_CI itr=seq.begin();itr!=seq.end(); ++itr) {
195  if ((*itr) == 78) {
196  Ns++;
197  }
198  else {
199  if (Ns > 0) {
200  if((Ns > MAX_Ns ) || (pos == 0)) {
201  CSymDustMasker::TMaskedInterval r(pos, pos+Ns-1);
202  NsRange.push_back(r);
203  }
204  pos += Ns;
205  Ns = 0;
206  }
207  pos++;
208  }
209  }
210 
211  if (Ns > 0) {
212  CSymDustMasker::TMaskedInterval r(pos, pos+Ns-1);
213  NsRange.push_back(r);
214  }
215  return NsRange;
216 }
217 
219 {
220  if ((!list.empty()) && (list.back().second + linker == new_mask.first)) {
221  list.back().second = new_mask.second;
222  return;
223  }
224 
225  list.push_back(new_mask);
226 }
227 
228 std::unique_ptr< CSymDustMasker::TMaskList >
229 GetDustMasks_SkipNs(objects::CSeqVector & seq, Uint4 level, Uint4 window, Uint4 linker)
230 {
231  CSymDustMasker duster(level, window, linker);
232  CSymDustMasker::TMaskList NsRange = s_FindSegmentWithLongNs(window, seq);
233 
234  if(NsRange.empty()){
235  return duster(seq);
236  }
237  std::unique_ptr< CSymDustMasker::TMaskList > rv(new CSymDustMasker::TMaskList);
238  TSeqPos seq_start =0;
240  if(itr->first == 0) {
241  seq_start = itr->second + 1;
242  rv->push_back(*itr);
243  continue;
244  }
245  else {
246  std::unique_ptr< CSymDustMasker::TMaskList > s_mask = duster(seq, seq_start, itr->first -1);
247  if(s_mask->size() > 0) {
248  s_InsertMerge(*rv, s_mask->front(), linker);
249  if( s_mask->size() > 1) {
250  rv->insert(rv->end(), ++(s_mask->begin()), s_mask->end());
251  }
252  s_InsertMerge(*rv, *itr, linker);
253  }
254  else {
255  rv->push_back(*itr);
256  }
257  seq_start = itr->second + 1;
258  }
259  }
260  if(seq_start < seq.size()){
261  std::unique_ptr< CSymDustMasker::TMaskList > s_mask = duster(seq, seq_start, seq.size() -1);
262  if(s_mask->size() > 0) {
263  s_InsertMerge(*rv, s_mask->front(), linker);
264  if( s_mask->size() > 1) {
265  rv->insert(rv->end(), ++(s_mask->begin()), s_mask->end());
266  }
267  }
268  }
269  return rv;
270 }
271 
272 //-------------------------------------------------------------------------
274 {
275  // Set up the input and output streams.
276  CNcbiOstream& output_stream = GetArgs()[kOutput].AsOutputFile();
277 
278  // Set up the object manager.
280 
281  // Set up the duster object.
282  Uint4 level = GetArgs()["level"].AsInteger();
283  duster_type::size_type window = GetArgs()["window"].AsInteger();
284  duster_type::size_type linker = GetArgs()["linker"].AsInteger();
285  duster_type duster( level, window, linker );
286 
287  // Now process each input sequence in a loop.
288  CRef< CSeq_entry > aSeqEntry( 0 );
289  unique_ptr<CMaskWriter> writer(x_GetWriter());
290  CMaskReader * reader = x_GetReader();
291 
292  while( (aSeqEntry = reader->GetNextSequence()).NotEmpty() )
293  {
294  CScope scope( *om );
295  CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry( *aSeqEntry );
296 
297  CBioseq_CI bs_iter(seh, CSeq_inst::eMol_na);
298 
299  for ( ; bs_iter; ++bs_iter)
300  {
301  CBioseq_Handle bsh = *bs_iter;
302 
303  if (bsh.GetBioseqLength() == 0)
304  continue;
305 
306  CSeqVector data
308  std::unique_ptr< duster_type::TMaskList > res = GetDustMasks_SkipNs(data, level, window, linker);
309  if (res.get()) {
310  writer->Print(bsh, *res, GetArgs()["parse_seqids"] );
311  }
312  }
313  }
314 
315  output_stream << flush;
316  return 0;
317 }
318 
User-defined methods of the data storage class.
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
virtual int Run(void)
Run the application.
CMaskReader * x_GetReader()
CMaskWriter * x_GetWriter()
virtual void Init(void)
Initialize the application.
static const char *const USAGE_LINE
Class for reading sequences from BLAST databases.
Class for reading sequences from fasta files.
Virtual base class for all input readers.
Definition: mask_reader.hpp:50
virtual CRef< objects::CSeq_entry > GetNextSequence()=0
Read the next sequence from the source stream.
Output filter to print masked sequence locations as Blast-db-mask-info objects.
Output filter to write masked data in fasta format.
Output filter to print masked sequences as sets of intervals.
Output filter to print masked sequence locations as NCBI Seq-loc objects.
Output filter to print masked sequences as sets of intervals one per line.
A base class for winmasker output writers.
Definition: mask_writer.hpp:52
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
Definition: symdust.hpp:61
std::pair< size_type, size_type > TMaskedInterval
Type representing an interval selected for masking.
Definition: symdust.hpp:99
sequence_type::size_type size_type
Integer size type corresponding to sequence_type.
Definition: symdust.hpp:97
std::vector< TMaskedInterval > TMaskList
Type representing a list of masked intervals.
Definition: symdust.hpp:101
USING_SCOPE(objects)
std::unique_ptr< CSymDustMasker::TMaskList > GetDustMasks_SkipNs(objects::CSeqVector &seq, Uint4 level, Uint4 window, Uint4 linker)
void s_InsertMerge(CSymDustMasker::TMaskList &list, CSymDustMasker::TMaskedInterval &new_mask, Uint4 linker)
CSymDustMasker::TMaskList s_FindSegmentWithLongNs(const unsigned int MAX_Ns, objects::CSeqVector &seq)
Operators to edit gaps in sequences.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1292
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideLogfile
Hide log file description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
CArgAllow_Strings * Allow(const string &value)
Add allowed string values.
Definition: ncbiargs.cpp:4593
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
@ fBinary
Open file in binary mode.
Definition: ncbiargs.hpp:263
#define NULL
Definition: ncbistd.hpp:225
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
@ eBlast_filter_program_dust
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
int i
Lightweight interface for getting lines of data with minimal memory copying.
Contains the command line options common to filtering algorithms.
const char * kOutputFormats[]
Output formats allowed, the first one is the default.
const size_t kNumOutputFormats
Number of elements in kOutputFormats.
const std::string kOutput
Command line flag to specify the output.
const std::string kOutputFormat
Command line flag to specify the output format.
const char * kInputFormats[]
Input formats allowed, the first one is the default.
const std::string kInput
Command line flag to specify the input.
const std::string kInputFormat
Command line flag to specify the input format.
string BuildAlgorithmParametersString(const CArgs &args)
Builds an algorithm options string for the filtering applications (segmasker, dustmasker) by examinin...
NCBI C++ auxiliary debug macros.
static Format format
Definition: njn_ioutil.cpp:53
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
The Object manager core.
static SQLCHAR output[256]
Definition: print.c:5
CRef< objects::CObjectManager > om
Modified on Mon Mar 04 05:12:23 2024 by modify_doxy.py rev. 669887