NCBI C++ ToolKit
winmask_filter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: winmask_filter.cpp 93029 2021-03-01 15:21:15Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  * Initial Version Creation Date: April 17th, 2008
29  *
30  * File Description:
31  * Blast wrappers for WindowMasker filtering.
32  *
33  * */
34 
35 /// @file winmask_filter.cpp
36 /// Blast wrappers for WindowMasker filtering.
37 #include <ncbi_pch.hpp>
38 #include "winmask_filter.hpp"
39 #include <sstream>
40 #include <serial/iterator.hpp>
45 #include <objmgr/util/sequence.hpp>
47 
49 
54 #include "blast_setup.hpp"
55 
59 
62 #include <corelib/env_reg.hpp>
63 
64 /** @addtogroup AlgoBlast
65  *
66  * @{
67  */
68 
71 BEGIN_SCOPE(blast)
72 
/// File name of the WindowMasker statistics file. WindowMaskerTaxidToDb()
/// appends it to the path it builds from the WindowMasker directory and the
/// taxid when checking whether masking data exists.
73 static string s_WINDOW_MASKER_STAT_FILE_NAME("wmasker.obinary");
76 
77 int WindowMaskerPathInit(const string& window_masker_path)
78 {
79  if (CDirEntry(window_masker_path).GetType() != CDirEntry::eDir) {
80  return 1;
81  }
82  {
83  CMutexGuard guard(InitMutex);
84  s_WINDOW_MASKER_PATH = window_masker_path;
85  }
86  return 0;
87 }
89 {
    // Forget the stored WindowMasker path; done under InitMutex like every
    // other access to s_WINDOW_MASKER_PATH in this file.
90  CMutexGuard guard(InitMutex);
91  s_WINDOW_MASKER_PATH.clear();
92 }
94 {
    // Read the stored WindowMasker path while holding InitMutex; the value
    // is empty if no path has been set (or it has been reset).
95  CMutexGuard guard(InitMutex);
96  return s_WINDOW_MASKER_PATH;
97 }
98 
99 CSeqMasker* s_BuildSeqMasker(const string & lstat)
100 {
101  Uint1 arg_window_size = 0; // [allow setting of this field?]
102  Uint4 arg_window_step = 1;
103  Uint1 arg_unit_step = 1;
104  Uint4 arg_textend = 0; // [allow setting of this field?]
105  Uint4 arg_cutoff_score = 0; // [allow setting of this field?]
106  Uint4 arg_max_score = 0; // [allow setting of this field?]
107  Uint4 arg_min_score = 0; // [allow setting of this field?]
108  Uint4 arg_set_max_score = 0; // [allow setting of this field?]
109  Uint4 arg_set_min_score = 0; // [allow setting of this field?]
110  bool arg_merge_pass = false;
111  Uint4 arg_merge_cutoff_score = 0;
112  Uint4 arg_abs_merge_cutoff_dist = 0;
113  Uint4 arg_mean_merge_cutoff_dist = 0;
114  Uint1 arg_merge_unit_step = 0;
115  const string & arg_trigger = "mean";
116  Uint1 tmin_count = 0;
117  bool arg_discontig = false;
118  Uint4 arg_pattern = 0;
119 
120  // enable/disable some kind of optimization
121  bool arg_use_ba = true;
122 
123  // Get a sequence masker.
124 
125  CSeqMasker* masker = NULL;
126 
127  try {
128  masker = new CSeqMasker( lstat,
129  arg_window_size,
130  arg_window_step,
131  arg_unit_step,
132  arg_textend,
133  arg_cutoff_score,
134  arg_max_score,
135  arg_min_score,
136  arg_set_max_score,
137  arg_set_min_score,
138  arg_merge_pass,
139  arg_merge_cutoff_score,
140  arg_abs_merge_cutoff_dist,
141  arg_mean_merge_cutoff_dist,
142  arg_merge_unit_step,
143  arg_trigger,
144  tmin_count,
145  arg_discontig,
146  arg_pattern,
147  arg_use_ba );
148  }
149  catch(CException & e) {
150  NCBI_THROW(CBlastException, eSetup, e.what());
151  }
152 
153  return masker;
154 }
155 
157  const CSeq_loc & seqloc,
158  CSeq_id & query_id,
159  TMaskedQueryRegions * mqr,
160  CRef<CSeq_loc> * psl)
161 {
    // Converts every (first, second) pair in `masks` into an interval on
    // `query_id`, shifted by the start of `seqloc`, and records it in `mqr`
    // and/or `psl`. Either output pointer may be null, in which case that
    // output is skipped.
162  TSeqPos query_start = seqloc.GetStart(eExtreme_Positional);
163
164  // This needs to be examined further for places where a +1, -1,
165  // etc is needed due to biological vs. computer science offset
166  // notations.
167
168  ITERATE(CSeqMasker::TMaskList, pr, masks) {
    // NOTE(review): `ival` is declared on a line that is not visible here;
    // presumably a fresh CRef<CSeq_interval> per iteration -- confirm.
170
171  TSeqPos
172  start = pr->first,
173  end = pr->second;
174
175  ival->SetFrom (query_start + start);
176  ival->SetTo (query_start + end);
177  ival->SetId (query_id);
178  ival->SetStrand(eNa_strand_both);
179
180  if (mqr) {
    // The same interval is registered twice, once for frame +1 and once
    // for frame -1.
181  CRef<CSeqLocInfo> info_plus
182  (new CSeqLocInfo(&* ival, CSeqLocInfo::eFramePlus1));
183  mqr->push_back(info_plus);
184
185  CRef<CSeqLocInfo> info_minus
186  (new CSeqLocInfo(&* ival, CSeqLocInfo::eFrameMinus1));
187  mqr->push_back(info_minus);
188  }
189
190  if (psl) {
    // Create the output Seq-loc lazily, on the first interval added.
    // NOTE(review): if the caller passed in an existing Seq-loc that is
    // not a packed-int, SetPacked_int() presumably switches the variant
    // and drops the previous content -- confirm.
191  if (psl->Empty()) {
192  psl->Reset(new CSeq_loc);
193  }
194  (**psl).SetPacked_int().Set().push_back(ival);
195  }
196  }
197  if (psl && !psl->Empty())
198  {
    // Merge whatever is now in *psl (this runs even when `masks` was empty
    // but the caller supplied a non-empty Seq-loc) and convert the result
    // to a packed-int. kTopFlags is defined on a line not visible here.
200  CRef<CSeq_loc> tmp = (*psl)->Merge(kTopFlags, 0);
201  psl->Reset(tmp);
202  (*psl)->ChangeToPackedInt();
203  }
204
205 }
206 
207 // These templates only exist to reduce code duplication due to the
208 // TSeqLocVector / BlastQueryVector split. By parameterizing on the
209 // query container type, several functions can call these templates
210 // with different types of queries and options handles, and the
211 // appropriate number of "glue" functions will be generated to call
212 // the actual taxid / filename based implementations.
213 
// Options-based dispatcher shared by both query container types.
// Does nothing when `opts` is null.
214 template<class TQueries>
215 void
217  const CBlastOptions * opts)
218 {
219  if (! opts)
220  return;
221
    // A configured WindowMasker database is checked first; the taxid is only
    // consulted when no database is set. If neither is set nothing happens.
    // NOTE(review): the calls made inside the two branches are on lines not
    // visible here.
222  if (opts->GetWindowMaskerDatabase()) {
224  } else if (opts->GetWindowMaskerTaxId()) {
226  }
227 }
228 
// Options-handle variant of the dispatcher. Does nothing when
// `opts_handle` is null.
229 template<class TQueries>
230 void
232  const CBlastOptionsHandle * opts_handle)
233 {
234  if (! opts_handle)
235  return;
236
    // NOTE(review): the forwarding call is on a line not visible here;
    // presumably it delegates using the options held by the handle --
    // confirm.
238 }
239 
240 // These four functions exist to provide non-template public
241 // interfaces; the work is done in the two templates above this to
242 // reduce duplication.
243 
// NOTE(review): for the four overloads below, the function-name line and the
// single body statement are not visible here. Per the comment preceding
// them they are thin non-template entry points that forward to the two
// templates; which query container each one takes should be confirmed
// against the header.

// Overload taking a CBlastOptions pointer.
244 void
246  const CBlastOptions * opts)
247 {
249 }
250
// Overload taking a CBlastOptions pointer.
251 void
253  const CBlastOptions * opts)
254 {
256 }
257
// Overload taking a CBlastOptionsHandle pointer.
258 void
260  const CBlastOptionsHandle * opts)
261 {
263 }
264
// Overload taking a CBlastOptionsHandle pointer.
265 void
267  const CBlastOptionsHandle * opts)
268 {
270 }
271 
272 // These two functions do the actual work. If either is changed, the
273 // other should be too. The TSeqLocVector vs. BlastQueryVector
274 // differences could be factored out into a wrapper that isolates the
275 // differences so that the algorithm is not duplicated. Another
276 // alternative is to (continue to) replace TSeqLocVector with
277 // CBlastQueryVector as was originally planned.
278 
/// Mask every query of a CBlastQueryVector with a masker built from `lstat`
/// and store the result as that query's masked regions.
///
/// @param queries  Queries to mask; each query's masked regions are replaced.
/// @param lstat    Passed to s_BuildSeqMasker() to create the masker.
279 void
280 Blast_FindWindowMaskerLoc(CBlastQueryVector & queries, const string & lstat)
281 {
    // AutoPtr takes ownership of the masker returned by s_BuildSeqMasker().
282  AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
283
284  for(size_t j = 0; j < queries.Size(); j++) {
    // NOTE(review): `query` is declared on a line not visible here;
    // presumably the j-th CBlastSearchQuery -- confirm.
286
287  // Get SeqVector, query Seq-id, and range.
288
289  CConstRef<CSeq_loc> seqloc = query.GetQuerySeqLoc();
290
    // NOTE(review): the remaining CSeqVector constructor arguments are on
    // lines not visible here.
291  CSeqVector psv(*seqloc,
292  *queries.GetScope(j),
295
    // NOTE(review): GetId() is dereferenced unchecked; it is documented as
    // returning NULL for locations with no id or multiple ids -- confirm
    // callers guarantee a single id.
296  CRef<CSeq_id> query_seq_id(new CSeq_id);
297  query_seq_id->Assign(*seqloc->GetId());
298
299  // Mask the query.
300
301  AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
302
    // NOTE(review): `mqr` is declared on a line not visible here.
304
    // Only the TMaskedQueryRegions output is requested; the Seq-loc output
    // pointer is passed as null.
305  s_BuildMaskedRanges(*pos_masks,
306  *seqloc,
307  *query_seq_id,
308  & mqr,
309  0);
310
311  query.SetMaskedRegions(mqr);
312  }
313 }
314 
315 void
316 Blast_FindWindowMaskerLoc(TSeqLocVector & queries, const string & lstat)
317 {
318  AutoPtr<CSeqMasker> masker(s_BuildSeqMasker(lstat));
319 
320  for(size_t j = 0; j < queries.size(); j++) {
321  // Get SeqVector, query Seq-id, and range.
322 
323  CConstRef<CSeq_loc> seqloc = queries[j].seqloc;
324 
325  CSeqVector psv(*seqloc,
326  *queries[j].scope,
329 
330  CRef<CSeq_id> query_seq_id(new CSeq_id);
331  query_seq_id->Assign(*seqloc->GetId());
332 
333  // Mask the query.
334 
335  AutoPtr<CSeqMasker::TMaskList> pos_masks((*masker)(psv));
336 
337  s_BuildMaskedRanges(*pos_masks,
338  *seqloc,
339  *query_seq_id,
340  0,
341  & queries[j].mask);
342 
343  if( queries[0].mask ) {
344  CPacked_seqint::Tdata & seqint_list =
345  queries[0].mask->SetPacked_int().Set();
346 
347  NON_CONST_ITERATE(CPacked_seqint::Tdata, itr, seqint_list) {
348  if ((*itr)->CanGetStrand()) {
349  switch((*itr)->GetStrand()) {
350  case eNa_strand_unknown:
351  case eNa_strand_both:
352  case eNa_strand_plus:
353  (*itr)->ResetStrand();
354  break;
355 
356  default:
357  break;
358  }
359  }
360  }
361  }
362  }
363 }
364 
365 /// Find the path to the window masker files, first checking the (optionally
366 /// set) value passed to the WindowMaskerPathInit function, then the
367 /// environment variable WINDOW_MASKER_PATH, then the section WINDOW_MASKER,
368 /// label WINDOW_MASKER_PATH in the NCBI configuration file. If not found in
369 /// any of these locations, return the current working directory.
370 /// @sa s_FindPathToGeneInfoFiles
371 static string
373 {
    // A path stored through WindowMaskerPathInit() wins over everything else.
374  string retval = WindowMaskerPathGet();
375  if ( !retval.empty() ) {
376  return retval;
377  }
    // The same name, WINDOW_MASKER_PATH, is used as the entry name looked up
    // in section WINDOW_MASKER below.
378  const string kEnvVar("WINDOW_MASKER_PATH");
379  const string kSection("WINDOW_MASKER");
    // NOTE(review): most of the registry construction and the definition of
    // `env_reg` are on lines not visible here; presumably an environment
    // registry is layered over the NCBI configuration file -- confirm.
380  CNcbiIstrstream empty_stream(kEmptyStr);
381  CRef<CNcbiRegistry> reg(new CNcbiRegistry(empty_stream,
384  kEmptyStr));
387  reg->Add(*env_reg, CNcbiRegistry::ePriority_MaxUser);
388  retval = reg->Get(kSection, kEnvVar);
    // Nothing configured anywhere: fall back to the current directory.
389  if (retval == kEmptyStr) {
390  retval = CDir::GetCwd();
391  }
392 #if defined(NCBI_OS_MSWIN)
393  // We address this here otherwise CDirEntry::IsAbsolutePath() fails
    // NOTE(review): the check is for a leading "//", but ReplaceInPlace is
    // called without a replacement limit, so presumably every "//" in the
    // path is rewritten, not only the prefix -- confirm.
394  if (NStr::StartsWith(retval, "//")) {
395  NStr::ReplaceInPlace(retval, "//", "\\\\");
396  }
397 #endif
398  return retval;
399 }
400 
/// Locate the WindowMasker statistics file for a taxid under a given
/// WindowMasker directory.
///
/// @param window_masker_path  Directory that holds the per-taxid data.
/// @param taxid               Taxonomy id whose data is wanted.
/// @return Full path of the statistics file if it exists, otherwise an
///         empty string.
401 string WindowMaskerTaxidToDb(const string& window_masker_path, int taxid)
402 {
403  string path = window_masker_path;
    // NOTE(review): the continuation of this statement is on a line not
    // visible here; presumably it appends a trailing path separator so the
    // file name can be concatenated directly below -- confirm.
404  path += CFile::GetPathSeparator() + NStr::IntToString(taxid)
406  const string binpath = path + s_WINDOW_MASKER_STAT_FILE_NAME;
407  return (CFile(binpath).Exists() ? binpath : kEmptyStr);
408 }
409 
410 /* Unit test is in bl2seq_unit_test.cpp */
411 string WindowMaskerTaxidToDb(int taxid)
412 {
413  string path = s_FindPathToWM();
414  return WindowMaskerTaxidToDb(path, taxid);
415 }
416 
// Taxid-based entry point: resolves the taxid to a WindowMasker database
// path and masks `queries` with it.
417 void
419 {
    // NOTE(review): WindowMaskerTaxidToDb() yields an empty string when no
    // data file exists for the taxid, and that value is passed on unchecked;
    // confirm how the masker construction reacts to an empty name.
420  string db = WindowMaskerTaxidToDb(taxid);
421  Blast_FindWindowMaskerLoc(queries, db);
422 }
423 
// Taxid-based entry point: resolves the taxid to a WindowMasker database
// path and masks `queries` with it.
424 void
426 {
    // NOTE(review): WindowMaskerTaxidToDb() yields an empty string when no
    // data file exists for the taxid, and that value is passed on unchecked;
    // confirm how the masker construction reacts to an empty name.
427  string db = WindowMaskerTaxidToDb(taxid);
428  Blast_FindWindowMaskerLoc(queries, db);
429 }
430 
/// Collect taxids by searching two directory levels below the WindowMasker
/// path (pattern <wmpath>/*/*.*/...); used as a fallback by
/// GetTaxIdWithWindowMaskerSupport().
///
/// @param supported_taxids  Cleared, then filled with the taxids found.
431 static void s_OldGetTaxIdWithWindowMaskerSupport(set<int>& supported_taxids)
432 {
433  supported_taxids.clear();
434  CNcbiOstrstream oss;
435  const string wmpath = s_FindPathToWM();
    // NOTE(review): the tail of this search pattern is on a line not visible
    // here; presumably it ends with the statistics file name -- confirm.
436  oss << wmpath << CFile::GetPathSeparator() << "*"
437  << CFile::GetPathSeparator() << "*.*"
439  const string path = CNcbiOstrstreamToString(oss);
440
441  list<string> builds;
442  FindFiles(path, builds, fFF_File);
    // Note: the loop variable `path` shadows the pattern string above.
443  NON_CONST_ITERATE(list<string>, path, builds) {
444  // remove the WindowMasker path and path separator
445  path->erase(0, wmpath.size() + 1);
446  // then remove the remaining path
    // NOTE(review): if no separator is left, find() returns npos and
    // erase(npos) throws std::out_of_range; presumably impossible for
    // matches of the pattern above -- confirm.
447  const size_t pos = path->find(CFile::GetPathSeparator());
448  path->erase(pos);
    // NOTE(review): conversion errors are not thrown; a non-numeric
    // directory name presumably ends up inserted as taxid 0 -- confirm.
449  const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
450  supported_taxids.insert(taxid);
451  }
452 }
453 
455 {
    // Fills `supported_taxids` with the taxids whose directory, one level
    // below the WindowMasker path, matches the search pattern. If that
    // finds nothing, the older two-level layout is tried instead.
456  supported_taxids.clear();
457  CNcbiOstrstream oss;
458  const string wmpath = s_FindPathToWM();
    // NOTE(review): the tail of this search pattern is on a line not visible
    // here; presumably it ends with the statistics file name -- confirm.
459  oss << wmpath << CFile::GetPathSeparator() << "*"
461  const string path = CNcbiOstrstreamToString(oss);
462
463  list<string> builds;
464  FindFiles(path, builds, fFF_File);
    // Note: the loop variable `path` shadows the pattern string above.
465  NON_CONST_ITERATE(list<string>, path, builds) {
466  // remove the WindowMasker path and path separator
467  path->erase(0, wmpath.size() + 1);
468  // then remove the remaining path
    // NOTE(review): if no separator is left, find() returns npos and
    // erase(npos) throws std::out_of_range; presumably impossible for
    // matches of the pattern above -- confirm.
469  const size_t pos = path->find(CFile::GetPathSeparator());
470  path->erase(pos);
    // NOTE(review): conversion errors are not thrown; a non-numeric
    // directory name presumably ends up inserted as taxid 0 -- confirm.
471  const int taxid = NStr::StringToInt(*path, NStr::fConvErr_NoThrow);
472  supported_taxids.insert(taxid);
473  }
474
    // Fall back to the older directory layout when nothing was found.
475  if (supported_taxids.empty()) {
476  s_OldGetTaxIdWithWindowMaskerSupport(supported_taxids);
477  }
478 }
479 
480 END_SCOPE(blast)
482 
483 /* @} */
#define static
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
BLAST filtering functions.
Structures and API used for saving BLAST hits.
Declares the CBlastNucleotideOptionsHandle class.
Declaration of ADT to retrieve sequences for the BLAST engine.
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Definitions of special type used in BLAST.
ncbi::TMaskedQueryRegions mask
AutoPtr –.
Definition: ncbimisc.hpp:401
Defines BLAST error codes (user errors included)
Handle to the options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
Query Vector.
Definition: sseqloc.hpp:276
CRef< objects::CScope > GetScope(size_type i) const
Get the scope containing a query by index.
Definition: sseqloc.hpp:322
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
CRef< CBlastSearchQuery > GetBlastSearchQuery(size_type i) const
Get the CBlastSearchQuery object at index i.
Definition: sseqloc.hpp:367
Search Query.
Definition: sseqloc.hpp:147
CDirEntry –.
Definition: ncbifile.hpp:262
CEnvironmentRegistry –.
Definition: env_reg.hpp:87
CFile –.
Definition: ncbifile.hpp:1604
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CNcbiRegistry –.
Definition: ncbireg.hpp:913
structure for seqloc info
Definition: seqlocinfo.hpp:48
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
Definition: seq_masker.hpp:74
CSeqVector –.
Definition: seq_vector.hpp:65
CSimpleEnvRegMapper –.
Definition: env_reg.hpp:157
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
void clear()
Definition: set.hpp:153
bool empty() const
Definition: set.hpp:133
Classes to support using environment variables as a backend for the registry framework.
void Blast_FindWindowMaskerLoc_Fwd(TQueries &query, const CBlastOptions *opts)
int GetWindowMaskerTaxId() const
Returns the tax id used for the windowmasker database to use, if set via SetWindowMaskerTaxId (otherw...
DEFINE_STATIC_MUTEX(InitMutex)
string WindowMaskerTaxidToDb(int taxid)
Get the windowmasker file path for a given taxid.
static string s_FindPathToWM(void)
Find the path to the window masker files, first checking the (optionally set) value passed to the Win...
void WindowMaskerPathReset()
Resets the path to the windowmasker data files.
void GetTaxIdWithWindowMaskerSupport(set< int > &supported_taxids)
This function returns a list of NCBI taxonomy IDs for which there exists windowmasker masking data to...
CSeqMasker * s_BuildSeqMasker(const string &lstat)
void Blast_FindWindowMaskerLoc(CBlastQueryVector &query, const CBlastOptions *opts)
Find Window Masker filtered locations using a BlastOptions.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
static string s_WINDOW_MASKER_STAT_FILE_NAME("wmasker.obinary")
void s_BuildMaskedRanges(CSeqMasker::TMaskList &masks, const CSeq_loc &seqloc, CSeq_id &query_id, TMaskedQueryRegions *mqr, CRef< CSeq_loc > *psl)
static void s_OldGetTaxIdWithWindowMaskerSupport(set< int > &supported_taxids)
const char * GetWindowMaskerDatabase() const
Return the name of the windowmasker database to use.
static string s_WINDOW_MASKER_PATH(kEmptyStr)
string WindowMaskerPathGet()
Retrieves the path to the windowmasker data files.
int WindowMaskerPathInit(const string &window_masker_path)
Initialize the path to the windowmasker data files.
void Blast_FindWindowMaskerLocTaxId(CBlastQueryVector &queries, int taxid)
Find Window Masker filtered locations by taxonomic ID.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void FindFiles(TPathIterator path_begin, TPathIterator path_end, const vector< string > &masks, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Generic algorithm for file search.
Definition: ncbifile.hpp:3145
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
static string GetCwd(void)
Get the current working directory.
Definition: ncbifile.cpp:3708
@ eDir
Directory.
Definition: ncbifile.hpp:784
@ fFF_File
find files
Definition: ncbifile.hpp:3008
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
@ fStrand_Ignore
Definition: Seq_loc.hpp:325
@ fMerge_All
Definition: Seq_loc.hpp:331
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
void Add(const IRegistry &reg, TPriority prio=ePriority_Default, const string &name=kEmptyStr)
Non-empty names must be unique within each compound registry, but there is no limit to the number of ...
Definition: ncbireg.cpp:1779
void AddMapper(const IEnvRegMapper &mapper, TPriority prio=ePriority_Default)
Definition: env_reg.cpp:71
@ fWithNcbirc
Include .ncbirc (used only by CNcbiReg.)
Definition: ncbireg.hpp:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
void SetTo(TTo value)
Assign a value to To data member.
list< CRef< CSeq_interval > > Tdata
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
virtual void Reset(void)
Reset the whole object.
Definition: Seq_loc_.cpp:59
void SetStrand(TStrand value)
Assign a value to Strand data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
Main class to perform a BLAST search on the local machine.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
static char tmp[2048]
Definition: utf8.c:42
Implementation of the BlastSeqSrc interface using the C++ BLAST databases API.
const char *const kSection
Definition: snpptis.cpp:56
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
static string query
Interface to retrieve list of available windowmasker filtering.
Blast wrappers for WindowMasker filtering.
#define const
Definition: zconf.h:230
Modified on Sun Mar 03 03:15:41 2024 by modify_doxy.py rev. 669887