NCBI C++ ToolKit
seq_masker.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_masker.cpp 98106 2022-09-29 01:34:59Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * CSeqMasker class member and method definitions.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbi_limits.h>
36 
48 
49 #include <algorithm>
50 #include <memory>
51 #include <sstream>
52 
53 
56 
57 #define WIN_MASK_ALGO_NAME "window-masker-algorithm"
58 #define WIN_MASK_ALGO_VER_MAJOR 1
59 #define WIN_MASK_ALGO_VER_MINOR 0
60 #define WIN_MASK_ALGO_VER_PATCH 0
61 
62 //-------------------------------------------------------------------------
68 );
69 
70 //-------------------------------------------------------------------------
71 CSeqMasker::CSeqMasker( const string & lstat_name,
72  Uint1 arg_window_size,
73  Uint4 arg_window_step,
74  Uint1 arg_unit_step,
75  Uint4 arg_textend,
76  Uint4 arg_cutoff_score,
77  Uint4 arg_max_score,
78  Uint4 arg_min_score,
79  Uint4 arg_set_max_score,
80  Uint4 arg_set_min_score,
81  bool arg_merge_pass,
82  Uint4 arg_merge_cutoff_score,
83  Uint4 arg_abs_merge_cutoff_dist,
84  Uint4 arg_mean_merge_cutoff_dist,
85  Uint1 arg_merge_unit_step,
86  const string & arg_trigger,
87  Uint1 tmin_count,
88  bool arg_discontig,
89  Uint4 arg_pattern,
90  bool arg_use_ba,
91  double min_pct,
92  double extend_pct,
93  double thres_pct,
94  double max_pct )
95  : ustat( CSeqMaskerIstatFactory::create( lstat_name,
96  arg_cutoff_score,
97  arg_textend,
98  arg_max_score,
99  arg_set_max_score,
100  arg_min_score,
101  arg_set_min_score,
102  arg_use_ba,
103  min_pct,
104  extend_pct,
105  thres_pct,
106  max_pct ) ),
107  score( NULL ), score_p3( NULL ), trigger_score( NULL ),
108  window_size( arg_window_size ), window_step( arg_window_step ),
109  unit_step( arg_unit_step ),
110  merge_pass( arg_merge_pass ),
111  merge_cutoff_score( arg_merge_cutoff_score ),
112  abs_merge_cutoff_dist( arg_abs_merge_cutoff_dist ),
113  mean_merge_cutoff_dist( arg_mean_merge_cutoff_dist ),
114  merge_unit_step( arg_merge_unit_step ),
115  trigger( arg_trigger == "mean" ? eTrigger_Mean
116  : eTrigger_Min ),
117  discontig( arg_discontig ), pattern( arg_pattern )
118 {
119  if( window_size == 0 ) window_size = ustat->UnitSize() + 4;
120 
121  if( window_size < ustat->UnitSize() ) {
122  std::ostringstream os;
123  os << "window size (" << (int)window_size << ") "
124  "must be greater or equal to unit size (" <<
125  (int)ustat->UnitSize() << ")";
126  NCBI_THROW( CSeqMaskerException, eValidation, os.str() );
127  }
128 
130 
131  if( trigger == eTrigger_Min )
132  trigger_score = new CSeqMaskerScoreMin( ustat, tmin_count );
133 
134  if( !score )
135  {
137  eScoreAllocFail,
138  "" );
139  }
140 
141  if( arg_merge_pass )
142  {
144 
145  if( !score )
146  {
148  eScoreP3AllocFail,
149  "" );
150  }
151  }
152 }
153 
154 //-------------------------------------------------------------------------
156 {
157  if( trigger_score != score ) delete trigger_score;
158 
159  delete score;
160  delete score_p3;
161 }
162 
163 //-------------------------------------------------------------------------
166 { return DoMask( data, 0, data.size() ); }
167 
168 //-------------------------------------------------------------------------
171  const CSeqVector& data, TSeqPos begin, TSeqPos stop ) const
172 {
173  ustat->total_ = 0;
174  unique_ptr<TMaskList> mask(new TMaskList);
175  Uint4 cutoff_score = ustat->get_threshold();
176  Uint4 textend = ustat->get_textend();
178  Uint4 unit_size = ustat->UnitSize() + nbits;
179  unique_ptr<CSeqMaskerWindow> window_ptr
180  (discontig ? new CSeqMaskerWindowPattern( data, unit_size,
182  pattern, unit_step )
183  : new CSeqMaskerWindow( data, unit_size,
185  unit_step, begin, stop ));
186  CSeqMaskerWindow & window = *window_ptr;
187  score->SetWindow( window );
188 
189  if( trigger == eTrigger_Min ) trigger_score->SetWindow( window );
190 
191  Uint4 start = 0, end = 0, cend = 0;
192  Uint4 limit = textend;
195 
196  CSeqMaskerCacheBoost booster( window, od );
197 
198  while( window )
199  {
200  Uint4 ts = (*trigger_score)();
201  Uint4 s = (*score)();
202  Uint4 adv = window_step;
203 
204  if( s < limit )
205  {
206  if( end > start )
207  {
208  if( window.Start() > cend )
209  {
210  mask->push_back( TMaskedInterval( start, end ) );
211  start = end = cend = 0;
212  }
213  }
214 
215  if( od != 0 && od->cba_ != 0 )
216  {
217  adv = window.Start();
218 
219  if( !booster.Check() )
220  break;
221 
222  adv = window_step*( 1 + window.Start() - adv );
223  }
224  }
225  else if( ts < cutoff_score )
226  {
227  if( end > start )
228  {
229  if( window.Start() > cend + 1 )
230  {
231  mask->push_back( TMaskedInterval( start, end ) );
232  start = end = cend = 0;
233  }
234  else cend = window.End();
235  }
236  }
237  else
238  {
239  if( end > start )
240  {
241  if( window.Start() > cend + 1 )
242  {
243  mask->push_back( TMaskedInterval( start, end ) );
244  start = window.Start();
245  }
246  }
247  else start = window.Start();
248 
249  cend = end = window.End();
250  }
251 
252 
253  if( adv == window_step )
254  ++window;
255 
256  score->PostAdvance( adv );
257  }
258 
259  if( end > start )
260  mask->push_back( TMaskedInterval( start, end ) );
261 
262  window_ptr.reset();
263 
264  if( merge_pass )
265  {
267  Uint4 unit_size = ustat->UnitSize() + nbits;
268 
269  if( mask->size() < 2 ) return mask.release();
270 
271  TMList masked, unmasked;
272  TMaskList::iterator jtmp = mask->end();
273 
274  {{
275  for( TMaskList::iterator i = mask->begin(), j = --jtmp;
276  i != j; )
277  {
278  masked.push_back( mitem( i->first, i->second, unit_size,
279  data, *this ) );
280  Uint4 nstart = (i++)->second - unit_size + 2;
281  unmasked.push_back( mitem( nstart, i->first + unit_size - 2,
282  unit_size, data, *this ) );
283  }
284 
285  masked.push_back( mitem( (mask->rbegin())->first,
286  (mask->rbegin())->second,
287  unit_size, data, *this ) );
288  }}
289 
290  Int4 count = 0;
291  TMList::iterator ii = masked.begin();
292  TMList::iterator j = unmasked.begin();
293  TMList::iterator k = ii, l = ii;
294  --k; ++l;
295 
296  for( ; ii != masked.end(); k = l = ii, --k, ++l )
297  {
298  Uint4 ldist = (ii != masked.begin())
299  ? ii->start - k->end - 1 : 0;
300  TMList::iterator tmpend = masked.end();
301  --tmpend;
302  Uint4 rdist = (ii != tmpend)
303  ? l->start - ii->end - 1 : 0;
304  double lavg = 0.0, ravg = 0.0;
305  bool can_go_left = count && ldist
306  && ldist <= mean_merge_cutoff_dist;
307  bool can_go_right = rdist
308  && rdist <= mean_merge_cutoff_dist;
309 
310  if( can_go_left )
311  {
312  TMList::iterator tmp = j; --tmp;
313  lavg = MergeAvg( k, tmp, unit_size );
314  can_go_left = can_go_left && (lavg >= merge_cutoff_score);
315  }
316 
317  if( can_go_right )
318  {
319  ravg = MergeAvg( ii, j, unit_size );
320  can_go_right = can_go_right && (ravg >= merge_cutoff_score);
321  }
322 
323  if( can_go_right )
324  {
325  if( can_go_left )
326  {
327  if( ravg >= lavg )
328  {
329  ++count;
330  ++ii;
331  ++j;
332  }
333  else // count must be greater than 0.
334  {
335  --count;
336  k->avg = MergeAvg( k, --j, unit_size );
337  _TRACE( "Merging "
338  << k->start << " - " << k->end
339  << " and "
340  << ii->start << " - " << ii->end );
341  Merge( masked, k, unmasked, j );
342 
343  if( count )
344  {
345  ii = --k;
346  --j;
347  --count;
348  }
349  else ii = k;
350  }
351  }
352  else
353  {
354  ++count;
355  ++ii;
356  ++j;
357  }
358  }
359  else if( can_go_left )
360  {
361  --count;
362  k->avg = MergeAvg( k, --j, unit_size );
363  _TRACE( "Merging "
364  << k->start << " - " << k->end
365  << " and "
366  << ii->start << " - " << ii->end );
367  Merge( masked, k, unmasked, j );
368 
369  if( count )
370  {
371  ii = --k;
372  --j;
373  --count;
374  }
375  else ii = k;
376  }
377  else
378  {
379  ++ii;
380  ++j;
381  count = 0;
382  }
383  }
384 
385  for( ii = masked.begin(), j = unmasked.begin(), k = ii++;
386  ii != masked.end(); (k = ii++), j++ )
387  {
388  if( k->end + abs_merge_cutoff_dist >= ii->start )
389  {
390  _TRACE( "Unconditionally merging "
391  << k->start << " - " << k->end
392  << " and "
393  << ii->start << " - " << ii->end );
394  k->avg = MergeAvg( k, j, unit_size );
395  Merge( masked, k, unmasked, j );
396  ii = k;
397 
398  if( ++ii == masked.end() ) break;
399  }
400  }
401 
402  mask->clear();
403 
404  for( TMList::const_iterator iii = masked.begin(); iii != masked.end(); ++iii )
405  mask->push_back( TMaskedInterval( iii->start, iii->end ) );
406  }
407 
408  return mask.release();
409 }
410 
411 //-------------------------------------------------------------------------
412 double CSeqMasker::MergeAvg( TMList::iterator mi,
413  const TMList::iterator & umi,
414  Uint4 unit_size ) const
415 {
416  TMList::iterator tmp = mi++;
417  Uint4 n1 = (tmp->end - tmp->start - unit_size + 2)/merge_unit_step;
418  Uint4 n2 = (umi->end - umi->start - unit_size + 2)/merge_unit_step;
419  Uint4 n3 = (mi->end - mi->start - unit_size + 2)/merge_unit_step;
420  Uint4 N = (mi->end - tmp->start - unit_size + 2)/merge_unit_step;
421  double a1 = tmp->avg, a2 = umi->avg, a3 = mi->avg;
422  return (a1*n1 + a2*n2 + a3*n3)/N;
423 }
424 
425 //-------------------------------------------------------------------------
426 void CSeqMasker::Merge( TMList & m, TMList::iterator mi,
427  TMList & um, TMList::iterator & umi ) const
428 {
429  TMList::iterator tmp = mi++;
430  tmp->end = mi->end;
431  m.erase( mi );
432  umi = um.erase( umi );
433 }
434 
435 //----------------------------------------------------------------------------
437 {
438  switch( GetErrCode() )
439  {
441 
442  return "can not open input stream";
443 
444  case eLstatSyntax:
445 
446  return "syntax error";
447 
448  case eLstatParam:
449 
450  return "the following parameters could not be determined"
451  " from the unit frequency database or command line: ";
452 
453  case eScoreAllocFail:
454 
455  return "score function object allocation failed";
456 
457  case eScoreP3AllocFail:
458 
459  return "merge pass score function object allocation failed";
460 
461  case eValidation:
462 
463  return "validation error";
464 
465  default:
466 
468  }
469 }
470 
471 //----------------------------------------------------------------------------
472 CSeqMasker::mitem::mitem( Uint4 arg_start, Uint4 arg_end, Uint1 unit_size,
473  const CSeqVector & data, const CSeqMasker & owner )
474  : start( arg_start ), end( arg_end ), avg( 0.0 )
475 {
476  const Uint1 & window_size = owner.window_size;
477  const CSeqMaskerWindow::TUnit & ambig_unit = owner.ustat->AmbigUnit();
478  CSeqMaskerScore * const score = owner.score_p3;
479  CSeqMaskerWindow * window = NULL;
480 
481  if( owner.discontig )
482  window = new CSeqMaskerWindowPatternAmbig( data, unit_size, window_size,
483  owner.merge_unit_step,
484  owner.pattern, ambig_unit,
485  start,
486  owner.merge_unit_step );
487  else
488  window = new CSeqMaskerWindowAmbig( data, unit_size, window_size,
489  owner.merge_unit_step,
490  ambig_unit, start,
491  owner.merge_unit_step );
492 
493  score->SetWindow( *window );
494  Uint4 step = window->Step();
495 
496  while( window->End() < end )
497  {
498  score->PreAdvance( step );
499  ++*window;
500  score->PostAdvance( step );
501  }
502 
503  avg = (*score)();
504  delete window;
505 }
506 
507 //----------------------------------------------------------------------------
508 void CSeqMasker::MergeMaskInfo( TMaskList * dest, const TMaskList * src )
509 {
510  if( src->empty() )
511  return;
512 
513  TMaskList::const_iterator si( src->begin() );
514  TMaskList::const_iterator send( src->end() );
515  TMaskList::iterator di( dest->begin() );
516  TMaskList::iterator dend( dest->end() );
517  TMaskList res;
518  TMaskedInterval seg;
519  TMaskedInterval next_seg;
520 
521  if( di != dend && di->first < si->first )
522  seg = *(di++);
523  else seg = *(si++);
524 
525  while( true )
526  {
527  if( si != send ) {
528  if( di != dend ) {
529  if( si->first < di->first ) {
530  next_seg = *(si++);
531  } else {
532  next_seg = *(di++);
533  }
534  } else {
535  next_seg = *(si++);
536  }
537  } else if( di != dend ) {
538  next_seg = *(di++);
539  } else {
540  break;
541  }
542 
543  if( seg.second + 1 < next_seg.first ) {
544  res.push_back( seg );
545  seg = next_seg;
546  }
547  else if( seg.second < next_seg.second ) {
548  seg.second = next_seg.second;
549  }
550  }
551 
552  res.push_back( seg );
553  dest->swap(res);
554 }
555 
556 
ncbi::TMaskedQueryRegions mask
Interface to the bit array used to check if the score of a unit is below t_extend.
bool Check()
Check if the current state of the window and advance.
Factory class to generate an appropriate CSeqMaskerIstat derived class based on the format name.
CSeqMaskerWindow::TUnit AmbigUnit() const
Get the value of the unit used to represent an ambuguity.
Uint4 get_textend() const
Get the value of T_extend.
virtual Uint1 UnitSize() const =0
Get the unit size.
const optimization_data * get_optimization_data() const
Get the data structure optimization parameters.
Uint4 get_threshold() const
Get the value of T_threshold.
Average unit score form the start of the sequence to the end of current window.
Score function object computing mean of unit in a window.
The score function object that computes maxmin of k consecutive units in a window.
Abstract base class for score function objects.
virtual void PreAdvance(Uint4 step)=0
Window advancement notification.
void SetWindow(const CSeqMaskerWindow &new_window)
Set the window object that should be used for score computation.
virtual void PostAdvance(Uint4 step)=0
Window advancement notification.
static Uint1 BitCount(Uint4 mask, Uint1 bit_value=1)
Count the bits with given value in a given bit pattern.
Windows with units that may contain ambiguities.
Window iterator for discontiguous units used for the merging pass.
Window iterator used for discontiguous units.
Sliding window skipping over the ambiguities.
Uint4 Step() const
Get the current value of the window step.
Uint4 End() const
Get the current ending position of the window.
Uint4 Start() const
Get the current starting position of the window.
Uint4 TUnit
Integer type used to represent units within a window.
Represents different error situations that can occur in the masking process.
Definition: seq_masker.hpp:81
@ eValidation
Insconsistent internal parameters.
Definition: seq_masker.hpp:94
@ eLstatSyntax
Error parsing the length statistics file.
Definition: seq_masker.hpp:90
@ eLstatParam
Error deducing parameters from lstat or command line.
Definition: seq_masker.hpp:91
@ eScoreAllocFail
Error allocating the score function object.
Definition: seq_masker.hpp:92
@ eLstatStreamIpenFail
Error opening the length statistics file.
Definition: seq_masker.hpp:89
@ eScoreP3AllocFail
Error allocating the score function object for merging pass.
Definition: seq_masker.hpp:93
virtual const char * GetErrCodeString() const override
Get the exception description string.
Definition: seq_masker.cpp:436
Main interface to window based masker functionality.
Definition: seq_masker.hpp:53
Uint1 unit_step
Definition: seq_masker.hpp:316
void Merge(TMList &m, TMList::iterator mi, TMList &um, TMList::iterator &umi) const
Definition: seq_masker.cpp:426
list< mitem > TMList
Definition: seq_masker.hpp:234
Uint1 merge_unit_step
Definition: seq_masker.hpp:349
~CSeqMasker()
Object destructor.
Definition: seq_masker.cpp:155
CSeqMaskerScore * score
Definition: seq_masker.hpp:285
static void MergeMaskInfo(TMaskList *dest, const TMaskList *src)
Merge together two result lists.
Definition: seq_masker.cpp:508
@ eTrigger_Min
Using min score of k unit in the window.
Definition: seq_masker.hpp:357
Uint4 merge_cutoff_score
Definition: seq_masker.hpp:327
CSeqMaskerScore * trigger_score
Definition: seq_masker.hpp:295
Uint4 abs_merge_cutoff_dist
Definition: seq_masker.hpp:333
bool merge_pass
Definition: seq_masker.hpp:321
CSeqMaskerScore * score_p3
Definition: seq_masker.hpp:290
pair< TSeqPos, TSeqPos > TMaskedInterval
Type representing a masked interval within a sequence.
Definition: seq_masker.hpp:67
Uint4 mean_merge_cutoff_dist
Definition: seq_masker.hpp:339
TMaskList * DoMask(const objects::CSeqVector &data, TSeqPos start, TSeqPos end) const
Definition: seq_masker.cpp:170
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
Definition: seq_masker.hpp:74
TMaskList * operator()(const objects::CSeqVector &data) const
Sequence masking operator.
Definition: seq_masker.cpp:165
Uint4 pattern
Definition: seq_masker.hpp:368
double MergeAvg(TMList::iterator mi, const TMList::iterator &umi, Uint4 unit_size) const
Definition: seq_masker.cpp:412
Uint1 window_size
Definition: seq_masker.hpp:300
bool discontig
Definition: seq_masker.hpp:363
enum CSeqMasker::@32 trigger
static CSeqMaskerVersion AlgoVersion
Version of window masking algorithm.
Definition: seq_masker.hpp:57
Uint4 window_step
Definition: seq_masker.hpp:308
CRef< CSeqMaskerIstat > ustat
Definition: seq_masker.hpp:280
CSeqMasker(const string &lstat_name, Uint1 arg_window_size, Uint4 arg_window_step, Uint1 arg_unit_step, Uint4 arg_textend, Uint4 arg_cutoff_score, Uint4 arg_max_score, Uint4 arg_min_score, Uint4 arg_set_max_score, Uint4 arg_set_min_score, bool arg_merge_pass, Uint4 arg_merge_cutoff_score, Uint4 arg_abs_merge_cutoff_dist, Uint4 arg_mean_merge_cutoff_dist, Uint1 arg_merge_unit_step, const string &arg_trigger, Uint1 tmin_count, bool arg_discontig, Uint4 arg_pattern, bool arg_use_ba, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Object constructor.
Definition: seq_masker.cpp:71
CSeqVector –.
Definition: seq_vector.hpp:65
static ulg window_size
static const char si[8][64]
Definition: des.c:146
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
#define WIN_MASK_ALGO_VER_MAJOR
Definition: seq_masker.cpp:58
USING_SCOPE(objects)
#define WIN_MASK_ALGO_NAME
Definition: seq_masker.cpp:57
#define WIN_MASK_ALGO_VER_PATCH
Definition: seq_masker.cpp:60
#define WIN_MASK_ALGO_VER_MINOR
Definition: seq_masker.cpp:59
Structure containing information about optimization parameters used.
Uint4 * cba_
Bit array with zeroes where all corresponding units have counts below t_extend.
Uint4 start
Start of the interval.
Definition: seq_masker.hpp:208
Uint4 end
End of the interval.
Definition: seq_masker.hpp:209
mitem(Uint4 start, Uint4 end, Uint1 unit_size, const objects::CSeqVector &data, const CSeqMasker &owner)
Object constructor.
Definition: seq_masker.cpp:472
double avg
Average score of the units in the interval.
Definition: seq_masker.hpp:210
#define N
Definition: crc32.c:57
Modified on Mon Jul 15 05:33:46 2024 by modify_doxy.py rev. 669887