57 #define WIN_MASK_ALGO_NAME "window-masker-algorithm"
58 #define WIN_MASK_ALGO_VER_MAJOR 1
59 #define WIN_MASK_ALGO_VER_MINOR 0
60 #define WIN_MASK_ALGO_VER_PATCH 0
72 Uint1 arg_window_size,
73 Uint4 arg_window_step,
76 Uint4 arg_cutoff_score,
79 Uint4 arg_set_max_score,
80 Uint4 arg_set_min_score,
82 Uint4 arg_merge_cutoff_score,
83 Uint4 arg_abs_merge_cutoff_dist,
84 Uint4 arg_mean_merge_cutoff_dist,
85 Uint1 arg_merge_unit_step,
86 const string & arg_trigger,
108 window_size( arg_window_size ), window_step( arg_window_step ),
109 unit_step( arg_unit_step ),
110 merge_pass( arg_merge_pass ),
111 merge_cutoff_score( arg_merge_cutoff_score ),
112 abs_merge_cutoff_dist( arg_abs_merge_cutoff_dist ),
113 mean_merge_cutoff_dist( arg_mean_merge_cutoff_dist ),
114 merge_unit_step( arg_merge_unit_step ),
115 trigger( arg_trigger ==
"mean" ? eTrigger_Mean
117 discontig( arg_discontig ), pattern( arg_pattern )
121 if( window_size < ustat->UnitSize() ) {
122 std::ostringstream os;
124 "must be greater or equal to unit size (" <<
179 unique_ptr<CSeqMaskerWindow> window_ptr
191 Uint4 start = 0, end = 0, cend = 0;
192 Uint4 limit = textend;
200 Uint4 ts = (*trigger_score)();
201 Uint4 s = (*score)();
208 if( window.
Start() > cend )
211 start = end = cend = 0;
215 if( od != 0 && od->
cba_ != 0 )
217 adv = window.
Start();
219 if( !booster.
Check() )
225 else if( ts < cutoff_score )
229 if( window.
Start() > cend + 1 )
232 start = end = cend = 0;
234 else cend = window.
End();
241 if( window.
Start() > cend + 1 )
244 start = window.
Start();
247 else start = window.
Start();
249 cend = end = window.
End();
269 if(
mask->size() < 2 )
return mask.release();
272 TMaskList::iterator jtmp =
mask->end();
275 for( TMaskList::iterator
i =
mask->begin(), j = --jtmp;
278 masked.push_back(
mitem(
i->first,
i->second, unit_size,
280 Uint4 nstart = (
i++)->second - unit_size + 2;
281 unmasked.push_back(
mitem( nstart,
i->first + unit_size - 2,
282 unit_size,
data, *
this ) );
285 masked.push_back(
mitem( (
mask->rbegin())->first,
286 (
mask->rbegin())->second,
287 unit_size,
data, *
this ) );
291 TMList::iterator ii = masked.begin();
292 TMList::iterator j = unmasked.begin();
293 TMList::iterator k = ii,
l = ii;
296 for( ; ii != masked.end(); k =
l = ii, --k, ++
l )
298 Uint4 ldist = (ii != masked.begin())
299 ? ii->start - k->end - 1 : 0;
300 TMList::iterator tmpend = masked.end();
302 Uint4 rdist = (ii != tmpend)
303 ?
l->start - ii->end - 1 : 0;
304 double lavg = 0.0, ravg = 0.0;
305 bool can_go_left =
count && ldist
307 bool can_go_right = rdist
312 TMList::iterator
tmp = j; --
tmp;
319 ravg =
MergeAvg( ii, j, unit_size );
336 k->avg =
MergeAvg( k, --j, unit_size );
338 << k->start <<
" - " << k->end
340 << ii->start <<
" - " << ii->end );
341 Merge( masked, k, unmasked, j );
359 else if( can_go_left )
362 k->avg =
MergeAvg( k, --j, unit_size );
364 << k->start <<
" - " << k->end
366 << ii->start <<
" - " << ii->end );
367 Merge( masked, k, unmasked, j );
385 for( ii = masked.begin(), j = unmasked.begin(), k = ii++;
386 ii != masked.end(); (k = ii++), j++ )
390 _TRACE(
"Unconditionally merging "
391 << k->start <<
" - " << k->end
393 << ii->start <<
" - " << ii->end );
394 k->avg =
MergeAvg( k, j, unit_size );
395 Merge( masked, k, unmasked, j );
398 if( ++ii == masked.end() )
break;
404 for( TMList::const_iterator iii = masked.begin(); iii != masked.end(); ++iii )
408 return mask.release();
413 const TMList::iterator & umi,
414 Uint4 unit_size )
const
416 TMList::iterator
tmp = mi++;
421 double a1 =
tmp->avg, a2 = umi->avg, a3 = mi->avg;
422 return (a1*n1 + a2*n2 + a3*n3)/
N;
427 TMList & um, TMList::iterator & umi )
const
429 TMList::iterator
tmp = mi++;
432 umi = um.erase( umi );
442 return "can not open input stream";
446 return "syntax error";
450 return "the following parameters could not be determined"
451 " from the unit frequency database or command line: ";
455 return "score function object allocation failed";
459 return "merge pass score function object allocation failed";
463 return "validation error";
474 : start( arg_start ), end( arg_end ), avg( 0.0 )
496 while( window->
End() <
end )
513 TMaskList::const_iterator
si( src->begin() );
514 TMaskList::const_iterator send( src->end() );
515 TMaskList::iterator di( dest->begin() );
516 TMaskList::iterator dend( dest->end() );
521 if( di != dend && di->first <
si->first )
529 if(
si->first < di->first ) {
537 }
else if( di != dend ) {
543 if( seg.second + 1 < next_seg.first ) {
544 res.push_back( seg );
547 else if( seg.second < next_seg.second ) {
548 seg.second = next_seg.second;
552 res.push_back( seg );
Interface to the bit array used to check if the score of a unit is below t_extend.
bool Check()
Check the current state of the window and advance.
Factory class to generate an appropriate CSeqMaskerIstat derived class based on the format name.
CSeqMaskerWindow::TUnit AmbigUnit() const
Get the value of the unit used to represent an ambiguity.
Uint4 get_textend() const
Get the value of T_extend.
virtual Uint1 UnitSize() const =0
Get the unit size.
const optimization_data * get_optimization_data() const
Get the data structure optimization parameters.
Uint4 get_threshold() const
Get the value of T_threshold.
Average unit score from the start of the sequence to the end of the current window.
Score function object computing mean of unit in a window.
The score function object that computes maxmin of k consecutive units in a window.
Abstract base class for score function objects.
virtual void PreAdvance(Uint4 step)=0
Window advancement notification.
void SetWindow(const CSeqMaskerWindow &new_window)
Set the window object that should be used for score computation.
virtual void PostAdvance(Uint4 step)=0
Window advancement notification.
static Uint1 BitCount(Uint4 mask, Uint1 bit_value=1)
Count the bits with given value in a given bit pattern.
Windows with units that may contain ambiguities.
Window iterator for discontiguous units used for the merging pass.
Window iterator used for discontiguous units.
Sliding window skipping over the ambiguities.
Uint4 Step() const
Get the current value of the window step.
Uint4 End() const
Get the current ending position of the window.
Uint4 Start() const
Get the current starting position of the window.
Uint4 TUnit
Integer type used to represent units within a window.
Represents different error situations that can occur in the masking process.
@ eValidation
Inconsistent internal parameters.
@ eLstatSyntax
Error parsing the length statistics file.
@ eLstatParam
Error deducing parameters from lstat or command line.
@ eScoreAllocFail
Error allocating the score function object.
@ eLstatStreamIpenFail
Error opening the length statistics file.
@ eScoreP3AllocFail
Error allocating the score function object for merging pass.
virtual const char * GetErrCodeString() const override
Get the exception description string.
Main interface to window based masker functionality.
void Merge(TMList &m, TMList::iterator mi, TMList &um, TMList::iterator &umi) const
~CSeqMasker()
Object destructor.
static void MergeMaskInfo(TMaskList *dest, const TMaskList *src)
Merge together two result lists.
@ eTrigger_Min
Using the min score of k units in the window.
CSeqMaskerScore * trigger_score
Uint4 abs_merge_cutoff_dist
CSeqMaskerScore * score_p3
pair< TSeqPos, TSeqPos > TMaskedInterval
Type representing a masked interval within a sequence.
Uint4 mean_merge_cutoff_dist
TMaskList * DoMask(const objects::CSeqVector &data, TSeqPos start, TSeqPos end) const
vector< TMaskedInterval > TMaskList
A type representing the total of masking information about a sequence.
TMaskList * operator()(const objects::CSeqVector &data) const
Sequence masking operator.
double MergeAvg(TMList::iterator mi, const TMList::iterator &umi, Uint4 unit_size) const
enum CSeqMasker::@32 trigger
static CSeqMaskerVersion AlgoVersion
Version of window masking algorithm.
CRef< CSeqMaskerIstat > ustat
CSeqMasker(const string &lstat_name, Uint1 arg_window_size, Uint4 arg_window_step, Uint1 arg_unit_step, Uint4 arg_textend, Uint4 arg_cutoff_score, Uint4 arg_max_score, Uint4 arg_min_score, Uint4 arg_set_max_score, Uint4 arg_set_min_score, bool arg_merge_pass, Uint4 arg_merge_cutoff_score, Uint4 arg_abs_merge_cutoff_dist, Uint4 arg_mean_merge_cutoff_dist, Uint1 arg_merge_unit_step, const string &arg_trigger, Uint1 tmin_count, bool arg_discontig, Uint4 arg_pattern, bool arg_use_ba, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Object constructor.
static const char si[8][64]
unsigned int TSeqPos
Type for sequence locations and lengths.
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
unsigned int
A callback function used to compare two keys in a database.
#define WIN_MASK_ALGO_VER_MAJOR
#define WIN_MASK_ALGO_NAME
#define WIN_MASK_ALGO_VER_PATCH
#define WIN_MASK_ALGO_VER_MINOR
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure containing information about optimization parameters used.
Uint4 * cba_
Bit array with zeroes where all corresponding units have counts below t_extend.
Uint4 start
Start of the interval.
Uint4 end
End of the interval.
mitem(Uint4 start, Uint4 end, Uint1 unit_size, const objects::CSeqVector &data, const CSeqMasker &owner)
Object constructor.
double avg
Average score of the units in the interval.