NCBI C++ ToolKit
symdust.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: symdust.cpp 91947 2020-12-17 12:52:59Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation file for CSymDustMasker class.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 
39 //------------------------------------------------------------------------------
41  size_type window, Uint1 low_k,
42  perfect_list_type & perfect_list, thres_table_type & thresholds )
43  : start_( 0 ), stop_( 0 ), max_size_( window - 2 ), low_k_( low_k ),
44  L( 0 ), P( perfect_list ), thresholds_( thresholds ),
45  r_w( 0 ), r_v( 0 ), num_diff( 0 )
46 {
47  std::fill( c_w, c_w + 64, 0 );
48  std::fill( c_v, c_v + 64, 0 );
49 }
50 
51 //------------------------------------------------------------------------------
53 {
54  triplet_type s = triplet_list_.back();
55  triplet_list_.pop_back();
56  rem_triplet_info( r_w, c_w, s );
57  if( c_w[s] == 0 ) --num_diff;
58  ++start_;
59 
60  triplet_list_.push_front( t );
61  if( c_w[t] == 0 ) ++num_diff;
62  add_triplet_info( r_w, c_w, t );
63  ++stop_;
64 
65  if( num_diff <= 1 ) {
66  P.insert( P.begin(), perfect( start_, stop_ + 1, 0, 0 ) );
67  return false;
68  }
69  else return true;
70 }
71 
72 //------------------------------------------------------------------------------
74 {
75  // shift the left end of the window, if necessary
76  if( triplet_list_.size() >= max_size_ ) {
77  if( num_diff <= 1 ) {
78  return shift_high( t );
79  }
80 
81  triplet_type s = triplet_list_.back();
82  triplet_list_.pop_back();
83  rem_triplet_info( r_w, c_w, s );
84  if( c_w[s] == 0 ) --num_diff;
85 
86  if( L == start_ ) {
87  ++L;
88  rem_triplet_info( r_v, c_v, s );
89  }
90 
91  ++start_;
92  }
93 
94  triplet_list_.push_front( t );
95  if( c_w[t] == 0 ) ++num_diff;
96  add_triplet_info( r_w, c_w, t );
97  add_triplet_info( r_v, c_v, t );
98 
99  if( c_v[t] > low_k_ ) {
100  Uint4 off = triplet_list_.size() - (L - start_) - 1;
101 
102  do {
103  rem_triplet_info( r_v, c_v, triplet_list_[off] );
104  ++L;
105  }while( triplet_list_[off--] != t );
106  }
107 
108  ++stop_;
109 
110  if( triplet_list_.size() >= max_size_ && num_diff <= 1 ) {
111  P.clear();
112  P.insert( P.begin(), perfect( start_, stop_ + 1, 0, 0 ) );
113  return false;
114  }else return true;
115 }
116 
117 //------------------------------------------------------------------------------
119 {
120  typedef perfect_list_type::iterator perfect_iter_type;
121  counts_type counts;
122 
123  Uint4 count = stop_ - L; // count is the suffix length
124 
125  // we need a local copy of triplet counts
126  std::copy( c_v, c_v + 64, counts );
127 
128  Uint4 score = r_v; // and of the partial sum
129  perfect_iter_type perfect_iter = P.begin();
130  Uint4 max_perfect_score = 0;
131  size_type max_len = 0;
132  size_type pos = L - 1; // skipping the suffix
133  impl_citer_type it = triplet_list_.begin() + count; // skipping the suffix
134  impl_citer_type iend = triplet_list_.end();
135 
136  for( ; it != iend; ++it, ++count, --pos ) {
137  Uint1 cnt = counts[*it];
138  add_triplet_info( score, counts, *it );
139 
140  if( cnt > 0 && score*10 > thresholds_[count] ) {
141  // found the candidate for the perfect interval
142  // get the max score for the existing perfect intervals within
143  // current suffix
144  while( perfect_iter != P.end()
145  && pos <= perfect_iter->bounds_.first ) {
146  if( max_perfect_score == 0
147  || max_len*perfect_iter->score_
148  > max_perfect_score*perfect_iter->len_ ) {
149  max_perfect_score = perfect_iter->score_;
150  max_len = perfect_iter->len_;
151  }
152 
153  ++perfect_iter;
154  }
155 
156  // check if the current suffix score is at least as large
157  if( max_perfect_score == 0
158  || score*max_len >= max_perfect_score*count ) {
159  max_perfect_score = score;
160  max_len = count;
161  perfect_iter = P.insert(
162  perfect_iter, perfect( pos, stop_ + 1,
163  max_perfect_score, count ) );
164  }
165  }
166  }
167 }
168 
169 //------------------------------------------------------------------------------
171  Uint4 level, size_type window, size_type linker )
172  : level_( (level >= 2 && level <= 64) ? level : DEFAULT_LEVEL ),
173  window_( (window >= 8 && window <= 64) ? window : DEFAULT_WINDOW ),
174  linker_( (linker >= 1 && linker <= 32) ? linker : DEFAULT_LINKER ),
175  low_k_( level_/5 )
176 {
177  thresholds_.reserve( window_ - 2 );
178  thresholds_.push_back( 1 );
179 
180  for( size_type i = 1; i < window_ - 2; ++i )
181  thresholds_.push_back( i*level_ );
182 }
183 
184 //------------------------------------------------------------------------------
186  TMaskList & res, size_type wstart, size_type start )
187 {
188  if( !P.empty() ) {
189  TMaskedInterval b = P.back().bounds_;
190 
191  if( b.first < wstart ) {
192  TMaskedInterval b1( b.first + start, b.second + start );
193 
194  if( !res.empty() ) {
195  size_type s = res.back().second;
196 
197  if( s + linker_ >= b1.first ) {
198  res.back().second = max( s, b1.second );
199  }else {
200  res.push_back( b1 );
201  }
202  }else {
203  res.push_back( b1 );
204  }
205 
206  while( !P.empty() && P.back().bounds_.first < wstart ) {
207  P.pop_back();
208  }
209  }
210  }
211 }
212 
213 //------------------------------------------------------------------------------
214 std::unique_ptr< CSymDustMasker::TMaskList >
216  size_type start, size_type stop )
217 {
218  std::unique_ptr< TMaskList > res( new TMaskList );
219 
220  if( seq.empty() )
221  return res;
222 
223  if( stop >= seq.size() )
224  stop = seq.size() - 1;
225 
226  if( start > stop )
227  start = stop;
228 
229  while( stop > 2 + start ) // there must be at least one triplet
230  {
231  // initializations
232  P.clear();
234 
235  seq_citer_type it(seq, start);
236 
237  char c1 = *it, c2 = *++it;
238  triplet_type t = (converter_( c1 )<<2) + converter_( c2 );
239 
240  it.SetPos(start + w.stop() + 2);
241 
242  bool done = false;
243  while( !done && it.GetPos() <= stop )
244  {
245  save_masked_regions( *res.get(), w.start(), start );
246 
247  // shift the window
248  t = ((t<<2)&TRIPLET_MASK) + (converter_( *it )&0x3);
249  ++it;
250 
251  if( w.shift_window( t ) ) {
252  if( w.needs_processing() ) {
253  w.find_perfect();
254  }
255  }else {
256  while( it.GetPos() <= stop ) {
257  save_masked_regions( *res.get(), w.start(), start );
258  t = ((t<<2)&TRIPLET_MASK) + (converter_( *it )&0x3);
259 
260  if( w.shift_window( t ) ) {
261  done = true;
262  break;
263  }
264 
265  ++it;
266  }
267  }
268  }
269 
270  // append the rest of the perfect intervals to the result
271  {
272  size_type wstart = w.start();
273 
274  while( !P.empty() ) {
275  save_masked_regions( *res.get(), wstart, start );
276  ++wstart;
277  }
278  }
279 
280  if( w.start() > 0 ) start += w.start();
281  else break;
282  }
283 
284  return res;
285 }
286 
287 //------------------------------------------------------------------------------
288 std::unique_ptr< CSymDustMasker::TMaskList >
290 { return (*this)( seq, 0, seq.size() - 1 ); }
291 
292 //------------------------------------------------------------------------------
294  objects::CSeq_id & seq_id,
295  const sequence_type & seq,
296  std::vector< CConstRef< objects::CSeq_loc > > & locs )
297 {
298  // typedef std::vector< CConstRef< objects::CSeq_loc > > locs_type;
299  std::unique_ptr< TMaskList > res = (*this)( seq );
300  locs.clear();
301  locs.reserve( res->size() );
302 
303  for( TMaskList::const_iterator it = res->begin(); it != res->end(); ++it )
304  locs.push_back( CConstRef< objects::CSeq_loc >(
305  new objects::CSeq_loc( seq_id, it->first, it->second ) ) );
306 }
307 
308 //------------------------------------------------------------------------------
310  objects::CSeq_id & seq_id, const sequence_type & seq )
311 {
312  CRef< objects::CPacked_seqint > result( new objects::CPacked_seqint );
313  std::unique_ptr< TMaskList > res = (*this)( seq );
314 
315  for( TMaskList::const_iterator it = res->begin(); it != res->end(); ++it )
316  result->AddInterval( seq_id, it->first, it->second );
317 
318  return result;
319 }
320 
size_type stop() const
Definition: symdust.hpp:217
bool shift_high(triplet_type t)
Definition: symdust.cpp:52
bool shift_window(triplet_type t)
Definition: symdust.cpp:73
size_type start() const
Definition: symdust.hpp:216
bool needs_processing() const
Definition: symdust.hpp:251
triplets(size_type window, Uint1 low_k, perfect_list_type &perfect_list, thres_table_type &thresholds)
Definition: symdust.cpp:40
impl_type::const_iterator impl_citer_type
Definition: symdust.hpp:263
CSymDustMasker(Uint4 level=DEFAULT_LEVEL, size_type window=static_cast< size_type >(DEFAULT_WINDOW), size_type linker=static_cast< size_type >(DEFAULT_LINKER))
Object constructor.
Definition: symdust.cpp:170
Uint1 triplet_type
Type representing a triplet value.
Definition: symdust.hpp:137
std::list< perfect > perfect_list_type
Type representing a list of perfect intervals.
Definition: symdust.hpp:133
static const Uint4 DEFAULT_LEVEL
Default value of score threshold.
Definition: symdust.hpp:103
perfect_list_type P
Definition: symdust.hpp:328
sequence_type::const_iterator seq_citer_type
Definition: symdust.hpp:194
std::pair< size_type, size_type > TMaskedInterval
Type respresenting an interval selected for masking.
Definition: symdust.hpp:99
std::vector< Uint4 > thres_table_type
Table type to store score sum thresholds for each window length.
Definition: symdust.hpp:135
static const triplet_type TRIPLET_MASK
Selects the significant bits in triplet_type.
Definition: symdust.hpp:140
convert_t converter_
Definition: symdust.hpp:331
CRef< objects::CPacked_seqint > GetMaskedInts(objects::CSeq_id &seq_id, const sequence_type &seq)
Mask a sequence and return result as a CPacked_seqint instance.
Definition: symdust.cpp:309
std::unique_ptr< TMaskList > operator()(const sequence_type &seq)
Mask a sequence.
Definition: symdust.cpp:289
size_type linker_
Definition: symdust.hpp:324
void GetMaskedLocs(objects::CSeq_id &seq_id, const sequence_type &seq, std::vector< CConstRef< objects::CSeq_loc > > &locs)
Mask a sequence and return result as a sequence of CSeq_loc objects.
Definition: symdust.cpp:293
static const Uint4 DEFAULT_LINKER
Default value of the longest distance between consequtive masked intervals at which they should be me...
Definition: symdust.hpp:105
size_type window_
Definition: symdust.hpp:323
static const Uint4 DEFAULT_WINDOW
Default window size.
Definition: symdust.hpp:104
void save_masked_regions(TMaskList &res, size_type w, size_type start)
Definition: symdust.cpp:185
thres_table_type thresholds_
Definition: symdust.hpp:329
seq_t sequence_type
Public sequence type.
Definition: symdust.hpp:95
sequence_type::size_type size_type
Integer size type corresponding to sequence_type.
Definition: symdust.hpp:97
std::vector< TMaskedInterval > TMaskList
Type representing a list of masked intervals.
Definition: symdust.hpp:101
#define P(a, b)
Definition: sqlwparams.h:19
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
int i
EIPRangeType t
Definition: ncbi_localip.c:101
T max(T x_, T y_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static unsigned cnt[256]
done
Definition: token1.c:1
else result
Definition: token2.c:20
Modified on Wed Apr 24 14:10:36 2024 by modify_doxy.py rev. 669887