NCBI C++ ToolKit
win_mask_dup_table.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_dup_table.cpp 48619 2011-02-10 17:03:08Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of CheckDuplicates() function.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <vector>
35 #include <string>
36 #include <map>
37 
38 #include <corelib/ncbitype.h>
39 #include <corelib/ncbistre.hpp>
40 #include <objects/seq/Bioseq.hpp>
42 
44 #include <objmgr/scope.hpp>
46 #include <objmgr/bioseq_ci.hpp>
47 #include <objmgr/seq_vector.hpp>
48 #include <objmgr/util/sequence.hpp>
49 
52 
55 
56 
57 
58 const Uint4 SAMPLE_LENGTH = 100; /**<\internal length of a sample segment */
59 const Uint4 SAMPLE_SKIP = 10000; /**<\internal distance between subsequent samples */
60 const Uint4 MIN_SEQ_LENGTH = 50000; /**<\internal sequences of length below MIN_SEQ_LENGTH are not considered for
61  duplication check */
62 const Uint4 MAX_OFFSET_ERROR = 5; /**<\internal fuzziness in distance between consequtive matches */
63 const Uint4 MIN_MATCH_COUNT = 4; /**<\internal minimal number of successful consequtive sample matches that
64  defines a duplication */
65 
66 //------------------------------------------------------------------------------
67 /**\internal
68  **\brief This class represents a lookup table used to check incoming
69  ** sequences for duplication.
70  **/
72 {
73 public:
74 
75  /**\internal
76  **\brief Structure describing a location of a sample in a sequence.
77  **/
78  struct sample_loc
79  {
80  size_t seqnum; /**<\internal sequence number (in the order the sequences appear in the input) */
81  Uint4 offset; /**<\internal offset of the sample in the sequence defined by seqnum */
82 
83  /**\internal
84  **\brief Instance constructor.
85  **
86  **\param new_seqnum the sequence number
87  **\param new_offset the sample offset
88  **/
89  sample_loc( size_t new_seqnum, Uint4 new_offset )
90  : seqnum( new_seqnum ), offset( new_offset ) {}
91  };
92 
93  /**\internal
94  **\brief Class describing the data about a particular sample value.
95  **
96  ** A sample is a segment of sequence data of length SAMPLE_LENGTH.
97  ** This class encapsulates information about all occurances of the
98  ** given sample value in the sequences that have been read so far.
99  **/
100  class sample
101  {
102  private:
103 
104  /**\internal \brief Type for lists of sample locations. */
105  typedef vector< sample_loc > loc_list_type;
106 
107  public:
108 
109  /**\internal
110  **\brief Iterator type to traverse lists of sample locations.
111  **/
112  typedef loc_list_type::const_iterator iterator;
113 
114  /**\internal
115  **\brief Add a new sample location.
116  **
117  **\param loc new location
118  **/
119  void add_loc( const sample_loc & loc )
120  { locs.push_back( loc ); }
121 
122  /**\internal\name Location traversal.*/
123  /**@{*/
124  const iterator begin() const { return locs.begin(); }
125  const iterator end() const { return locs.end(); }
126  /**@}*/
127 
128  private:
129 
130  loc_list_type locs; /**<\internal the list of sample locations */
131  };
132 
133 private:
134 
135  /**\internal
136  **\brief Type representing a list of sequence id strings for all
137  ** sequences that have been read so far.
138  **/
139  typedef vector< string > seq_id_list_type;
140 
141  /**\internal
142  **\brief A type mapping sample strings to the information about their
143  ** occurances in the input.
144  **/
146 
147 public:
148 
149  /**\internal \brief Alias for sample::iterator. */
151 
152  /**\internal
153  **\brief Augment the data structure with the information about the
154  ** new sequence.
155  **
156  **\param seq_id id string for the new sequence
157  **\param seq_data new sequence data in IUPACNA format
158  **/
159  void add_seq_info( const string & seq_id,
160  const objects::CSeqVector & seq_data );
161 
162  /**\internal
163  **\brief Get the sequence id string from the sequence number.
164  **
165  **\param seqnum the sequence number
166  **\return the sequence id string in FASTA format
167  **/
168  const string seqid( Uint4 seqnum ) const
169  { return seq_id_list[seqnum]; }
170 
171  /**\internal
172  **\brief Get the information about positions of the sample in the
173  ** data base from the sample value.
174  **
175  **\param index the sample value
176  **\return pointer to the corresponding instance of the sample class
177  **/
178  const sample * operator[]( const string & index ) const
179  {
181  return i == sample_set.end() ? 0 : &(i->second);
182  }
183 
184 private:
185 
186  /**\internal
187  **\brief Add a sample location to the sample information structure.
188  **
189  **\param sample the sample value
190  **\param loc the sample location description
191  **/
192  void add_loc( const string & sample, const sample_loc & loc )
193  { sample_set[sample].add_loc( loc ); }
194 
195  seq_id_list_type seq_id_list; /**<\internal the list of sequence id strings */
196  sample_set_type sample_set; /**<\internal the sample->sample information map */
197 };
198 
199 //------------------------------------------------------------------------------
200 /**\internal
201  **\brief "Less than" comparison between two sample locations.
202  **
203  ** Sample locations are compared lexicographically on the (seqnum, offset)
204  ** pair.
205  **
206  **\param lhs the first sample location
207  **\param rhs the second sample location
208  **\return true if lhs < rhs; false otherwise
209  **/
210 inline bool operator<( const dup_lookup_table::sample_loc & lhs,
211  const dup_lookup_table::sample_loc & rhs )
212 {
213  return lhs.seqnum < rhs.seqnum ? true :
214  lhs.seqnum > rhs.seqnum ? false :
215  lhs.offset < rhs.offset ? true : false;
216 }
217 
218 //------------------------------------------------------------------------------
219 /**\internal
220  **\brief "Greater than" comparison of two sample locations.
221  **
222  ** Defined as: a > b iff b < a.
223  **
224  **\param lhs the first sample location
225  **\param rhs the second sample location
226  **\return true if lhs > rhs; false otherwise
227  **/
228 inline bool operator>( const dup_lookup_table::sample_loc & lhs,
229  const dup_lookup_table::sample_loc & rhs )
230 { return rhs < lhs; }
231 
232 //------------------------------------------------------------------------------
233 /**\internal
234  **\brief Comparisong of two sample locations for equality.
235  **
236  ** Defined as !(lhs < rhs) and !(rhs < lhs).
237  **
238  **\param lhs the first sample location
239  **\param rhs the second sample location
240  **\return true if lhs == rhs; false otherwise
241  **/
242 inline bool operator==( const dup_lookup_table::sample_loc & lhs,
243  const dup_lookup_table::sample_loc & rhs )
244 { return !(lhs < rhs) && !(rhs < lhs); }
245 
246 //------------------------------------------------------------------------------
247 void dup_lookup_table::add_seq_info( const string & seq_id,
248  const objects::CSeqVector & seq_data )
249 {
250  static TSeqPos next_offset( 0 );
251 
252  seq_id_list.push_back( seq_id );
253  TSeqPos data_len( seq_data.size() );
254 
255  string sample;
256  while( next_offset < data_len - SAMPLE_LENGTH )
257  {
258  sample.erase();
259  seq_data.GetSeqData(next_offset, next_offset + SAMPLE_LENGTH, sample);
260  sample_loc loc( seq_id_list.size() - 1, next_offset );
261  add_loc( sample, loc );
262  next_offset += SAMPLE_SKIP;
263  }
264 
265  next_offset = (next_offset <= data_len) ? 0 : next_offset - data_len;
266 }
267 
268 //------------------------------------------------------------------------------
269 /**\internal
270  **\brief This class encapsulates the state of duplication search process.
271  **
272  ** An instance of this class is created for each subject sequence to search
273  ** for duplicates among the sequences that were processed earlier.
274  **/
275 class tracker
276 {
277 public:
278 
279  /**\internal \brief Alias for the sample location description type. */
281 
282 private:
283 
284  /**\internal
285  **\brief Alias for the sample location information iterator type.
286  **/
288 
289  /**\internal
290  **\brief Type representing a possible match currently being tracked
291  ** by the tracker instance.
292  **/
293  struct result
294  {
295  Uint4 count; /**<\internal current number of consequtive sample matches */
296  sample_loc loc; /**<\internal information about query location of the last sample match */
297  string::size_type s_offset; /**<\internal location in the subject of the last sample match */
298 
299  /**\internal
300  **\brief Object constructor.
301  **
302  **\param newloc location in the query
303  **\param new_offset location in the subject
304  **\param new_count initial value of match count
305  **/
306  result( const sample_loc & newloc,
307  string::size_type new_offset,
308  Uint4 new_count = 1 )
309  : count( new_count ), loc( newloc ), s_offset( new_offset ) {}
310  };
311 
312  /**\internal
313  **\brief type used to store the set of currently tracked matches.
314  **/
315  typedef vector< result > result_list_type;
316 
317 public:
318 
319  /**\internal
320  **\brief Object constructor.
321  **
322  **\param the_table the lookup table to search against
323  **\param the_subject_id the id string for the subject sequence
324  **/
325  tracker( const dup_lookup_table & the_table, const string & the_subject_id )
326  : table( the_table ), subject_id( the_subject_id ) {}
327 
328  /**\internal \brief Object destructor. */
329  ~tracker();
330 
331  /**\internal
332  **\brief Process a set of matches to the lookup table.
333  **
334  ** The list of matches is given by the pair of iterators. For the
335  ** current set of results determines which could be extended. The
336  ** ones that can not be extended and whose subject offset is too
337  ** far in the past are deleted. The location in [start, end) that
338  ** do not extend existing matches start new matches.
339  **
340  **\param index the sample sequence at the current subject offset
341  **\param seqnum the sequence number of the current sequence
342  **\param subject_offset current position in the subject sequence
343  **\param start start of the list of matches to the lookup table
344  **\param end end of the list of matches to the lookup table
345  **/
346  void operator()( const string & index, Uint4 seqnum,
347  string::size_type subject_offset,
348  iterator start, iterator end );
349 
350 private:
351 
352  const dup_lookup_table & table; /**<\internal lookup table to use */
353  const string & subject_id; /**<\internal id string of the current subject sequence */
354 
355  /**\internal
356  **\brief Report a candidate for duplicate sequence pair to the
357  ** standard error.
358  **
359  **\param queryseq number of the query sequence
360  **\param match_count number consequtive sample matches detected
361  **\param s_off last position in the subject sequence
362  **\param q_off last position in the query sequence
363  **/
364  void report_match( Uint4 queryseq,
365  Uint4 match_count,
366  string::size_type s_off,
367  string::size_type q_off );
368 
369  result_list_type main_list; /**<\internal current result list */
370  result_list_type aux_list; /**<\internal additional (helper) result list */
371 };
372 
373 //------------------------------------------------------------------------------
374 void tracker::report_match( Uint4 queryseq, Uint4 match_count,
375  string::size_type s_off,
376  string::size_type q_off )
377 {
378  string query_id( table.seqid( queryseq ) );
379  LOG_POST( Warning <<
380  "Possible duplication of sequences:\n"
381  << "subject: " << subject_id << " and query: " << query_id << "\n"
382  << "at intervals\n"
383  << "subject: " << s_off - match_count*SAMPLE_SKIP
384  << " --- " << s_off - SAMPLE_SKIP << "\n"
385  << "query : " << q_off - match_count*SAMPLE_SKIP
386  << " --- " << q_off - SAMPLE_SKIP << "\n" );
387 }
388 
389 //------------------------------------------------------------------------------
391 {
392  typedef result_list_type::const_iterator r_iterator;
393 
394  r_iterator riter( main_list.begin() );
395  r_iterator rend( main_list.end() );
396 
397  while( riter != rend )
398  {
399  if( riter->count >= MIN_MATCH_COUNT )
400  report_match( riter->loc.seqnum, riter->count,
401  riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
402 
403  ++riter;
404  }
405 }
406 
407 //------------------------------------------------------------------------------
408 void tracker::operator()( const string & index,
409  Uint4 seqnum,
410  string::size_type subject_offset,
413 {
414  typedef result_list_type::const_iterator r_iterator;
416 
417  r_iterator riter( main_list.begin() );
418  r_iterator rend( main_list.end() );
419 
420  bool do_swap( iter == end ? false : true );
421 
422  while( true )
423  if( riter == rend )
424  if( iter == end )
425  break;
426  else
427  {
428  aux_list.push_back( result( sample_loc( iter->seqnum,
429  iter->offset + SAMPLE_SKIP ),
430  subject_offset ) );
431  ++iter;
432  }
433  else if( iter == end )
434  {
435  if( riter->s_offset + SAMPLE_SKIP + MAX_OFFSET_ERROR < subject_offset )
436  {
437  if( riter->count >= MIN_MATCH_COUNT )
438  report_match( riter->loc.seqnum, riter->count,
439  riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
440  }
441  else aux_list.push_back( *riter );
442 
443  ++riter;
444  }
445  else // both iter and riter are valid
446  {
447  if( *iter < riter->loc )
448  {
449  aux_list.push_back( result( sample_loc( iter->seqnum,
450  iter->offset + SAMPLE_SKIP ),
451  subject_offset ) );
452  ++iter;
453  }
454  else if( *iter > riter->loc )
455  {
456  if( riter->s_offset + SAMPLE_SKIP + MAX_OFFSET_ERROR < subject_offset )
457  {
458  if( riter->count >= MIN_MATCH_COUNT )
459  report_match( riter->loc.seqnum, riter->count,
460  riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
461  }
462  else aux_list.push_back( *riter );
463 
464  ++riter;
465  }
466  else // *iter == riter->loc --- same sequence and corresponding offsets
467  {
468  Uint4 count( 1 );
469 
470  while( riter != rend && riter->loc == *iter )
471  {
472  if( subject_offset < riter->s_offset + SAMPLE_SKIP - MAX_OFFSET_ERROR )
473  aux_list.push_back( *riter );
474  else if( subject_offset > riter->s_offset + SAMPLE_SKIP
475  + MAX_OFFSET_ERROR )
476  {
477  if( riter->count >= MIN_MATCH_COUNT )
478  report_match( riter->loc.seqnum, riter->count,
479  riter->s_offset + SAMPLE_SKIP, riter->loc.offset );
480  }
481  else // MATCH!!! Extend it.
482  count = riter->count + 1;
483 
484  ++riter;
485  }
486 
487  aux_list.push_back( result( sample_loc( iter->seqnum,
488  iter->offset + SAMPLE_SKIP ),
489  subject_offset, count ) );
490  ++iter;
491  }
492  }
493 
494  // Swap the lists.
495  if( do_swap )
496  {
497  main_list.clear();
498  main_list.swap( aux_list );
499  }
500 }
501 
502 #if 0
503 //------------------------------------------------------------------------------
504 /**\internal
505  **\brief Get a FASTA formatted id string (the first available) from the
506  ** CSeq_entry structure.
507  **
508  **\param entry sequence description structure
509  **\return the first id string corresponding to entry
510  **/
511 static const string GetIdString( const CSeq_entry & entry )
512 {
514  const CBioseq & seq = entry.GetSeq();
515  CRef<CScope> scope(new CScope(*om));
516  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(
517  const_cast< CSeq_entry & >( entry ) );
518  return CWinMaskSeqTitle::GetId( seh, seq );
519 /*
520  list< CRef< CSeq_id > > idlist = seq.GetId();
521 
522  if( idlist.empty() )
523  return "???";
524  else
525  {
526  CNcbiOstrstream os;
527  (*idlist.begin())->WriteAsFasta( os );
528  return CNcbiOstrstreamToString(os);
529  }
530 */
531 }
532 #endif
533 
534 //------------------------------------------------------------------------------
535 void CheckDuplicates( const vector< string > & input,
536  const string & infmt,
537  const CWinMaskUtil::CIdSet * ids,
538  const CWinMaskUtil::CIdSet * exclude_ids )
539 {
540  typedef vector< string >::const_iterator input_iterator;
541 
544 
545  for( input_iterator i( input.begin() ); i != input.end(); ++i )
546  {
547  Uint4 seqnum( 0 );
548 
549  for(CWinMaskUtil::CInputBioseq_CI bs_iter(*i, infmt); bs_iter; ++bs_iter)
550  {
551  CBioseq_Handle bsh = *bs_iter;
552 
553  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
554  {
555  TSeqPos data_len = bsh.GetBioseqLength();
556  if( data_len < MIN_SEQ_LENGTH )
557  continue;
558 
559  string id;
561  .GetSeqId()->GetLabel(&id);
562  data_len -= SAMPLE_SKIP;
563  tracker track( table, id );
564 
565  string index;
566  CSeqVector data =
568  for( TSeqPos i = 0; i < data_len; ++i )
569  {
570  index.erase();
571  data.GetSeqData(i, i + SAMPLE_LENGTH, index);
572  const dup_lookup_table::sample * sample( table[index] );
573 
574  if( sample != 0 )
575  track( index, seqnum, i, sample->begin(), sample->end() );
576  }
577 
578  table.add_seq_info( id, data );
579  ++seqnum;
580  }
581  }
582  }
583 }
584 
585 
CBioseq_Handle –.
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
Function iterating over bioseqs in input.
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
loc_list_type::const_iterator iterator
vector< sample_loc > loc_list_type
const iterator begin() const
\name Location traversal.
const iterator end() const
void add_loc(const sample_loc &loc)
sample::iterator iterator
sample_set_type sample_set
void add_loc(const string &sample, const sample_loc &loc)
void add_seq_info(const string &seq_id, const objects::CSeqVector &seq_data)
seq_id_list_type seq_id_list
vector< string > seq_id_list_type
const string seqid(Uint4 seqnum) const
map< string, sample > sample_set_type
const sample * operator[](const string &index) const
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
result_list_type main_list
const dup_lookup_table & table
dup_lookup_table::sample_loc sample_loc
void operator()(const string &index, Uint4 seqnum, string::size_type subject_offset, iterator start, iterator end)
vector< result > result_list_type
dup_lookup_table::iterator iterator
const string & subject_id
tracker(const dup_lookup_table &the_table, const string &the_subject_id)
result_list_type aux_list
void report_match(Uint4 queryseq, Uint4 match_count, string::size_type s_off, string::size_type q_off)
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TSeqPos GetBioseqLength(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
static int input()
int i
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Defines Limits for the types used in NCBI C/C++ toolkit.
The Object manager core.
CRef< objects::CObjectManager > om
sample_loc(size_t new_seqnum, Uint4 new_offset)
result(const sample_loc &newloc, string::size_type new_offset, Uint4 new_count=1)
string::size_type s_offset
else result
Definition: token2.c:20
USING_SCOPE(objects)
bool operator==(const dup_lookup_table::sample_loc &lhs, const dup_lookup_table::sample_loc &rhs)
void CheckDuplicates(const vector< string > &input, const string &infmt, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids)
Check for possibly duplicate sequences in the input.
const Uint4 MIN_SEQ_LENGTH
bool operator>(const dup_lookup_table::sample_loc &lhs, const dup_lookup_table::sample_loc &rhs)
const Uint4 SAMPLE_SKIP
bool operator<(const dup_lookup_table::sample_loc &lhs, const dup_lookup_table::sample_loc &rhs)
const Uint4 MAX_OFFSET_ERROR
const Uint4 MIN_MATCH_COUNT
const Uint4 SAMPLE_LENGTH
Modified on Sun Jul 14 05:02:02 2024 by modify_doxy.py rev. 669887