NCBI C++ ToolKit
win_mask_gen_counts.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_gen_counts.cpp 98105 2022-09-29 00:28:26Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of CWinMaskCountsGenerator class.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <stdlib.h>
35 
36 #include <vector>
37 #include <sstream>
38 
39 #include <objects/seq/Bioseq.hpp>
40 #include <objects/seq/Seq_inst.hpp>
41 #include <objects/seq/Seq_data.hpp>
43 #include <objects/seq/IUPACna.hpp>
44 
46 #include <objmgr/scope.hpp>
48 #include <objmgr/bioseq_ci.hpp>
49 #include <objmgr/seq_vector.hpp>
50 
52 
57 
60 
61 //------------------------------------------------------------------------------
// Translate a nucleotide character into its 2-bit code:
// a/A -> 0, c/C -> 1, g/G -> 2, t/T -> 3.
// Any other character also yields 0, so callers that need to tell 'A'
// from an unrecognized character must test with ambig() first.
static Uint4 letter( char c )
{
    if( c == 'c' || c == 'C' ) return 1;
    if( c == 'g' || c == 'G' ) return 2;
    if( c == 't' || c == 'T' ) return 3;

    // 'a', 'A' and every unrecognized character.
    return 0;
}
73 
74 //------------------------------------------------------------------------------
// Report whether a character is anything other than one of the four
// unambiguous nucleotide letters (A, C, G, T in either case).
static inline bool ambig( char c )
{
    switch( c )
    {
    case 'a': case 'A':
    case 'c': case 'C':
    case 'g': case 'G':
    case 't': case 'T':
        return false;
    default:
        return true;
    }
}
80 
81 #if 0
82 //------------------------------------------------------------------------------
83 string mkdata( const CSeq_entry & entry )
84 {
85  const CBioseq & bioseq( entry.GetSeq() );
86 
87  if( bioseq.CanGetInst()
88  && bioseq.GetInst().CanGetLength()
89  && bioseq.GetInst().CanGetSeq_data() )
90  {
91  TSeqPos len( bioseq.GetInst().GetLength() );
92  const CSeq_data & seqdata( bioseq.GetInst().GetSeq_data() );
93  unique_ptr< CSeq_data > dest( new CSeq_data );
94  CSeqportUtil::Convert( seqdata, dest.get(), CSeq_data::e_Iupacna,
95  0, len );
96  return dest->GetIupacna().Get();
97  }
98 
99  return string( "" );
100 }
101 #endif
102 
103 //------------------------------------------------------------------------------
104 Uint8 CWinMaskCountsGenerator::fastalen( const string & fname ) const
105 {
106  Uint8 result = 0;
107 
108  for(CWinMaskUtil::CInputBioseq_CI bs_iter(fname, infmt); bs_iter; ++bs_iter)
109  {
110  CBioseq_Handle bsh = *bs_iter;
111 
112  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
113  result += bsh.GetBioseqLength();
114  }
115 
116  return result;
117 }
118 
119 //------------------------------------------------------------------------------
// Body of the file-local reverse_complement( seq, size ) helper: it forwards
// both arguments unchanged to CSeqMaskerUtil::reverse_complement() and
// returns that result.
// NOTE(review): the signature line (listing line 120) is absent from this
// listing; only the body is visible here.
121 { return CSeqMaskerUtil::reverse_complement( seq, size ); }
122 
123 //------------------------------------------------------------------------------
125  const string & arg_input,
126  CNcbiOstream & os,
127  const string & infmt_arg,
128  const string & sformat,
129  const string & arg_th,
130  Uint4 mem_avail,
131  Uint1 arg_unit_size,
132  Uint8 arg_genome_size,
133  Uint4 arg_min_count,
134  Uint4 arg_max_count,
135  bool arg_check_duplicates,
136  bool arg_use_list,
137  const CWinMaskUtil::CIdSet * arg_ids,
138  const CWinMaskUtil::CIdSet * arg_exclude_ids,
139  bool use_ba, string const & metadata,
140  double min_pct, double extend_pct, double thres_pct, double max_pct )
141 : input( arg_input ),
142  ustat( CSeqMaskerOstatFactory::create(
143  sformat, os, use_ba, metadata ) ),
144  max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
145  genome_size( arg_genome_size ),
146  min_count( arg_min_count == 0 ? 1 : arg_min_count ),
147  // max_count( 1024*1024UL ),
148  max_count( 500 ),
149  t_high( arg_max_count ),
150  has_min_count( arg_min_count != 0 ),
151  no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
152  check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
153  total_ecodes( 0 ),
154  score_counts( max_count, 0 ),
155  ids( arg_ids ), exclude_ids( arg_exclude_ids ),
156  infmt( infmt_arg )
157 {
158  // Parse arg_th to set up th[].
159  string::size_type pos( 0 );
160  Uint1 count( 0 );
161 
162  while( pos != string::npos && count < 4 )
163  {
164  string::size_type newpos = arg_th.find_first_of( ",", pos );
165  th[count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
166  pos = (newpos == string::npos ) ? newpos : newpos + 1;
167  }
168 }
169 
170 //------------------------------------------------------------------------------
172  const string & arg_input,
173  const string & output,
174  const string & infmt_arg,
175  const string & sformat,
176  const string & arg_th,
177  Uint4 mem_avail,
178  Uint1 arg_unit_size,
179  Uint8 arg_genome_size,
180  Uint4 arg_min_count,
181  Uint4 arg_max_count,
182  bool arg_check_duplicates,
183  bool arg_use_list,
184  const CWinMaskUtil::CIdSet * arg_ids,
185  const CWinMaskUtil::CIdSet * arg_exclude_ids,
186  bool use_ba, string const & metadata,
187  double min_pct, double extend_pct, double thres_pct, double max_pct )
188 : input( arg_input ),
189  ustat( CSeqMaskerOstatFactory::create(
190  sformat, output, use_ba, metadata ) ),
191  max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
192  genome_size( arg_genome_size ),
193  min_count( arg_min_count == 0 ? 1 : arg_min_count ),
194  // max_count( 1024*1024UL ),
195  max_count( 500 ),
196  t_high( arg_max_count ),
197  has_min_count( arg_min_count != 0 ),
198  no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
199  check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
200  total_ecodes( 0 ),
201  score_counts( max_count, 0 ),
202  ids( arg_ids ), exclude_ids( arg_exclude_ids ),
203  infmt( infmt_arg )
204 {
205  // Parse arg_th to set up th[].
206  string::size_type pos( 0 );
207  Uint1 count( 0 );
208 
209  while( pos != string::npos && count < 4 )
210  {
211  string::size_type newpos = arg_th.find_first_of( ",", pos );
212  th[count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
213  pos = (newpos == string::npos ) ? newpos : newpos + 1;
214  }
215 
216  if( min_pct >= 0.0 ) th[0] = min_pct;
217  if( extend_pct >= 0.0 ) th[1] = extend_pct;
218  if( thres_pct >= 0.0 ) th[2] = thres_pct;
219  if( max_pct >= 0.0 ) th[3] = max_pct;
220 }
221 
222 //------------------------------------------------------------------------------
224 
225 //------------------------------------------------------------------------------
227 {
    // Drives the whole unit-counting run:
    //   1. build the list of input files;
    //   2. optionally check the inputs for duplicates;
    //   3. if unit_size is 0, deduce it from the genome size;
    //   4. split each unit into a prefix and a suffix so that one counts
    //      table indexed by suffix fits in max_mem;
    //   5. call process() once per prefix value (pass 1);
    //   6. derive the four threshold indices from the cumulative counts;
    //   7. unless both counts were supplied (no_extra_pass), deduce
    //      min_count / t_high and run process() again with output enabled;
    //   8. write the statistics comments and the t_* parameters to ustat.
    // Throws GenCountsException( eNullGenome ) when the genome length has to
    // be computed and turns out to be 0.
    // NOTE(review): the signature line and listing lines 294, 311, 323 and
    // 371 are absent from this listing; `offset`, used below, has no
    // declaration in the visible lines.
228  // Generate a list of files to process.
229  vector< string > file_list;
230 
    // input is either a comma-separated list of file names or the name of
    // a file holding one file name per line (empty lines are skipped).
231  if( !use_list ) {
232  NStr::Split(input, ",", file_list);
233  } else {
234  string line;
235  CNcbiIfstream fl_stream( input.c_str() );
236 
237  while( getline( fl_stream, line ) ) {
238  if( !line.empty() ) {
239  file_list.push_back( line );
240  }
241  }
242  }
243 
244  // Check for duplicates, if necessary.
245  if( check_duplicates )
246  {
247  CheckDuplicates( file_list, infmt, ids, exclude_ids );
248  }
249 
    // A unit_size of 0 means the unit size must be deduced here.
250  if( unit_size == 0 )
251  {
252  if( genome_size == 0 )
253  {
254  LOG_POST( "computing the genome length" );
255  Uint8 total = 0;
256 
257  for( vector< string >::const_iterator i = file_list.begin();
258  i != file_list.end(); ++i )
259  {
260  total += fastalen( *i );
261  }
262 
263  genome_size = total;
264 
265  if( genome_size == 0 ) {
266  NCBI_THROW( GenCountsException, eNullGenome, "" );
267  }
268  }
269 
    // Find the largest u in [1,15] with genome_size / 4**u >= 5 (the loop
    // ends at 0 if there is none); the unit size used is one more than that.
270  for( unit_size = 15; unit_size > 0; --unit_size ) {
271  if( (genome_size>>(2*unit_size)) >= 5 ) {
272  break;
273  }
274  }
275 
276  ++unit_size;
277  _TRACE( "unit size is: " << unit_size );
278  }
279 
280  // Estimate the length of the prefix.
281  // Prefix length is unit_size - suffix length, where suffix length
282  // is max N: (4**N) < max_mem.
283  Uint1 prefix_size( 0 ), suffix_size( unit_size );
    // n_units is how many Uint4 counters fit into max_mem bytes.
284  Uint8 n_units( max_mem/sizeof( Uint4 ) );
285 
    // Shrink the suffix until a table of 4**suffix_size counters fits.
286  while( suffix_size > 0 ) {
287  Uint8 units_needed( 1ULL<<(2*suffix_size) );
288  if( units_needed <= n_units ) break;
289  --suffix_size;
290  }
291 
292  NCBI_ASSERT( suffix_size > 0, "suffix size is 0" );
293  prefix_size = unit_size - suffix_size;
    // NOTE(review): listing line 294 is absent from this listing.
295 
296  // Now process for each prefix.
    // prefix_exp is the number of distinct prefix values, 4**prefix_size.
297  Uint4 prefix_exp( 1<<(2*prefix_size) );
298  Uint4 passno = 1;
299  LOG_POST( "pass " << passno );
300 
    // Pass 1: units are written out only when no second pass will follow.
301  for( Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix ) {
302  process( prefix, prefix_size, file_list, no_extra_pass );
303  }
304 
305  ++passno;
306 
307  // Now put the final statistics as comments at the end of the output.
    // Turn the per-count histogram into a running (cumulative) total.
308  for( Uint4 i( 1 ); i < max_count; ++i )
309  score_counts[i] += score_counts[i-1];
310 
    // NOTE(review): listing line 311 is absent from this listing;
    // presumably it declares `offset` -- confirm against the real file.
312  Uint4 index[4] = {0, 0, 0, 0};
313  double previous( 0.0 );
314  double current;
315 
316  if( no_extra_pass )
317  {
318  ostringstream s;
319  s << " " << total_ecodes << " ecodes";
320  ustat->setComment( s.str() );
321  }
322 
    // NOTE(review): listing line 323 is absent from this listing.
324 
    // For every count i compute the cumulative percentage of ecodes and
    // record in index[j] the count at which that percentage crosses th[j].
325  for( Uint4 i( 1 ); i <= max_count; ++i )
326  {
327  current = 100.0*(((double)(score_counts[i - 1] + offset))
328  /((double)total_ecodes));
329 
330  if( no_extra_pass )
331  {
332  ostringstream s;
333  s << " " << dec << i << "\t" << score_counts[i - 1] + offset << "\t"
334  << current;
335  ustat->setComment( s.str() );
336  ustat->SetCount( i, current );
337  }
338 
339  for( Uint1 j( 0 ); j < 4; ++j )
340  if( previous < th[j] && current >= th[j] )
341  index[j] = i;
342 
343  previous = current;
344  }
345 
346  // If min_count or t_high must be deduced do it and reprocess.
347  if( !no_extra_pass )
348  {
349  total_ecodes = 0;
350 
351  if( !has_min_count )
352  min_count = index[0];
353 
354  if( t_high == 0 )
355  t_high = index[3];
356 
357  if( min_count == 0 )
358  min_count = 1;
359 
    // Reset the histogram before counting again with the deduced limits.
360  for( Uint4 i( 0 ); i < max_count; ++i )
361  score_counts[i] = 0;
362 
363  LOG_POST( "pass " << passno );
364 
    // Pass 2: same counting, this time always with unit output enabled.
365  for( Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix )
366  process( prefix, prefix_size, file_list, true );
367 
368  for( Uint4 i( 1 ); i < max_count; ++i )
369  score_counts[i] += score_counts[i-1];
370 
    // NOTE(review): listing line 371 is absent from this listing;
    // presumably it recomputes `offset` -- confirm against the real file.
372 
373  {
374  ostringstream s;
375  s << " " << total_ecodes << " ecodes";
376  ustat->setComment( s.str() );
377  }
378 
    // index[] is not recomputed here; the values found after pass 1 are
    // the ones reported below.
379  for( Uint4 i( 1 ); i <= max_count; ++i )
380  {
381  current
382  = 100.0*(((double)(score_counts[i - 1] + offset))
383  /((double)total_ecodes));
384  ostringstream s;
385  s << " " << dec << i << "\t" << score_counts[i - 1] + offset << "\t"
386  << current;
387  ustat->setComment( s.str() );
388  ustat->SetCount( i, current );
389  }
390  }
391 
392  ustat->setComment( "" );
393 
    // NOTE(review): the literal below contains "%%"; written through an
    // ostream this emits two percent characters.
394  for( Uint1 i( 0 ); i < 4; ++i )
395  {
396  ostringstream s;
397  s << " " << th[i] << "%% threshold at " << index[i];
398  ustat->setComment( s.str() );
399  }
400 
401  ustat->setParam( "t_low ", index[0] );
402  ustat->setParam( "t_extend ", index[1] );
403  ustat->setParam( "t_threshold", index[2] );
404  ustat->setParam( "t_high ", index[3] );
405  ustat->finalize();
406 }
407 
408 //------------------------------------------------------------------------------
410  Uint1 prefix_size,
411  const vector< string > & input_list,
412  bool do_output )
413 {
    // Counts, over all files in input_list, the units whose leading
    // prefix_size letters equal `prefix`, then folds the counts into
    // total_ecodes and score_counts.
    //   prefix       value of the leading part of the units handled by
    //                this call
    //   prefix_size  number of letters in that leading part
    //   input_list   input files, read with the configured format (infmt)
    //   do_output    when true, units reaching min_count are also reported
    //                (see the note at the end of the function)
    // NOTE(review): the first signature line (listing line 409) and listing
    // lines 447 and 546 are absent from this listing.
414  Uint1 suffix_size( unit_size - prefix_size );
    // One counter per possible suffix value: 4**suffix_size entries.
415  Uint8 vector_size( 1ULL<<(2*suffix_size) );
416  vector< Uint4 > counts( vector_size, 0 );
    // Each letter takes two bits, hence the 2*size shifts in the masks.
417  Uint4 unit_mask( (1<<(2*unit_size)) - 1 );
418  Uint4 prefix_mask( ((1<<(2*prefix_size)) - 1)<<(2*suffix_size) );
419  Uint4 suffix_mask( (1<<2*suffix_size) - 1 );
    // A size of 16 needs all 32 bits; the masks are set explicitly for that
    // case (presumably because the shift expressions above cannot yield
    // them -- confirm).
420  if( unit_size == 16 ) unit_mask = 0xFFFFFFFF;
421 
422  if( suffix_size == 16 )
423  {
424  suffix_mask = 0xFFFFFFFF;
425  prefix_mask = 0;
426  }
427 
428  _TRACE( "prefix: " << prefix <<
429  "\nprefix_size: " << (int)prefix_size <<
430  "\nsuffix_size: " << (int)suffix_size <<
431  "\nvector_size: " << vector_size <<
432  "\nunit_mask: " << unit_mask <<
433  "\nprefix_mask: " << prefix_mask <<
434  "\nsufffix_mask: " << suffix_mask );
435 
436  /*
437  std::cerr << "prefix: " << prefix <<
438  "\nprefix_size: " << (int)prefix_size <<
439  "\nsuffix_size: " << (int)suffix_size <<
440  "\nvector_size: " << vector_size <<
441  "\nunit_mask: " << unit_mask <<
442  "\nprefix_mask: " << prefix_mask <<
443  "\nsufffix_mask: " << suffix_mask << std::endl;
444  */
445 
    // Move the prefix into the high bits so it can be compared directly
    // with (unit & prefix_mask).
446  prefix <<= (2*suffix_size);
    // NOTE(review): listing line 447 is absent from this listing.
448 
449  for( vector< string >::const_iterator it( input_list.begin() );
450  it != input_list.end(); ++it )
451  {
452  for(CWinMaskUtil::CInputBioseq_CI bs_iter(*it, infmt); bs_iter; ++bs_iter)
453  {
454  CBioseq_Handle bsh = *bs_iter;
455 
456  if( CWinMaskUtil::consider( bsh, ids, exclude_ids ) )
457  {
458  CSeqVector data =
459  bs_iter->GetSeqVector(CBioseq_Handle::eCoding_Iupac);
460 
461  if( data.empty() )
462  continue;
463 
464  TSeqPos length( data.size() );
    // count: letters accumulated since the last ambiguous character;
    // unit:  the most recent letters packed two bits each.
465  Uint4 count( 0 );
466  Uint4 unit( 0 );
467 
468  for( Uint4 i( 0 ); i < length; ++i ) {
    // An ambiguous character restarts accumulation of the unit.
469  if( ambig( data[i] ) )
470  {
471  count = 0;
472  unit = 0;
473  continue;
474  }
475  else
476  {
477  unit = ((unit<<2)&unit_mask) + letter( data[i] );
478 
    // From here on `unit` holds unit_size consecutive unambiguous letters.
479  if( count >= unit_size - 1 )
480  {
481  Uint4 runit( reverse_complement( unit, unit_size ) );
482 
    // Count under the smaller of unit / runit, and only when it carries
    // the prefix handled by this call.  When unit == runit both branches
    // below fire, so such a unit is incremented twice per occurrence.
    // Counters stop increasing at 0xffffffff.
483  if( unit <= runit && (unit&prefix_mask) == prefix )
484  {
485  auto & c( counts[unit&suffix_mask] );
486 
487  if( c < 0xffffffffUL )
488  {
489  ++c;
490  }
491  // ++counts[unit&suffix_mask];
492  }
493 
494  if( runit <= unit && (runit&prefix_mask) == prefix )
495  {
496  auto & c( counts[runit&suffix_mask] );
497 
498  if( c < 0xffffffffUL )
499  {
500  ++c;
501  }
502  // ++counts[runit&suffix_mask];
503  }
504  }
505 
506  ++count;
507  }
508  }
509  }
510  }
511  }
512 
513  /*
514  {
515  std::ofstream ofs( "./counts.txt" );
516 
517  for( Uint8 i( 0 ); i < vector_size; ++i )
518  {
519  Uint4 u( prefix + i ), ru( 0 );
520  ofs << u << ' ' << counts[i] << '\n';
521  }
522 
523  ofs << std::flush;
524  }
525  */
526 
    // Fold the per-suffix counters into the generator-wide statistics.
527  for( Uint8 i( 0 ); i < vector_size; ++i )
528  {
    // u is the full unit value (prefix bits plus suffix index i).
529  Uint4 u( prefix + i ), ru( 0 );
530 
    // A unit that equals its own reverse complement adds 1 to
    // total_ecodes; any other counted unit adds 2.
531  if( counts[i] > 0 )
532  {
533  ru = reverse_complement( u, unit_size );
534  if( u == ru ) ++total_ecodes; else total_ecodes += 2;
535  }
536 
537  if( counts[i] >= min_count )
538  {
    // Histogram slot is count - 1; counts of max_count or more all land
    // in the last slot.  Same 1-or-2 weighting as above.
539  if( counts[i] >= max_count )
540  if( u == ru ) ++score_counts[max_count - 1];
541  else score_counts[max_count - 1] += 2;
542  else if( u == ru ) ++score_counts[counts[i] - 1];
543  else score_counts[counts[i] - 1] += 2;
544 
545  if( do_output )
    // NOTE(review): listing line 546 (the start of this call) is absent
    // from this listing; the visible arguments are the unit and its count
    // capped at t_high.
547  u, (counts[i] > t_high) ? t_high : counts[i] );
548  }
549  }
550 }
551 
552 //------------------------------------------------------------------------------
// Return a text description of this exception's error code:
// eNullGenome yields "empty genome"; every other code is delegated to
// CException::GetErrCodeString().
// NOTE(review): the line carrying the qualified function name (listing
// line 554) is absent from this listing.
553 const char *
555 {
556  switch( GetErrCode() ) {
557  case eNullGenome: return "empty genome";
558  default: return CException::GetErrCodeString();
559  }
560 }
561 
User-defined methods of the data storage class.
CBioseq_Handle –.
Factory of CSeqMaskerOstat objects.
void setComment(const string &msg)
Add a comment to the unit counts file.
void SetCount(Uint4 count, double pct)
void SetMaxCount(Uint4 mc)
void setUnitCount(Uint4 unit, Uint4 count)
Add count value for a particular unit.
void finalize()
Perform any final tasks required to generate unit counts in the particular format.
void setParam(const string &name, Uint4 value)
Set a value of a WindowMasker parameter.
void setUnitSize(Uint1 us)
Set the unit size value.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Reverse complement of a unit.
CSeqVector –.
Definition: seq_vector.hpp:65
Definition: Seq_entry.hpp:56
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Exceptions that CWinMaskCountsGenerator may throw.
virtual const char * GetErrCodeString() const override
Return description string corresponding to an error code.
~CWinMaskCountsGenerator()
Object destructor.
void process(Uint4 prefix, Uint1 prefix_size, const vector< string > &input, bool do_output)
CRef< CSeqMaskerOstat > ustat
const CWinMaskUtil::CIdSet * ids
void operator()()
This function does the actual n-mer counting.
Uint8 fastalen(const string &fname) const
CWinMaskCountsGenerator(const string &input, const string &output, const string &infmt, const string &sformat, const string &th, Uint4 mem_avail, Uint1 unit_size, Uint8 genome_size, Uint4 min_count, Uint4 max_count, bool check_duplicates, bool use_list, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids, bool use_ba, string const &metadata, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Constructor.
const CWinMaskUtil::CIdSet * exclude_ids
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
Function iterating over bioseqs in input.
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
static SQLCHAR output[256]
Definition: print.c:5
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
string
Definition: cgiapp.hpp:687
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TSeqPos GetBioseqLength(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
static int input()
int i
int len
const struct ncbi::grid::netcache::search::fields::SIZE size
The Object manager core.
static const char * prefix[]
Definition: pcregrep.c:405
CRef< objects::CObjectManager > om
else result
Definition: token2.c:20
void CheckDuplicates(const vector< string > &input, const string &infmt, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids)
Check for possibly duplicate sequences in the input.
USING_SCOPE(objects)
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
static Uint4 letter(char c)
static bool ambig(char c)
Modified on Thu May 23 12:26:48 2024 by modify_doxy.py rev. 669887