NCBI C++ ToolKit
win_mask_counts_converter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: win_mask_counts_converter.cpp 98105 2022-09-29 00:28:26Z morgulis $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of counts format converter class.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <sstream>
36 
41 
43 
45 
46 //------------------------------------------------------------------------------
48  const string & input_fname, const string & output_fname,
49  const string & counts_oformat, string const & metadata )
50  : istat( 0 ), ofname( output_fname ), oformat( counts_oformat ), os( 0 ),
51  metadata( metadata )
52 {
53  if( input_fname == "-" ) {
54  NCBI_THROW(
55  Exception, eBadOption, "input file name must be non-empty" );
56  }
57 
58  if( output_fname == "-" ) {
59  NCBI_THROW(
60  Exception, eBadOption, "output file name must be non-empty" );
61  }
62 
63  LOG_POST( "reading counts..." );
65  input_fname, 0, 0, 0, 0, 0, 0, true );
66 }
67 
68 //------------------------------------------------------------------------------
70  const string & input_fname, CNcbiOstream & out_stream,
71  const string & counts_oformat, string const & metadata )
72  : istat( 0 ), ofname( "" ), oformat( counts_oformat ), os( &out_stream ),
73  metadata( metadata )
74 {
75  if( input_fname == "-" ) {
76  NCBI_THROW(
77  Exception, eBadOption, "input file name must be non-empty" );
78  }
79 
80  LOG_POST( "reading counts..." );
82  input_fname, 0, 0, 0, 0, 0, 0, true );
83 }
84 
85 //------------------------------------------------------------------------------
87 {
88  CRef< CSeqMaskerOstat > ostat( 0 );
89  string md( metadata );
90 
91  if( md.empty() ) md = istat->GetMetaData();
92 
93  if( os == 0 ) {
95  oformat, ofname, true, md );
96  }
97  else ostat = CSeqMaskerOstatFactory::create( oformat, *os, true, md );
98 
99  Uint4 unit_size = istat->UnitSize();
100  _TRACE( "set unit size to " << unit_size );
101  ostat->setUnitSize( unit_size );
102  Uint8 num_units = (unit_size < 16) ? (1ULL<<(2*unit_size))
103  : 0x100000000ULL;
104  LOG_POST( "converting counts..." );
105 
106  for( Uint8 i = 0; i < num_units; ++i ) {
107  Uint4 ri = CSeqMaskerUtil::reverse_complement( i, unit_size );
108 
109  if( i <= ri ) {
110  Uint4 count = istat->trueat( i );
111  if( count != 0 ) ostat->setUnitCount( i, count );
112  }
113  }
114 
115  LOG_POST( "converting parameters..." );
116 
117  Uint4 t_low = istat->get_min_count();
118  Uint4 t_extend = istat->get_textend();
119  Uint4 t_threshold = istat->get_threshold();
120  Uint4 t_high = istat->get_max_count();
121  ostat->setParam( "t_low ", t_low );
122  ostat->setParam( "t_extend ", t_extend );
123  ostat->setParam( "t_threshold", t_threshold );
124  ostat->setParam( "t_high ", t_high );
125  LOG_POST( "final processing..." );
127 
128  if( !istat->GetCountMap().empty() )
129  {
130  Uint4 max_count( istat->GetMaxCount() );
131  auto const & cm( istat->GetCountMap() );
132  ostat->SetMaxCount( max_count );
133  for( size_t i( 0 ); i <= max_count; ++i ) ostat->SetCount( i, cm[i] );
134  }
135 
136  ostat->finalize();
137  return 0;
138 }
139 
140 //------------------------------------------------------------------------------
141 const char *
143 {
144  switch( GetErrCode() ) {
145  case eBadOption: return "argument error";
146  default: return CException::GetErrCodeString();
147  }
148 }
149 
151 
static CSeqMaskerIstat * create(const string &name, Uint4 threshold, Uint4 textend, Uint4 max_count, Uint4 use_max_count, Uint4 min_count, Uint4 use_min_count, bool use_ba, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Create a unit counts container from a file.
virtual Uint4 trueat(Uint4 unit) const =0
Get the true count for an n-mer.
Uint4 get_min_count() const
Get the value of T_low.
std::vector< double > const & GetCountMap() const
string const & GetMetaData() const
Return the metadata string.
Uint4 GetMaxCount() const
Uint4 get_textend() const
Get the value of T_extend.
Uint4 get_max_count() const
Get the current value of T_high.
CSeqMaskerVersion const & GetStatAlgoVersion() const
Return the version of the algorithm used to generate counts.
virtual Uint1 UnitSize() const =0
Get the unit size.
Uint4 get_threshold() const
Get the value of T_threshold.
static CSeqMaskerOstat * create(const string &ustat_type, const string &name, bool use_ba, string const &metadata="")
Method used to create a CSeqMaskerOstat object by format name.
void SetStatAlgoVersion(CSeqMaskerVersion const &v)
Set the counts generation algorithm version explicitly (needed for conversions).
void SetCount(Uint4 count, double pct)
void SetMaxCount(Uint4 mc)
void setUnitCount(Uint4 unit, Uint4 count)
Add count value for a particular unit.
void finalize()
Perform any final tasks required to generate unit counts in the particular format.
void setParam(const string &name, Uint4 value)
Set a value of a WindowMasker parameter.
void setUnitSize(Uint1 us)
Set the unit size value.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Reverse complement of a unit.
Class defining exceptions specific to CWinMaskCountsConverter.
virtual const char * GetErrCodeString() const override
Return description string corresponding to an error code.
@ eBadOption
Command line options inconsistency.
string oformat
target n-mer counts format for the output
CWinMaskCountsConverter(const string &input_fname, const string &output_fname, const string &counts_oformat, string const &metadata)
Instance constructor.
CRef< CSeqMaskerIstat > istat
object containing unit counts read from the input
int operator()()
Method performing the actual conversion.
#define md
Definition: compat-1.3.h:2001
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
int i
Modified on Sun Apr 14 05:28:16 2024 by modify_doxy.py rev. 669887