66 case 'a':
case 'A':
return 0;
67 case 'c':
case 'C':
return 1;
68 case 'g':
case 'G':
return 2;
69 case 't':
case 'T':
return 3;
/// Tell whether a character is something other than one of the four
/// unambiguous nucleotide letters.
///
/// @param c  character to classify
/// @return   false when c is 'a', 'c', 'g' or 't' in either case;
///           true for every other character
static inline bool ambig( char c )
{
    switch( c )
    {
        case 'a': case 'A':
        case 'c': case 'C':
        case 'g': case 'G':
        case 't': case 'T':
            return false;   // one of the eight accepted spellings

        default:
            return true;    // anything else is reported as ambiguous
    }
}
87 if( bioseq.CanGetInst()
88 && bioseq.GetInst().CanGetLength()
89 && bioseq.GetInst().CanGetSeq_data() )
92 const CSeq_data & seqdata( bioseq.GetInst().GetSeq_data() );
93 unique_ptr< CSeq_data > dest(
new CSeq_data );
96 return dest->GetIupacna().Get();
125 const string & arg_input,
127 const string & infmt_arg,
128 const string & sformat,
129 const string & arg_th,
132 Uint8 arg_genome_size,
135 bool arg_check_duplicates,
139 bool use_ba,
string const & metadata,
140 double min_pct,
double extend_pct,
double thres_pct,
double max_pct )
141 :
input( arg_input ),
143 sformat, os, use_ba, metadata ) ),
144 max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
145 genome_size( arg_genome_size ),
146 min_count( arg_min_count == 0 ? 1 : arg_min_count ),
149 t_high( arg_max_count ),
150 has_min_count( arg_min_count != 0 ),
151 no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
152 check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
154 score_counts( max_count, 0 ),
155 ids( arg_ids ), exclude_ids( arg_exclude_ids ),
159 string::size_type pos( 0 );
162 while( pos != string::npos &&
count < 4 )
164 string::size_type newpos = arg_th.find_first_of(
",", pos );
165 th[
count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
166 pos = (newpos == string::npos ) ? newpos : newpos + 1;
172 const string & arg_input,
174 const string & infmt_arg,
175 const string & sformat,
176 const string & arg_th,
179 Uint8 arg_genome_size,
182 bool arg_check_duplicates,
186 bool use_ba,
string const & metadata,
187 double min_pct,
double extend_pct,
double thres_pct,
double max_pct )
188 :
input( arg_input ),
190 sformat,
output, use_ba, metadata ) ),
191 max_mem( mem_avail*1024*1024ULL ), unit_size( arg_unit_size ),
192 genome_size( arg_genome_size ),
193 min_count( arg_min_count == 0 ? 1 : arg_min_count ),
196 t_high( arg_max_count ),
197 has_min_count( arg_min_count != 0 ),
198 no_extra_pass( arg_min_count != 0 && arg_max_count != 0 ),
199 check_duplicates( arg_check_duplicates ),use_list( arg_use_list ),
201 score_counts( max_count, 0 ),
202 ids( arg_ids ), exclude_ids( arg_exclude_ids ),
206 string::size_type pos( 0 );
209 while( pos != string::npos &&
count < 4 )
211 string::size_type newpos = arg_th.find_first_of(
",", pos );
212 th[
count++] = atof( arg_th.substr( pos, newpos - pos ).c_str() );
213 pos = (newpos == string::npos ) ? newpos : newpos + 1;
216 if( min_pct >= 0.0 )
th[0] = min_pct;
217 if( extend_pct >= 0.0 )
th[1] = extend_pct;
218 if( thres_pct >= 0.0 )
th[2] = thres_pct;
219 if( max_pct >= 0.0 )
th[3] = max_pct;
229 vector< string > file_list;
237 while( getline( fl_stream, line ) ) {
238 if( !line.empty() ) {
239 file_list.push_back( line );
254 LOG_POST(
"computing the genome length" );
258 i != file_list.end(); ++
i )
286 while( suffix_size > 0 ) {
287 Uint8 units_needed( 1ULL<<(2*suffix_size) );
288 if( units_needed <= n_units )
break;
292 NCBI_ASSERT( suffix_size > 0,
"suffix size is 0" );
297 Uint4 prefix_exp( 1<<(2*prefix_size) );
301 for(
Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix ) {
312 Uint4 index[4] = {0, 0, 0, 0};
313 double previous( 0.0 );
339 for(
Uint1 j( 0 ); j < 4; ++j )
340 if( previous <
th[j] && current >=
th[j] )
365 for(
Uint4 prefix( 0 ); prefix < prefix_exp; ++prefix )
366 process( prefix, prefix_size, file_list,
true );
397 s <<
" " <<
th[
i] <<
"%% threshold at " << index[
i];
411 const vector< string > & input_list,
415 Uint8 vector_size( 1ULL<<(2*suffix_size) );
416 vector< Uint4 > counts( vector_size, 0 );
418 Uint4 prefix_mask( ((1<<(2*prefix_size)) - 1)<<(2*suffix_size) );
419 Uint4 suffix_mask( (1<<2*suffix_size) - 1 );
420 if(
unit_size == 16 ) unit_mask = 0xFFFFFFFF;
422 if( suffix_size == 16 )
424 suffix_mask = 0xFFFFFFFF;
428 _TRACE(
"prefix: " << prefix <<
429 "\nprefix_size: " << (
int)prefix_size <<
430 "\nsuffix_size: " << (
int)suffix_size <<
431 "\nvector_size: " << vector_size <<
432 "\nunit_mask: " << unit_mask <<
433 "\nprefix_mask: " << prefix_mask <<
434 "\nsufffix_mask: " << suffix_mask );
446 prefix <<= (2*suffix_size);
450 it != input_list.end(); ++it )
468 for(
Uint4 i( 0 );
i < length; ++
i ) {
483 if( unit <= runit && (unit&prefix_mask) == prefix )
485 auto & c( counts[unit&suffix_mask] );
487 if( c < 0xffffffffUL )
494 if( runit <= unit && (runit&prefix_mask) == prefix )
496 auto & c( counts[runit&suffix_mask] );
498 if( c < 0xffffffffUL )
527 for(
Uint8 i( 0 );
i < vector_size; ++
i )
529 Uint4 u( prefix +
i ), ru( 0 );
User-defined methods of the data storage class.
Factory of CSeqMaskerOstat objects.
void setComment(const string &msg)
Add a comment to the unit counts file.
void SetCount(Uint4 count, double pct)
void SetMaxCount(Uint4 mc)
void setUnitCount(Uint4 unit, Uint4 count)
Add count value for a particular unit.
void finalize()
Perform any final tasks required to generate unit counts in the particular format.
void setParam(const string &name, Uint4 value)
Set a value of a WindowMasker parameter.
void setUnitSize(Uint1 us)
Set the unit size value.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
Reverse complement of a unit.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
Exceptions that CWinMaskCountsGenerator may throw.
@ eNullGenome
Genome has 0 size.
virtual const char * GetErrCodeString() const override
Return description string corresponding to an error code.
~CWinMaskCountsGenerator()
Object destructor.
vector< Uint4 > score_counts
void process(Uint4 prefix, Uint1 prefix_size, const vector< string > &input, bool do_output)
CRef< CSeqMaskerOstat > ustat
const CWinMaskUtil::CIdSet * ids
void operator()()
This function does the actual n-mer counting.
Uint8 fastalen(const string &fname) const
CWinMaskCountsGenerator(const string &input, const string &output, const string &infmt, const string &sformat, const string &th, Uint4 mem_avail, Uint1 unit_size, Uint8 genome_size, Uint4 min_count, Uint4 max_count, bool check_duplicates, bool use_list, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids, bool use_ba, string const &metadata, double min_pct=-1.0, double extend_pct=-1.0, double thres_pct=-1.0, double max_pct=-1.0)
Constructor.
const CWinMaskUtil::CIdSet * exclude_ids
Base class for sets of seq_id representations used with -ids and -exclude-ids options.
static bool consider(const objects::CBioseq_Handle &bsh, const CIdSet *ids, const CIdSet *exclude_ids)
Check if the given bioseq should be considered for processing.
static SQLCHAR output[256]
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NCBI_ASSERT(expr, mess)
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
TErrCode GetErrCode(void) const
Get error code.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TSeqPos GetBioseqLength(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
const TSeq & GetSeq(void) const
Get the variant data.
@ e_Iupacna
IUPAC 1 letter nuc acid code.
const struct ncbi::grid::netcache::search::fields::SIZE size
CRef< objects::CObjectManager > om
void CheckDuplicates(const vector< string > &input, const string &infmt, const CWinMaskUtil::CIdSet *ids, const CWinMaskUtil::CIdSet *exclude_ids)
Check for possibly duplicate sequences in the input.
static Uint4 reverse_complement(Uint4 seq, Uint1 size)
static Uint4 letter(char c)
static bool ambig(char c)