161 for(
int i = 0;
i < 16;
i++) {
243 bool new_format =
false;
267 Uint4 amb_words = (new_format
268 ? (0x80000000 | (num_amb * 2))
274 for(
int i = 0;
i < num_amb;
i++) {
292 int length_m1 =
r.Length() - 1;
295 _ASSERT((length_m1 >> 12) == 0);
300 char ch0 = (
r.Value() << 4) | (length_m1 >> 8);
301 char ch1 = length_m1 & 0xFF;
319 int length_m1 =
r.Length() - 1;
320 int off =
r.Offset();
323 _ASSERT((length_m1 >> 4) == 0);
329 char ch0 = (
r.Value() << 4) | length_m1;
332 A1[1] = (off >> 16) & 0xFF;
333 A1[2] = (off >> 8) & 0xFF;
334 A1[3] = (off ) & 0xFF;
379 cerr <<
"Error: '0' ambiguity code found, changing to 15." << endl;
383 int bitcount = ((
value & 1) +
394 for(
int i = 0;
i < 4;
i++) {
396 if ((
value & (1 <<
i)) == 0)
434 vector<unsigned char>
ctable;
437 for(
int i = 0;
i<4;
i++) {
450 typedef unsigned char uchar;
459 int last_byte = blast_bytes - 1;
464 if (!((
int)inp_bytes == (
int)byte_length)) {
465 cout <<
"ib=" << inp_bytes <<
",n4sz=" << byte_length << endl;
468 _ASSERT((
int)inp_bytes == (
int)byte_length);
470 seq.resize(blast_bytes);
479 for(
int i = 0;
i < inp_bytes;
i++) {
485 uchar b2 = inp & 0xF;
493 if (((c1 | c2) & 0x80) == 0) {
496 half = (c1 << 2) | c2;
508 half |= ambiguities.
Check(b1,
i*2) << 2;
509 half |= ambiguities.
Check(b2,
i*2+1);
512 seq[
i/2] |= (
i & 1) ? half : (half << 4);
514 seq[last_byte] &= 255-3;
515 seq[last_byte] |= remainder;
533 const vector<char> & v =
si.GetSeq_data().GetNcbistdaa().Get();
536 seq.assign(& v[0], v.size());
541 const string & v =
si.GetSeq_data().GetNcbieaa().Get();
556 const string & v =
si.GetSeq_data().GetIupacaa().Get();
576 int last_byte = blast_bytes - 1;
578 const vector<char> & v =
si.GetSeq_data().GetNcbi2na().Get();
580 _ASSERT((
int)data_bytes == (
int)v.size());
582 seq.reserve(blast_bytes);
583 seq.assign(& v[0], data_bytes);
584 seq.resize(blast_bytes);
586 seq[last_byte] &= 255-3;
587 seq[last_byte] |= remainder;
592 const string & v =
si.GetSeq_data().GetIupacna().Get();
607 (
int)
si.GetLength(),
Encode ambiguities in blast database format.
int m_Log2[16]
Table mapping 1248 to 0123.
void x_AddAmbig(int value, int offset)
Add an ambiguity letter.
CAmbigDataBuilder(int sz)
Constructor.
void GetAmbig(string &amb)
Compute and return the encoded list of ambiguities.
void x_PackOldAmbig(string &amb, CAmbiguousRegion &r)
Append the 'old' encoding of one ambiguous region to a string.
vector< CAmbiguousRegion > m_Regions
Ambiguous regions for the sequence.
int Check(int data, int offset)
Check (and maybe store) a possibly ambiguous letter.
int x_Random(int value)
Pick a random letter from the set represented by an ambiguity.
CRandom m_Random
Random number generator.
void x_PackNewAmbig(string &amb, const CAmbiguousRegion &r)
Append the 'new' encoding of one ambiguous region to a string.
int m_Size
Size of the input sequence.
Ambiguous portion of a sequence.
CAmbiguousRegion(int value, int offset, int length)
Construct a new ambiguous region of a specified length.
int m_Value
Value of base (ambiguity letter).
int Length() const
Get the length of this ambiguous region.
CAmbiguousRegion(int value, int offset)
Construct a new ambiguous region one letter in length.
bool Append(int value, int offset)
Try to append a letter to an ambiguous region.
int m_Start
Starting offset (offset of first base).
int m_End
End offset (offset of first disincluded base.)
int Value() const
Get the letter value for this region.
@ eMaxLength
Maximum length of a region.
CAmbiguousRegion()
Construct a new, empty, ambiguous region.
int Offset() const
Get the starting offset of the region.
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
static int base_length[29]
static const char si[8][64]
static unsigned char ctable[16]
const TPrim & Get(void) const
uint32_t Uint4
4-byte (32-bit) unsigned integer
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
TLength GetLength(void) const
Get the Length member data.
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
unsigned int
A callback function used to compare two keys in a database.
const GenericPointer< typename T::ValueType > T2 value
static const BitmapCharRec ch1
static const BitmapCharRec ch0
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void WriteDB_Ncbi4naToBinary(const char *ncbi4na, int byte_length, int base_length, string &seq, string &amb)
Build binary blast2na + ambig encoding based on ncbi4na input.
void WriteDB_Ncbi2naToBinary(const CSeq_inst &si, string &seq)
Build blast db nucleotide format from Ncbi2na Seq-inst.
vector< unsigned char > s_BuildNa4ToNa2Table()
Builds a table from NA4 to NA2 (with ambiguities marked as 0xFF.)
void WriteDB_EaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Eaa protein Seq-inst.
void WriteDB_IupacaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Iupacaa protein Seq-inst.
void WriteDB_StdaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Stdaa protein Seq-inst.
void WriteDB_IupacnaToBinary(const CSeq_inst &si, string &seq, string &amb)
Build blast db nucleotide format from Iupacna Seq-inst.
Data conversion tools for CWriteDB and associated code.
void s_AppendInt4(string &outp, int x)
Append a value to a string as a 4 byte big-endian integer.
Implementation for general purpose utilities for WriteDB.
int s_DivideRoundUp(int value, int blocksize)
Divide by a number, rounding up to a whole integer.