NCBI C++ ToolKit
writedb_convert.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_convert.cpp 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb_convert.cpp
31 /// Data conversion tools for CWriteDB and associated code.
32 /// class for WriteDB.
33 #include <ncbi_pch.hpp>
35 #include <util/random_gen.hpp>
38 #include <iostream>
39 
41 
43 
44 /// Ambiguous portion of a sequence.
45 ///
46 /// This class represents a portion of a sequence that is ambiguous.
47 /// The ambiguites represented by this region must have the same
48 /// ambiguity letter, be contiguous, and span at most 4095 letters.
49 /// If any of these conditions is not met, the issue may be solved by
50 /// the use of multiple objects of this class.
52 public:
53  /// Construct a new, empty, ambiguous region.
55  : m_Start (0),
56  m_End (0),
57  m_Value (0)
58  {
59  }
60 
61  /// Construct a new ambiguous region one letter in length.
62  /// @param value Ambiguity letter to use. [in]
63  /// @param offset Starting offset of the ambiguity. [in]
65  : m_Start (offset),
66  m_End (offset+1),
67  m_Value (value)
68  {
69  }
70 
71  /// Construct a new ambiguous region of a specified length.
72  /// @param value Ambiguity letter to use. [in]
73  /// @param offset Starting offset of the ambiguity. [in]
74  /// @param length Length of the ambiguity. [in]
75  CAmbiguousRegion(int value, int offset, int length)
76  : m_Start (offset),
77  m_End (offset+length),
78  m_Value (value)
79  {
80  }
81 
82  /// Try to append a letter to an ambiguous region.
83  ///
84  /// The provided letter will be appended to this region if the
85  /// resulting region would be 4095 or fewer letters in length,
86  /// contain only one ambiguity letter, and be contiguous. If it
87  /// would not, false is returned and this region is unchanged.
88  ///
89  /// @param value Ambiguity letter to use. [in]
90  /// @param offset Starting offset of the ambiguity. [in]
91  /// @return True if the letter was appended, false otherwise.
92  bool Append(int value, int offset)
93  {
94  _ASSERT(m_End && m_Value);
95 
96  if (value == m_Value && offset == m_End) {
97  if (Length() < eMaxLength) {
98  m_End ++;
99  return true;
100  }
101  }
102 
103  return false;
104  }
105 
106  /// Get the letter value for this region.
107  /// @return Letter value.
108  int Value() const
109  {
110  return m_Value;
111  }
112 
113  /// Get the length of this ambiguous region.
114  /// @return Length of the region.
115  int Length() const
116  {
117  return m_End - m_Start;
118  }
119 
120  /// Get the starting offset of the region.
121  /// @return Starting offset of the region.
122  int Offset() const
123  {
124  return m_Start;
125  }
126 
127 private:
128  // Maxinum length that we allow for one region of ambiguities;
129  // longer regions than this will be packed as multiple regions.
130  //
131  // Note that in WriteDB, the new format is triggered by long
132  // regions OR long sequences. In formatdb, the new format is
133  // triggered only by long sequences, which can result in long
134  // ambiguous regions being stored as lists of thousands of lists
135  // of short ambiguity runs.
136 
137  // Old format uses <= 16 bases per Int4, which means that a run of
138  // 32 'N' bases results in 32/16 = 2 Int4s, which is 8 bytes,
139  // which is the same space as used to stored 16 compressed bases.
140 
141  enum {
142  eMaxLength = 0xFFF ///< Maximum length of a region.
143  };
144 
145  int m_Start; ///< Starting offset (offset of first base).
146  int m_End; ///< End offset (offset of first disincluded base.)
147  int m_Value; ///< Value of base (ambiguity letter).
148 };
149 
150 /// Encode ambiguities in blast database format.
151 ///
152 /// This class encodes nucleotide ambiguities in blast database format
153 /// from a series of ambiguous letter values and offsets.
155 public:
156  /// Constructor.
157  /// @param sz Size of the sequence in letters. [in]
159  : m_Size(sz), m_Random(sz)
160  {
161  for(int i = 0; i < 16; i++) {
162  m_Log2[i] = -1;
163  }
164 
165  // Only these values should be specified
166  m_Log2[1] = 0;
167  m_Log2[2] = 1;
168  m_Log2[4] = 2;
169  m_Log2[8] = 3;
170  }
171 
172  /// Check (and maybe store) a possibly ambiguous letter.
173  ///
174  /// If the letter is not an ambiguity, this method converts it to
175  /// Blast-NA2 format, and returns it. If the letter value is an
176  /// ambiguity, it is added to the list of ambiguities, and a
177  /// randomly selected letter value is returned. Each ambiguity
178  /// letter (there are 12 for nucleotide) represents a possiblity
179  /// between two or more nucleotide bases. The random letter is
180  /// always selected from the set of these values corresponding to
181  /// the input ambiguity value.
182  ///
183  /// @param data Letter value in BlastNA8. [in]
184  /// @param offset Offset of letter. [in]
185  /// @return Value to encode as BlastNA2.
186  int Check(int data, int offset)
187  {
188  // If offset is past the sequence end, return zero. This can
189  // happen since the calling code checks for ambiguities two
190  // letters at a time.
191 
192  if (offset >= m_Size) {
193  return 0;
194  }
195 
196  // If we recieve a non-ambiguity, return the normal
197  // translation for this letter code.
198 
199  _ASSERT(data != 0);
200 
201  int rv = m_Log2[data & 0xF];
202 
203  if (rv != -1) {
204  return rv;
205  }
206 
207  // Bona-fide ambiguity; we need to make up some random junk,
208  // and also build an ambiguous region.
209 
211 
212  return x_Random(data);
213  }
214 
215  // New format: (inclusive ranges)
216  // 4 bits (31-28): value (residue mask, i.e. 0xF for 'N')
217  // 12 bits (27-16): length of region
218  // 16 bits (15- 0): (unused)
219  //
220  // 32 bits: offset
221  //
222  // Old format:
223  // 4 bits (31-28): value (residue mask, i.e. 0xF for 'N')
224  // 4 bits (27-24): length of region
225  // 24 bits (23- 0): offset
226 
227  /// Compute and return the encoded list of ambiguities.
228  ///
229  /// The list of ambiguous regions is packed in blast database
230  /// format and returned to the user. If the length of the
231  /// sequence is larger than 2^24-1, or any of the ambiguous
232  /// regions is larger than 0xF, the 'new' format of ambiguity is
233  /// used, which allows for larger ambiguous regions at higher
234  /// sequence offsets, but requires 8 bytes per ambiguous region
235  /// instead of four bytes required by the 'old' format.
236  ///
237  /// @param amb The ambiguity data in blast database format. [out]
238  void GetAmbig(string & amb)
239  {
240  // The decision to use the new format should (probably) be
241  // selected via an analysis of all ambiguities encoded here.
242 
243  bool new_format = false;
244 
245  // Current technique: If the sequence length is longer than
246  // 2^24-1, I use the new format. Else, if any region of
247  // ambiguities is longer than 16 bases, I use the new format.
248  // Otherwise, I use the old format.
249 
250  if (m_Size > 0xFFFFFF) {
251  new_format = true;
252  } else {
253  for(unsigned i = 0; i<m_Regions.size(); i++) {
254  if (m_Regions[i].Length() > 0xF) {
255  new_format = true;
256  break;
257  }
258  }
259  }
260 
261  int num_amb = (int) m_Regions.size();
262 
263  // The size packed here is actually the number of *words* used
264  // rather than the number of ambiguous regions; the new format
265  // two words per region.
266 
267  Uint4 amb_words = (new_format
268  ? (0x80000000 | (num_amb * 2))
269  : num_amb);
270 
271  amb.reserve((1 + m_Regions.size())*8);
272  s_AppendInt4(amb, amb_words);
273 
274  for(int i = 0; i < num_amb; i++) {
275  // Regions over 4k are split during ambiguity scanning; at
276  // this point in the code, splitting has already happened.
277 
278  if (new_format) {
279  x_PackNewAmbig(amb, m_Regions[i]);
280  } else {
281  x_PackOldAmbig(amb, m_Regions[i]);
282  }
283  }
284  }
285 
286  /// Append the 'new' encoding of one ambiguous region to a string.
287  /// @param amb String encoding of all ambiguous regions. [in|out]
288  /// @param r Ambiguous region. [in]
289  void x_PackNewAmbig(string & amb,
290  const CAmbiguousRegion & r)
291  {
292  int length_m1 = r.Length() - 1;
293 
294  _ASSERT(r.Value() <= 15);
295  _ASSERT((length_m1 >> 12) == 0);
296 
297  // First word - residue value and run length.
298  char A1[4];
299 
300  char ch0 = (r.Value() << 4) | (length_m1 >> 8);
301  char ch1 = length_m1 & 0xFF;
302 
303  A1[0] = ch0;
304  A1[1] = ch1;
305  A1[2] = A1[3] = 0; // unused space
306 
307  amb.append(A1, 4);
308 
309  // Second word - starting offset.
310 
311  s_AppendInt4(amb, r.Offset());
312  }
313 
314  /// Append the 'old' encoding of one ambiguous region to a string.
315  /// @param amb String encoding of all ambiguous regions. [in|out]
316  /// @param r Ambiguous region. [in]
317  void x_PackOldAmbig(string & amb, CAmbiguousRegion & r)
318  {
319  int length_m1 = r.Length() - 1;
320  int off = r.Offset();
321 
322  _ASSERT(r.Value() <= 15);
323  _ASSERT((length_m1 >> 4) == 0);
324  _ASSERT(off <= 0xFFFFFF); // old form uses three byte offsets
325 
326  // All in one word.
327  char A1[4];
328 
329  char ch0 = (r.Value() << 4) | length_m1;
330 
331  A1[0] = ch0;
332  A1[1] = (off >> 16) & 0xFF;
333  A1[2] = (off >> 8) & 0xFF;
334  A1[3] = (off ) & 0xFF;
335 
336  amb.append(A1, 4);
337  }
338 
339 private:
340  /// Add an ambiguity letter.
341  ///
342  /// The internal encoding contains a list of ambiguous ranges.
343  /// This method adds the given letter at the given offset to the
344  /// most recent region, if possible, or creates a new region for
345  /// it.
346  ///
347  /// @param value Ambiguous letter to add. [in]
348  /// @param offset Offset at which letter occurs. [in]
349  void x_AddAmbig(int value, int offset)
350  {
351  if (m_Regions.size()) {
352  if (m_Regions.back().Append(value, offset)) {
353  return;
354  }
355  }
356 
358  m_Regions.push_back(r);
359  }
360 
361  /// Pick a random letter from the set represented by an ambiguity.
362  ///
363  /// This method takes an ambiguous value as input, and returns a
364  /// letter randomly chosen from the set of letters the ambiguity
365  /// represents.
366  ///
367  /// @param value An ambiguous letter. [in]
368  /// @return A non-ambiguous letter.
369  int x_Random(int value)
370  {
371  // 0xF is the most common case of ambiguity so it's worth to
372  // process it in fastest way, especially because it's very easy.
373 
374  if (value == 0xF) {
375  return m_Random.GetRand() & 0x3;
376  }
377 
378  if (value == 0) {
379  cerr << "Error: '0' ambiguity code found, changing to 15." << endl;
380  return m_Random.GetRand() & 0x3;
381  }
382 
383  int bitcount = ((value & 1) +
384  ((value >> 1) & 1) +
385  ((value >> 2) & 1) +
386  ((value >> 3) & 1));
387 
388  // 1-bit ambiguities here, indicate an error in this class.
389  _ASSERT(bitcount >= 2);
390  _ASSERT(bitcount <= 3);
391 
392  int pick = m_Random.GetRand() % bitcount;
393 
394  for(int i = 0; i < 4; i++) {
395  // skip 0 bits in input.
396  if ((value & (1 << i)) == 0)
397  continue;
398 
399  // If the bitcount is zero, this is the bit we want.
400  if (! pick)
401  return i;
402 
403  // Else, decrement.
404  pick--;
405  }
406 
407  // This should be unreachable.
408  _ASSERT(0);
409  return 0;
410  }
411 
412  // Data
413 
414  /// Table mapping 1248 to 0123.
415  int m_Log2[16];
416 
417  /// Size of the input sequence.
418  int m_Size;
419 
420  /// Ambiguous regions for the sequence.
421  vector<CAmbiguousRegion> m_Regions;
422 
423  /// Random number generator
425 };
426 
427 /// Builds a table from NA4 to NA2 (with ambiguities marked as 0xFF.)
428 /// @return A vector indexed by NA4 value, with values from 0-3 or 0xFF.
429 inline vector<unsigned char> s_BuildNa4ToNa2Table()
430 {
431  // ctable takes a byte index and returns 0-16 output from 0-16.
432  // Invalid or ambiguous elements return FF instead.
433 
434  vector<unsigned char> ctable;
435  ctable.resize(16, 0xFF);
436 
437  for(int i = 0; i<4; i++) {
438  ctable[1 << i] = i;
439  }
440 
441  return ctable;
442 }
443 
444 void WriteDB_Ncbi4naToBinary(const char * ncbi4na,
445  int byte_length,
446  int base_length,
447  string & seq,
448  string & amb)
449 {
450  typedef unsigned char uchar;
451 
452  static vector<uchar> ctable = s_BuildNa4ToNa2Table();
453 
454  // Build the sequence data.
455 
456  int inp_bytes = s_DivideRoundUp(base_length, 2);
457  int blast_bytes = base_length / 4 + 1;
458  int remainder = base_length & 3;
459  int last_byte = blast_bytes - 1;
460 
461  // Accumulator for ambiguity data.
462  CAmbigDataBuilder ambiguities(base_length);
463 
464  if (!((int)inp_bytes == (int)byte_length)) {
465  cout << "ib=" << inp_bytes << ",n4sz=" << byte_length << endl;
466  }
467 
468  _ASSERT((int)inp_bytes == (int)byte_length);
469 
470  seq.resize(blast_bytes);
471 
472  // Fun Fact: If the number of bases is even, we can process two at
473  // a time, but if the number of input bases is odd, we want to
474  // short circuit the last 'read' from the input array, since that
475  // would walk off the end of the array. So, i2_limit computes the
476  // proper limit for i2's input data, where the primary iteration
477  // refers to the iteration limit for i1.
478 
479  for(int i = 0; i < inp_bytes; i++) {
480  // one input byte
481  uchar inp = ncbi4na[i];
482 
483  // represents 2 bases
484  uchar b1 = inp >> 4;
485  uchar b2 = inp & 0xF;
486 
487  // compress each to 2 bits
488  uchar c1 = ctable[b1];
489  uchar c2 = ctable[b2];
490 
491  uchar half = 0;
492 
493  if (((c1 | c2) & 0x80) == 0) {
494  // No ambiguities, so we can do this the easy way.
495 
496  half = (c1 << 2) | c2;
497  } else {
498  // Check each element, accumulate ambiguity data.
499 
500  if (! b1) {
501  b1 = 0xF; // replace gap with 'N'
502  }
503 
504  if (! b2 && (i*2+1) < base_length) {
505  b2 = 0xF; // the last base is the sentinel
506  }
507 
508  half |= ambiguities.Check(b1, i*2) << 2;
509  half |= ambiguities.Check(b2, i*2+1);
510  }
511 
512  seq[i/2] |= (i & 1) ? half : (half << 4);
513  }
514  seq[last_byte] &= 255-3;
515  seq[last_byte] |= remainder;
516 
517  ambiguities.GetAmbig(amb);
518 }
519 
520 void WriteDB_Ncbi4naToBinary(const CSeq_inst & seqinst,
521  string & seq,
522  string & amb)
523 {
524  const vector<char> & na4 = seqinst.GetSeq_data().GetNcbi4na().Get();
525  int base_length = seqinst.GetLength();
526 
527  WriteDB_Ncbi4naToBinary(& na4[0], (int) na4.size(), base_length, seq, amb);
528 }
529 
530 void WriteDB_StdaaToBinary(const CSeq_inst & si, string & seq)
531 {
532  // No conversion is actually done here.
533  const vector<char> & v = si.GetSeq_data().GetNcbistdaa().Get();
534 
535  _ASSERT(si.GetLength() == v.size());
536  seq.assign(& v[0], v.size());
537 }
538 
539 void WriteDB_EaaToBinary(const CSeq_inst & si, string & seq)
540 {
541  const string & v = si.GetSeq_data().GetNcbieaa().Get();
542 
543  _ASSERT(si.GetLength() == v.size());
544 
545  // convert to string.
548  0,
549  (int) v.size(),
550  seq,
552 }
553 
554 void WriteDB_IupacaaToBinary(const CSeq_inst & si, string & seq)
555 {
556  const string & v = si.GetSeq_data().GetIupacaa().Get();
557 
558  _ASSERT(si.GetLength() == v.size());
559 
560  // convert to string.
563  0,
564  (int) v.size(),
565  seq,
567 }
568 
570  string & seq)
571 {
572  int base_length = si.GetLength();
573  int data_bytes = s_DivideRoundUp(base_length, 4);
574  int blast_bytes = base_length / 4 + 1;
575  int remainder = base_length & 3;
576  int last_byte = blast_bytes - 1;
577 
578  const vector<char> & v = si.GetSeq_data().GetNcbi2na().Get();
579 
580  _ASSERT((int)data_bytes == (int)v.size());
581 
582  seq.reserve(blast_bytes);
583  seq.assign(& v[0], data_bytes);
584  seq.resize(blast_bytes);
585 
586  seq[last_byte] &= 255-3;
587  seq[last_byte] |= remainder;
588 }
589 
590 void WriteDB_IupacnaToBinary(const CSeq_inst & si, string & seq, string & amb)
591 {
592  const string & v = si.GetSeq_data().GetIupacna().Get();
593 
594  _ASSERT(si.GetLength() == v.size());
595 
596  string tmp;
597  // convert to string.
600  0,
601  (int) v.size(),
602  tmp,
604 
606  (int) tmp.size(),
607  (int) si.GetLength(),
608  seq,
609  amb);
610 }
611 
Encode ambiguities in blast database format.
int m_Log2[16]
Table mapping 1248 to 0123.
void x_AddAmbig(int value, int offset)
Add an ambiguity letter.
CAmbigDataBuilder(int sz)
Constructor.
void GetAmbig(string &amb)
Compute and return the encoded list of ambiguities.
void x_PackOldAmbig(string &amb, CAmbiguousRegion &r)
Append the 'old' encoding of one ambiguous region to a string.
vector< CAmbiguousRegion > m_Regions
Ambiguous regions for the sequence.
int Check(int data, int offset)
Check (and maybe store) a possibly ambiguous letter.
int x_Random(int value)
Pick a random letter from the set represented by an ambiguity.
CRandom m_Random
Random number generator.
void x_PackNewAmbig(string &amb, const CAmbiguousRegion &r)
Append the 'new' encoding of one ambiguous region to a string.
int m_Size
Size of the input sequence.
Ambiguous portion of a sequence.
CAmbiguousRegion(int value, int offset, int length)
Construct a new ambiguous region of a specified length.
int m_Value
Value of base (ambiguity letter).
int Length() const
Get the length of this ambiguous region.
CAmbiguousRegion(int value, int offset)
Construct a new ambiguous region one letter in length.
bool Append(int value, int offset)
Try to append a letter to an ambiguous region.
int m_Start
Starting offset (offset of first base).
int m_End
End offset (offset of first disincluded base.)
int Value() const
Get the letter value for this region.
@ eMaxLength
Maximum length of a region.
CAmbiguousRegion()
Construct a new, empty, ambiguous region.
int Offset() const
Get the starting offset of the region.
CRandom::
Definition: random_gen.hpp:66
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
static int base_length[29]
static const char si[8][64]
Definition: des.c:146
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
static unsigned char ctable[16]
const TPrim & Get(void) const
Definition: serialbase.hpp:347
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
static const BitmapCharRec ch0
Definition: ncbi_10x20.c:13
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const CMedia A1("A1", 594, 841, CUnit::eMillimeter)
#define _ASSERT
unsigned char uchar
Definition: unzcrash.c:37
void WriteDB_Ncbi4naToBinary(const char *ncbi4na, int byte_length, int base_length, string &seq, string &amb)
Build binary blast2na + ambig encoding based on ncbi4na input.
void WriteDB_Ncbi2naToBinary(const CSeq_inst &si, string &seq)
Build blast db nucleotide format from Ncbi2na Seq-inst.
vector< unsigned char > s_BuildNa4ToNa2Table()
Builds a table from NA4 to NA2 (with ambiguities marked as 0xFF.)
void WriteDB_EaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Eaa protein Seq-inst.
void WriteDB_IupacaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Iupacaa protein Seq-inst.
USING_SCOPE(std)
void WriteDB_StdaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Stdaa protein Seq-inst.
void WriteDB_IupacnaToBinary(const CSeq_inst &si, string &seq, string &amb)
Build blast db nucleotide format from Iupacna Seq-inst.
Data conversion tools for CWriteDB and associated code.
void s_AppendInt4(string &outp, int x)
Append a value to a string as a 4 byte big-endian integer.
Implementation for general purpose utilities for WriteDB.
int s_DivideRoundUp(int value, int blocksize)
Divide by a number, rounding up to a whole integer.
Modified on Fri Sep 20 14:58:18 2024 by modify_doxy.py rev. 669887