NCBI C++ ToolKit
utf8.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utf8.cpp 70627 2016-01-08 13:02:41Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksey Vinokurov, Vladimir Ivanov
27  *
28  * File Description: UTF8 converter functions
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <util/utf8.hpp>
34 
37 
// Translation tables.
// Every entry is the ASCII-7 character chosen to stand in for one Unicode
// code point. Besides 7-bit characters only two special values may appear:
// 0x00 (unable to translate) and 0xFF (character should be skipped).

// Table for the code points U+0080..U+02FF; index it with (code - 0x80).
// The comment that ends each row is the code point of the row's first entry
// divided by 16, in hex (row "0C" starts at U+00C0, row "2F" at U+02F0).
// The table is read-only, hence "const".
static const unsigned char tblTrans[] =
{
    // Latin Base
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                       // 08
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                       // 09
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 'a', 0, '"', 0, 0, '-',                                 // 0A
    0xFF, 0, '2', '3', '\'', 0, 0, '.', 0, '1', 'o', 0, '"', 0, 0, 0,                     // 0B
    'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',       // 0C
    'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'Y', 0, 'B',         // 0D
    'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',       // 0E
    'o', 'n', 'o', 'o', 'o', 'o', 'o', '-', 'o', 'u', 'u', 'u', 'u', 'y', 0, 'y',         // 0F
    // Latin A
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd',       // 10
    'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g',       // 11
    'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',       // 12
    'I', 'i', 'J', 'j', 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L',       // 13
    'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'N', 'n', 'O', 'o', 'O', 'o',       // 14
    'O', 'o', 'O', 'o', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's',       // 15
    'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',       // 16
    'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 0,         // 17
    // Latin B
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    'b', 'B', 0, 0, 0, 0, 0, 'C', 'c', 'D', 'D', 0, 0, 0, 0, 0,                           // 18
    'E', 'F', 'f', 'G', 0, 0, 0, 'I', 'K', 'k', 0, 0, 0, 'N', 'n', 0,                     // 19
    'O', 'o', 0, 0, 'P', 'p', 'R', 0, 0, 0, 0, 't', 'T', 't', 'T', 'U',                   // 1A
    'u', 0, 0, 'Y', 'y', 'Z', 'z', 'Z', 0, 0, 'z', 0, 0, 0, 0, 0,                         // 1B
    0, 0, 0, '!', 'D', 'd', 'd', 'L', 'L', 'l', 'N', 'N', 'n', 'A', 'a', 'I',             // 1C
    'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 0, 'A', 'a',         // 1D
    'A', 'a', 'A', 'a', 'G', 'g', 'G', 'g', 'K', 'k', 'O', 'o', 'O', 'o', 'Z', 'z',       // 1E
    'j', 'D', 'D', 'd', 'G', 'g', 0, 0, 'N', 'n', 'A', 'a', 0, 0, 'O', 'o',               // 1F
    'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o',       // 20
    'R', 'r', 'R', 'r', 'U', 'u', 'U', 'u', 'S', 's', 'T', 't', 0, 0, 'H', 'h',           // 21
    0, 0, 0, 0, 'Z', 'z', 'A', 'a', 'E', 'e', 'O', 'o', 'O', 'o', 'O', 'o',               // 22
    'O', 'o', 'Y', 'y', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                               // 23
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                       // 24
    // IPA Extensions
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    0, 'a', 0, 0, 0, 0, 'd', 'd', 0, 0, 0, 0, 0, 0, 0, 0,                                 // 25
    'g', 'g', 'G', 0, 0, 0, 'h', 'h', 'i', 'i', 'I', 0, 0, 0, 0, 0,                       // 26
    0, 'm', 0, 'n', 'N', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                 // 27
    'R', 0, 's', 0, 0, 0, 0, 0, 't', 'u', 0, 0, 0, 0, 0, 'Y',                             // 28
    'Z', 'Z', 'z', 'z', 0, 0, 0, 0, 'O', 'B', 0, 'G', 'H', 'j', 0, 'L',                   // 29
    'q', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                     // 2A
    // Spacing Modifiers
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    'h', 'h', 'j', 'r', 0, 0, 0, 'w', 'y', '\'', '"', '\'', '\'', '\'', '\'', '\'',       // 2B
    '?', '?', '<', '>', '^', 'v', '^', 'v', '\'', '-', '\'', '`', '\'', '_', '\'', '`',   // 2C
    0, 0, '\'', '\'', 0, 0, '+', '-', '~', '.', '.', 0, '~', '"', 0, 'x',                 // 2D
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                       // 2E
    0, 'l', 's', 'x', 0, 0, 0, 0, 0, 0, 0, 0, 'v', '=', '"', 0                            // 2F
};
97 
// Table for the code points U+1E00..U+1EFF (Latin Extended Additional);
// index it with (code - 0x1E00).
// Entries follow the same convention as tblTrans: an ASCII-7 character, or
// 0x00 when no translation is available. The comment that ends each row is
// the code point of the row's first entry divided by 16, in hex.
// The table is read-only, hence "const".
static const unsigned char tblTransA[] =
{
    // Latin Extended Additional
    // 0 1 2 3 4 5 6 7 8 9 A B C D E F
    'A', 'a', 'B', 'b', 'B', 'b', 'B', 'b', 'C', 'c', 'D', 'd', 'D', 'd', 'D', 'd',   // 1E0
    'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'F', 'f',   // 1E1
    'G', 'g', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i',   // 1E2
    'K', 'k', 'K', 'k', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'M', 'm',   // 1E3
    'M', 'm', 'M', 'm', 'N', 'n', 'N', 'n', 'N', 'n', 'N', 'n', 'O', 'o', 'O', 'o',   // 1E4
    'O', 'o', 'O', 'o', 'P', 'p', 'P', 'p', 'R', 'r', 'R', 'r', 'R', 'r', 'R', 'r',   // 1E5
    'S', 's', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't',   // 1E6
    'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'V', 'v', 'V', 'v',   // 1E7
    'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'W', 'w', 'X', 'x', 'X', 'x', 'Y', 'y',   // 1E8
    'Z', 'z', 'Z', 'z', 'Z', 'z', 'h', 't', 'w', 'y', 'a', 'f', 0, 0, 0, 0,           // 1E9
    'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a',   // 1EA
    'A', 'a', 'A', 'a', 'A', 'a', 'A', 'a', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e',   // 1EB
    'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'I', 'i', 'I', 'i', 'O', 'o', 'O', 'o',   // 1EC
    'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o', 'O', 'o',   // 1ED
    'O', 'o', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',   // 1EE
    'U', 'u', 'Y', 'y', 'Y', 'y', 'Y', 'y', 'Y', 'y', 0, 0, 0, 0, 0, 0                // 1EF
};
120 
// Leave the enclosing function, returning "ch", after reporting "res"
// through the function's "status" out-parameter (skipped when "status" is
// NULL). The enclosing function must have a pointer named "status" in scope.
//
// The expansion is a braced block, so the macro may be written without a
// trailing semicolon -- and has to be when an "else" follows it directly.
//
#define RETURN_S(ch,res)                \
{                                       \
    if (status) { *status = (res); }    \
    return (ch);                        \
}
129 
// Leave the enclosing function, returning "ch", after reporting the sequence
// length "len" through "seq_len" and the result "res" through "status".
// Either out-parameter is skipped when it is NULL. The enclosing function
// must have pointers named "seq_len" and "status" in scope.
//
// The expansion is a braced block, so the macro may be written without a
// trailing semicolon -- and has to be when an "else" follows it directly.
//
#define RETURN_LS(ch,len,res)           \
{                                       \
    if (seq_len) { *seq_len = (len); }  \
    if (status)  { *status = (res); }   \
    return (ch);                        \
}
139 
140 
141 // Convert first UTF-8 symbol of "src" into ASCII-7 character.
142 // "ascii_table" specifies whether to use ASCII-7 translation tables.
143 // Length of the retrieved UTF-8 symbol is returned in "*seq_len"
144 // (if "seq_len" is not NULL).
145 // Return resulting ASCII-7 character.
146 // NOTE: If the UTF-8 symbol has no ASCII-7 equivalent, then return
147 // kOutrangeChar or hSkipChar.
148 //
149 char StringToChar(const string& src,
150  size_t* seq_len,
151  bool ascii_table,
152  EConversionStatus* status)
153 {
154  long dst_code; // UTF-code symbol code
155  unsigned char dst_char; // Result character
156  EConversionStatus stat; // Temporary status
157 
158  // Process one UTF character
159  dst_code = StringToCode(src, seq_len, &stat);
160  if (status) *status = stat;
161  // If it was happily
162  if (stat == eSuccess) {
163  // Conversion
164  if (ascii_table) {
165  // Convert into appropriate 7-bit character via conversion table
166  dst_char = CodeToChar(dst_code, status);
167  return dst_char;
168  }
169  else
170  {
171  // if character greater than 127 (0x7F) than substitute it
172  // with kOutrangeChar, else leave it as is.
173  if (dst_code > 0x7F) {
175  }
176  }
177  }
178  // Was error translate char
179  return (char)dst_code;
180 }
181 
182 
183 // Convert UTF-8 string "src" into the ASCII-7 string with
184 // graphically similar characters -- using StringToChar().
185 // Return resulting ASCII-7 string.
186 //
187 string StringToAscii(const string& src, bool ascii_table)
188 {
189  string dst; // String to result
190  char ch; // Temporary UTF symbol code
191  size_t utf_len; // Length of UTF symbol
192  size_t src_len; // Length source string
193 
194  src_len = src.size();
195 
196  for (size_t i = 0; i < src_len; )
197  {
198  // Process one UTF character
199  ch = StringToChar(src.data() + i, &utf_len, ascii_table);
200  // Add character to the result vector
201  if ( ch != kSkipChar ) dst += ch;
202  i += utf_len;
203  }
204  return dst;
205 }
206 
207 
208 // Convert first UTF-8 symbol of "src" into a Unicode symbol code.
209 // Length of the retrieved UTF-8 symbol is returned in "*seq_len"
210 // (if "seq_len" is not NULL).
211 // Return resulting Unicode symbol code.
212 // NOTE: If the UTF-8 symbol has no Unicode equivalent, then return
213 // kOutrangeChar or hSkipChar.
214 //
215 long StringToCode(const string& src,
216  size_t* seq_len,
217  EConversionStatus* status)
218 {
219  unsigned char ch = src.data()[0];
220  size_t utf_len = 0;
221  long dst_code = 0;
222 
223  // If character less then 0x80 we put it as is
224  if (ch < 0x80)
225  {
226  RETURN_LS (ch, 1, eSuccess)
227  }
228  else
229  {
230  // Determine the length of the UTF-8 symbol in bytes
231  if ((ch & 0xFC) == 0xFC) utf_len = 6; // 6 bytes length
232  else if ((ch & 0xF8) == 0xF8) utf_len = 5; // 5 bytes length
233  else if ((ch & 0xF0) == 0xF0) utf_len = 4; // 4 bytes length
234  else if ((ch & 0xE0) == 0xE0) utf_len = 3; // 3 bytes length
235  else if ((ch & 0xC0) == 0xC0) utf_len = 2; // 2 bytes length
236  else
237  {
238  // Bad character. Save it as kOutrangeChar
240  }
241  }
242 
243  // Broken unicode sequence
244  if (utf_len > src.size()) {
245  RETURN_LS ((long)kSkipChar, 1, eSkipChar);
246  }
247 
248  unsigned char mask = 0xFF;
249  mask = (unsigned char)(mask >> (int)utf_len);
250  dst_code = ch & mask;
251 
252  for (size_t j = 1; j < utf_len; j++)
253  {
254  dst_code = dst_code << 6;
255  ch = src.data()[j];
256  ch &= 0x3F;
257  dst_code = dst_code | ch;
258  }
259  // Return result
260  RETURN_LS (dst_code, utf_len, eSuccess)
261 }
262 
263 
264 // Convert UTF-8 string "src" into the vector of Unicode symbol codes
265 // using StringToCode().
266 // Return resulting vector.
267 //
268 vector<long> StringToVector (const string& src)
269 {
270  vector<long> dst; // String to result
271  long ch; // Unicode symbol code
272  size_t utf_len; // Length of Unicode symbol
273  size_t src_len; // Length of source string
274 
275  src_len = src.size();
276 
277  for (size_t i = 0; i < src_len; )
278  {
279  // Process one UTF character
280  ch = StringToCode(src.data()+i, &utf_len);
281  // Add character to the result vector
282  dst.push_back(ch);
283  i += utf_len;
284  }
285  return dst;
286 }
287 
288 
289 // Translate Unicode symbol code "src" into graphically similar ASCII-7
290 // character.
291 // Return resulting ASCII-7 character.
292 // NOTE: If the Unicode symbol has no ASCII-7 equivalent, then return
293 // kOutrangeChar or hSkipChar.
294 //
295 char CodeToChar(const long src, EConversionStatus* status)
296 {
297  unsigned char ch;
298 
299  if (src < 0x80) RETURN_S ((char)src, eSuccess);
300  if ((src >= 0x0300) && (src <= 0x036F)) RETURN_S (kSkipChar, eSkipChar);
301  if ((src >= 0x1E00) && (src <= 0x1EFF))
302  {
303  ch = tblTransA[src-0x1E00];
305  else RETURN_S ((char)ch, eSuccess);
306  }
307  if ((src >= 0xFE20) && (src <= 0xFE2F)) RETURN_S (kSkipChar, eSkipChar);
308  if (src > 0x2FF) RETURN_S (kOutrangeChar, eOutrangeChar);
309 
310  ch = tblTrans[src-0x80];
312 
313  RETURN_S ((char)ch, eSuccess);
314 }
315 
316 
#define static
ncbi::TMaskedQueryRegions mask
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
vector< long > StringToVector(const string &src)
Definition: utf8.cpp:268
char StringToChar(const string &src, size_t *seq_len, bool ascii_table, EConversionStatus *status)
Definition: utf8.cpp:149
EConversionStatus
Definition: utf8.hpp:64
char CodeToChar(const long src, EConversionStatus *status)
Definition: utf8.cpp:295
const char kOutrangeChar
Definition: utf8.hpp:54
long StringToCode(const string &src, size_t *seq_len, EConversionStatus *status)
Definition: utf8.cpp:215
string StringToAscii(const string &src, bool ascii_table)
Definition: utf8.cpp:187
const char kSkipChar
Definition: utf8.hpp:61
@ eSkipChar
Definition: utf8.hpp:66
@ eSuccess
Definition: utf8.hpp:65
@ eOutrangeChar
Definition: utf8.hpp:67
int i
static BOOL utf8
Definition: pcregrep.c:199
static unsigned char tblTransA[]
Definition: utf8.cpp:98
#define RETURN_S(ch, res)
Definition: utf8.cpp:124
static unsigned char tblTrans[]
Definition: utf8.cpp:43
#define RETURN_LS(ch, len, res)
Definition: utf8.cpp:133
Modified on Fri Mar 01 10:08:26 2024 by modify_doxy.py rev. 669887