NCBI C++ ToolKit
iconv.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* FreeTDS - Library of routines accessing Sybase and Microsoft databases
2  * Copyright (C) 2003, 2004 James K. Lowden, based on original work by Brian Bruns
3  * Copyright (C) 2011 Frediano Ziglio
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with this library; if not, write to the
17  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18  * Boston, MA 02111-1307, USA.
19  */
20 
21 /**
22  * \file
23  * This file implements a very simple iconv.
24  * Its purpose is to allow ASCII clients to communicate with Microsoft servers
25  * that encode their metadata in Unicode (UTF-16).
26  *
27  * It supports ISO-8859-1, ASCII, CP1252, UTF-16, UCS-4 and UTF-8
28  */
29 
30 #include <config.h>
31 
32 #if ! HAVE_ICONV
33 
34 #if HAVE_STRING_H
35 #include <string.h>
36 #endif /* HAVE_STRING_H */
37 #if HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40 
41 #include <assert.h>
42 #include <ctype.h>
43 
44 #include <freetds/tds.h>
45 #include <freetds/bytes.h>
46 #include <freetds/iconv.h>
47 #include <freetds/bool.h>
49 
50 #include "iconv_charsets.h"
51 
52 /**
53  * \addtogroup conv
54  * @{
55  */
56 
58 {
59  Like_to_Like = 0x100
60 };
61 
63 
64 /*
65  * Return values for get_*:
66  * - >0 bytes read
67  * - -EINVAL not enough data to read
68  * - -EILSEQ invalid encoding detected
69  * Return values for put_*:
70  * - >0 bytes written
71  * - -E2BIG no space left on output
72  * - -EILSEQ character can't be encoded in output charset
73  */
74 
75 static int
76 get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
77 {
79  unsigned int l = 1;
80 
81  do {
82  switch (decode_utf8(&state, &uc, *p++)) {
83  case UTF8_ACCEPT:
84  *out = uc;
85  return l;
86  case UTF8_REJECT:
87  return -EILSEQ;
88  }
89  } while (l++ < len);
90  return -EINVAL;
91 }
92 
93 static int
94 put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
95 {
96 #define MASK(n) ((0xffffffffu << (n)) & 0xffffffffu)
97  int o_len;
98 
99  if ((c & MASK(7)) == 0) {
100  if (buf_len < 1)
101  return -E2BIG;
102  *buf = (unsigned char) c;
103  return 1;
104  }
105 
106  o_len = 2;
107  for (;;) {
108  if ((c & MASK(11)) == 0)
109  break;
110  ++o_len;
111  if ((c & MASK(16)) == 0)
112  break;
113  ++o_len;
114  if ((c & MASK(21)) == 0)
115  break;
116  ++o_len;
117  if ((c & MASK(26)) == 0)
118  break;
119  ++o_len;
120  if ((c & MASK(31)) != 0)
121  return -EILSEQ;
122  }
123 
124  if (buf_len < o_len)
125  return -E2BIG;
126  buf += o_len;
127  buf_len = o_len - 1;
128  do {
129  *--buf = 0x80 | (c & 0x3f);
130  c >>= 6;
131  } while (--buf_len);
132  *--buf = (0xff00u >> o_len) | c;
133  return (int) o_len;
134 }
135 
136 static int
137 get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
138 {
139  if (len < 4)
140  return -EINVAL;
141  *out = TDS_GET_UA4LE(p);
142  return 4;
143 }
144 
145 static int
146 put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
147 {
148  if (buf_len < 4)
149  return -E2BIG;
150  TDS_PUT_UA4LE(buf, c);
151  return 4;
152 }
153 
154 static int
155 get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
156 {
157  if (len < 4)
158  return -EINVAL;
159  *out = TDS_GET_UA4BE(p);
160  return 4;
161 }
162 
163 static int
164 put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
165 {
166  if (buf_len < 4)
167  return -E2BIG;
168  TDS_PUT_UA4BE(buf, c);
169  return 4;
170 }
171 
172 static int
173 get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
174 {
175  ICONV_CHAR c, c2;
176 
177  if (len < 2)
178  return -EINVAL;
179  c = TDS_GET_UA2LE(p);
180  if ((c & 0xfc00) == 0xd800) {
181  if (len < 4)
182  return -EINVAL;
183  c2 = TDS_GET_UA2LE(p+2);
184  if ((c2 & 0xfc00) == 0xdc00) {
185  *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
186  return 4;
187  }
188  }
189  *out = c;
190  return 2;
191 }
192 
193 static int
194 put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
195 {
196  if (c < 0x10000u) {
197  if (buf_len < 2)
198  return -E2BIG;
199  TDS_PUT_UA2LE(buf, c);
200  return 2;
201  }
202  if (TDS_UNLIKELY(c >= 0x110000u))
203  return -EILSEQ;
204  if (buf_len < 4)
205  return -E2BIG;
206  TDS_PUT_UA2LE(buf, 0xd7c0 + (c >> 10));
207  TDS_PUT_UA2LE(buf+2, 0xdc00 + (c & 0x3ffu));
208  return 4;
209 }
210 
211 static int
212 get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
213 {
214  ICONV_CHAR c, c2;
215 
216  if (len < 2)
217  return -EINVAL;
218  c = TDS_GET_UA2BE(p);
219  if ((c & 0xfc00) == 0xd800) {
220  if (len < 4)
221  return -EINVAL;
222  c2 = TDS_GET_UA2BE(p+2);
223  if ((c2 & 0xfc00) == 0xdc00) {
224  *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
225  return 4;
226  }
227  }
228  *out = c;
229  return 2;
230 }
231 
232 static int
233 put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
234 {
235  if (c < 0x10000u) {
236  if (buf_len < 2)
237  return -E2BIG;
238  TDS_PUT_UA2BE(buf, c);
239  return 2;
240  }
241  if (TDS_UNLIKELY(c >= 0x110000u))
242  return -EILSEQ;
243  if (buf_len < 4)
244  return -E2BIG;
245  TDS_PUT_UA2BE(buf, 0xd7c0 + (c >> 10));
246  TDS_PUT_UA2BE(buf+2, 0xdc00 + (c & 0x3ffu));
247  return 4;
248 }
249 
250 static int
251 get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
252 {
253  *out = p[0];
254  return 1;
255 }
256 
257 static int
258 put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
259 {
260  if (c >= 0x100u)
261  return -EILSEQ;
262  if (buf_len < 1)
263  return -E2BIG;
264  buf[0] = (unsigned char) c;
265  return 1;
266 }
267 
268 static int
269 get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
270 {
271  if (p[0] >= 0x80)
272  return -EILSEQ;
273  *out = p[0];
274  return 1;
275 }
276 
277 static int
278 put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
279 {
280  if (c >= 0x80u)
281  return -EILSEQ;
282  if (buf_len < 1)
283  return -E2BIG;
284  buf[0] = (unsigned char) c;
285  return 1;
286 }
287 
288 static int
289 get_cp1252(const unsigned char *p, size_t len, ICONV_CHAR *out)
290 {
291  if (*p >= 0x80 && *p < 0xa0)
292  *out = cp1252_0080_00a0[*p - 0x80];
293  else
294  *out = *p;
295  return 1;
296 }
297 
298 static int
299 put_cp1252(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
300 {
301  if (buf_len < 1)
302  return -E2BIG;
303 
304  if (c >= 0x100 || ((c&~0x1fu) == 0x80 && cp1252_0080_00a0[c - 0x80] != c - 0x80)) {
305  switch (c) {
306 #define CP1252(i,o) case o: c = i; break;
307  CP1252_ALL
308 #undef CP1252
309  default:
310  return -EILSEQ;
311  }
312  }
313  *buf = c;
314  return 1;
315 }
316 
317 static int
318 get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
319 {
320  return -EILSEQ;
321 }
322 
323 static int
324 put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
325 {
326  return -EILSEQ;
327 }
328 
329 typedef int (*iconv_get_t)(const unsigned char *p, size_t len, ICONV_CHAR *out);
330 typedef int (*iconv_put_t)(unsigned char *buf, size_t buf_len, ICONV_CHAR c);
331 
332 static const iconv_get_t iconv_gets[16] = {
335 };
336 static const iconv_put_t iconv_puts[16] = {
339 };
340 
341 /**
342  * Inputs are FreeTDS canonical names, no other. No alias list is consulted.
343  */
344 iconv_t
345 tds_sys_iconv_open (const char* tocode, const char* fromcode)
346 {
347  int i;
348  unsigned int fromto;
349  const char *enc_name;
350  unsigned char encodings[2];
351 
352  static bool first_time = true;
353 
354  if (TDS_UNLIKELY(first_time)) {
355  first_time = false;
356  tdsdump_log(TDS_DBG_INFO1, "Using trivial iconv\n");
357  }
358 
359  /* match both inputs to our canonical names */
360  enc_name = fromcode;
361  for (i=0; i < 2; ++i) {
362  unsigned char encoding;
363 
364  if (strcmp(enc_name, "ISO-8859-1") == 0)
365  encoding = 0;
366  else if (strcmp(enc_name, "US-ASCII") == 0)
367  encoding = 1;
368  else if (strcmp(enc_name, "UCS-2LE") == 0 || strcmp(enc_name, "UTF-16LE") == 0)
369  encoding = 2;
370  else if (strcmp(enc_name, "UCS-2BE") == 0 || strcmp(enc_name, "UTF-16BE") == 0)
371  encoding = 3;
372  else if (strcmp(enc_name, "UCS-4LE") == 0)
373  encoding = 4;
374  else if (strcmp(enc_name, "UCS-4BE") == 0)
375  encoding = 5;
376  else if (strcmp(enc_name, "UTF-8") == 0)
377  encoding = 6;
378  else if (strcmp(enc_name, "CP1252") == 0)
379  encoding = 7;
380  else {
381  errno = EINVAL;
382  return (iconv_t)(-1);
383  }
384  encodings[i] = encoding;
385 
386  enc_name = tocode;
387  }
388 
389  fromto = (encodings[0] << 4) | (encodings[1] & 0x0F);
390 
391  /* like to like */
392  if (encodings[0] == encodings[1]) {
393  fromto = Like_to_Like;
394  }
395 
396  return (iconv_t) (TDS_INTPTR) fromto;
397 }
398 
399 int
401 {
402  return 0;
403 }
404 
405 size_t
406 tds_sys_iconv (iconv_t cd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft)
407 {
408  const unsigned char *ib;
409  unsigned char *ob;
410  size_t il, ol;
411  int local_errno;
412 
413 #undef CD
414 #define CD ((int) (TDS_INTPTR) cd)
415 
416  /* iconv defines valid semantics for NULL inputs, but we don't support them. */
417  if (!inbuf || !*inbuf || !inbytesleft || !outbuf || !*outbuf || !outbytesleft)
418  return 0;
419 
420  /*
421  * some optimizations
422  * - do not use errno directly only assign a time
423  * (some platform define errno as a complex macro)
424  * - some processors have few registers, deference and copy input variable
425  * (this make also compiler optimize more due to removed aliasing)
426  * also we use unsigned to remove required unsigned casts
427  */
428  local_errno = 0;
429  il = *inbytesleft;
430  ol = *outbytesleft;
431  ib = (const unsigned char*) *inbuf;
432  ob = (unsigned char*) *outbuf;
433 
434  if (CD == Like_to_Like) {
435  size_t copybytes = (il < ol)? il : ol;
436 
437  memcpy(ob, ib, copybytes);
438  ob += copybytes;
439  ol -= copybytes;
440  ib += copybytes;
441  il -= copybytes;
442  } else if (CD & ~0xff) {
443  local_errno = EINVAL;
444  } else {
445  iconv_get_t get_func = iconv_gets[(CD>>4) & 15];
446  iconv_put_t put_func = iconv_puts[ CD & 15];
447 
448  while (il) {
449  ICONV_CHAR out_c;
450  int readed = get_func(ib, il, &out_c), written;
451 
452  TDS_EXTRA_CHECK(assert(readed > 0 || readed == -EINVAL || readed == -EILSEQ));
453  if (TDS_UNLIKELY(readed < 0)) {
454  local_errno = -readed;
455  break;
456  }
457 
458  written = put_func(ob, ol, out_c);
459  TDS_EXTRA_CHECK(assert(written > 0 || written == -E2BIG || written == -EILSEQ));
460  if (TDS_UNLIKELY(written < 0)) {
461  local_errno = -written;
462  break;
463  }
464  il -= readed;
465  ib += readed;
466  ol -= written;
467  ob += written;
468  }
469  }
470 
471  /* back to source */
472  *inbytesleft = il;
473  *outbytesleft = ol;
474  *inbuf = (const char*) ib;
475  *outbuf = (char*) ob;
476 
477  if (il && !local_errno)
478  local_errno = E2BIG;
479 
480  if (local_errno) {
481  errno = local_errno;
482  return (size_t)(-1);
483  }
484 
485  return 0;
486 }
487 
488 
489 /** @} */
490 
491 #endif
#define UTF8_REJECT
Definition: bjoern-utf8.h:16
#define UTF8_ACCEPT
Definition: bjoern-utf8.h:15
static uint32_t decode_utf8(uint32_t *state, uint32_t *codep, uint32_t byte)
Definition: bjoern-utf8.h:21
std::ofstream out("events_result.xml")
main entry point for tests
#define TDS_GET_UA4LE(ptr)
Definition: bytes.h:69
#define TDS_PUT_UA2LE(ptr, val)
Definition: bytes.h:61
#define TDS_GET_UA4BE(ptr)
Definition: bytes.h:72
#define TDS_PUT_UA2BE(ptr, val)
Definition: bytes.h:63
#define TDS_GET_UA2BE(ptr)
Definition: bytes.h:57
#define TDS_GET_UA2LE(ptr)
Definition: bytes.h:56
#define TDS_PUT_UA4BE(ptr, val)
Definition: bytes.h:81
#define TDS_PUT_UA4LE(ptr, val)
Definition: bytes.h:78
void * iconv_t
Definition: iconv.h:28
#define EILSEQ
Definition: iconv.h:44
#define tdsdump_log
Definition: tds.h:1561
#define TDS_DBG_INFO1
Definition: tds.h:900
#define TDS_EXTRA_CHECK(stmt)
Definition: tds.h:392
tds_sysdep_intptr_type TDS_INTPTR
Definition: tds.h:155
#define TDS_UNLIKELY(x)
Definition: tds.h:372
#define CD
#define MASK(n)
static int put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:278
static int get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:251
static int get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:76
static int put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:94
static int get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:212
static int get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:318
static int put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:324
static int get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:269
static int get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:155
static int get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:173
static int put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:146
static int put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:164
static int put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:258
static int put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:233
static int get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:137
static int put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:194
uint32_t len
Definition: iconv.c:78
Uint4 uint32_t
static int get_cp1252(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:289
static int put_cp1252(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:299
static const iconv_get_t iconv_gets[16]
Definition: iconv.c:332
size_t tds_sys_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
Definition: iconv.c:396
static const iconv_put_t iconv_puts[16]
Definition: iconv.c:336
iconv_t tds_sys_iconv_open(const char *tocode, const char *fromcode)
Inputs are FreeTDS canonical names, no other.
Definition: iconv.c:337
int(* iconv_put_t)(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:324
TDS_UINT ICONV_CHAR
Definition: iconv.c:58
int tds_sys_iconv_close(iconv_t cd)
Definition: iconv.c:390
int(* iconv_get_t)(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:323
ICONV_CD_VALUE
Definition: iconv.c:54
@ Like_to_Like
Definition: iconv.c:55
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
#define CP1252_ALL
Definition: iconv_charsets.h:6
static const uint16_t cp1252_0080_00a0[]
char * buf
int i
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define assert(x)
Definition: srv_diag.hpp:58
uchar inbuf[1000000]
Definition: unzcrash.c:40
uchar outbuf[(1000000+1000000)]
Definition: unzcrash.c:41
Modified on Wed Sep 04 15:03:56 2024 by modify_doxy.py rev. 669887