NCBI C++ ToolKit
iconv.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* FreeTDS - Library of routines accessing Sybase and Microsoft databases
2  * Copyright (C) 2003, 2004 James K. Lowden, based on original work by Brian Bruns
3  * Copyright (C) 2011 Frediano Ziglio
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public
16  * License along with this library; if not, write to the
17  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18  * Boston, MA 02111-1307, USA.
19  */
20 
21 /**
22  * \file
23  * This file implements a very simple iconv.
24  * Its purpose is to allow ASCII clients to communicate with Microsoft servers
25  * that encode their metadata in Unicode (UTF-16).
26  *
27  * It supports ISO-8859-1, ASCII, UTF-16, UCS-4 and UTF-8
28  */
29 
30 #include <config.h>
31 
32 #if ! HAVE_ICONV
33 
34 #if HAVE_STRING_H
35 #include <string.h>
36 #endif /* HAVE_STRING_H */
37 #if HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40 
41 #include <assert.h>
42 #include <ctype.h>
43 
44 #include <freetds/tds.h>
45 #include <freetds/bytes.h>
46 #include <freetds/iconv.h>
47 
48 /**
49  * \addtogroup conv
50  * @{
51  */
52 
/**
 * Special conversion descriptor values.
 * Like_to_Like is returned by tds_sys_iconv_open when source and destination
 * encodings are the same; tds_sys_iconv then copies bytes verbatim.
 * 0x100 lies outside the 0x77 bits used by ordinary descriptors
 * ((from << 4) | to), so it cannot collide with them.
 */
enum ICONV_CD_VALUE
{
	Like_to_Like = 0x100
};
57 
59 
/**
 * Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * 0 marks a byte that cannot start a sequence:
 * - 0x80-0xBF are continuation bytes;
 * - 0xC0 and 0xC1 could only introduce overlong 2-byte encodings of
 *   characters below 0x80, which must be rejected;
 * - 0xFE and 0xFF never appear in UTF-8.
 * Lengths 5 and 6 are kept because put_utf8 can emit sequences that long.
 */
static const unsigned char utf8_lengths[256] = {
	/* 0x00-0x7F: single byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80-0xBF: continuation bytes, invalid as first byte */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0-0xDF: two bytes (0xC0/0xC1 rejected: overlong only) */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF: three bytes */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF: four, five and six bytes; 0xFE/0xFF invalid */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0,
};
78 
/**
 * Mask selecting the payload bits of the first byte of a UTF-8 sequence,
 * indexed by the sequence length in bytes.
 */
static const unsigned char utf8_masks[7] = {
	0x00,	/* length 0: not a valid sequence */
	0x7f,	/* length 1: 0xxxxxxx */
	0x1f,	/* length 2: 110xxxxx */
	0x0f,	/* length 3: 1110xxxx */
	0x07,	/* length 4: 11110xxx */
	0x03,	/* length 5: 111110xx */
	0x01	/* length 6: 1111110x */
};
82 
83 /*
84  * Return values for get_*:
85  * - >0 number of bytes read
86  * - -EINVAL not enough data to read
87  * - -EILSEQ invalid encoding detected
88  * Return values for put_*:
89  * - >0 number of bytes written
90  * - -E2BIG no space left in the output buffer
91  * - -EILSEQ character can't be encoded in the output charset
92  */
93 
94 static int
95 get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
96 {
97  ICONV_CHAR uc;
98  size_t l;
99 
100  l = utf8_lengths[p[0]];
101  if (TDS_UNLIKELY(l == 0))
102  return -EILSEQ;
103  if (TDS_UNLIKELY(len < l))
104  return -EINVAL;
105 
106  len = l;
107  uc = *p++ & utf8_masks[l];
108  while(--l)
109  uc = (uc << 6) | (*p++ & 0x3f);
110  *out = uc;
111  return (int)len;
112 }
113 
114 static int
115 put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
116 {
117 #define MASK(n) ((0xffffffffu << (n)) & 0xffffffffu)
118  int o_len;
119 
120  if ((c & MASK(7)) == 0) {
121  if (buf_len < 1)
122  return -E2BIG;
123  *buf = (unsigned char) c;
124  return 1;
125  }
126 
127  o_len = 2;
128  for (;;) {
129  if ((c & MASK(11)) == 0)
130  break;
131  ++o_len;
132  if ((c & MASK(16)) == 0)
133  break;
134  ++o_len;
135  if ((c & MASK(21)) == 0)
136  break;
137  ++o_len;
138  if ((c & MASK(26)) == 0)
139  break;
140  ++o_len;
141  if ((c & MASK(31)) != 0)
142  return -EILSEQ;
143  }
144 
145  if (buf_len < o_len)
146  return -E2BIG;
147  buf += o_len;
148  buf_len = o_len - 1;
149  do {
150  *--buf = 0x80 | (c & 0x3f);
151  c >>= 6;
152  } while (--buf_len);
153  *--buf = (0xff00u >> o_len) | c;
154  return o_len;
155 }
156 
157 static int
158 get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
159 {
160  if (len < 4)
161  return -EINVAL;
162  *out = TDS_GET_A4LE(p);
163  return 4;
164 }
165 
166 static int
167 put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
168 {
169  if (buf_len < 4)
170  return -E2BIG;
171  TDS_PUT_A4LE(buf, c);
172  return 4;
173 }
174 
175 static int
176 get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
177 {
178  if (len < 4)
179  return -EINVAL;
180  *out = TDS_GET_A4BE(p);
181  return 4;
182 }
183 
184 static int
185 put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
186 {
187  if (buf_len < 4)
188  return -E2BIG;
189  TDS_PUT_A4BE(buf, c);
190  return 4;
191 }
192 
193 static int
194 get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
195 {
196  ICONV_CHAR c, c2;
197 
198  if (len < 2)
199  return -EINVAL;
200  c = TDS_GET_A2LE(p);
201  if ((c & 0xfc00) == 0xd800) {
202  if (len < 4)
203  return -EINVAL;
204  c2 = TDS_GET_A2LE(p+2);
205  if ((c2 & 0xfc00) != 0xdc00)
206  return -EILSEQ;
207  *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
208  return 4;
209  }
210  *out = c;
211  return 2;
212 }
213 
214 static int
215 put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
216 {
217  if (c >= 0x110000u)
218  return -EILSEQ;
219  if (c < 0x10000u) {
220  if (buf_len < 2)
221  return -E2BIG;
222  TDS_PUT_A2LE(buf, c);
223  return 2;
224  }
225  if (buf_len < 4)
226  return -E2BIG;
227  c -= 0x10000u;
228  TDS_PUT_A2LE(buf, 0xd800 + (c >> 10));
229  TDS_PUT_A2LE(buf+2, 0xdc00 + (c & 0x3ffu));
230  return 4;
231 }
232 
233 static int
234 get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
235 {
236  ICONV_CHAR c, c2;
237 
238  if (len < 2)
239  return -EINVAL;
240  c = TDS_GET_A2BE(p);
241  if ((c & 0xfc00) == 0xd800) {
242  if (len < 4)
243  return -EINVAL;
244  c2 = TDS_GET_A2BE(p+2);
245  if ((c2 & 0xfc00) != 0xdc00)
246  return -EILSEQ;
247  *out = (c << 10) + c2 - ((0xd800 << 10) + 0xdc00 - 0x10000);
248  return 4;
249  }
250  *out = c;
251  return 2;
252 }
253 
254 static int
255 put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
256 {
257  if (c >= 0x110000u)
258  return -EILSEQ;
259  if (c < 0x10000u) {
260  if (buf_len < 2)
261  return -E2BIG;
262  TDS_PUT_A2BE(buf, c);
263  return 2;
264  }
265  if (buf_len < 4)
266  return -E2BIG;
267  c -= 0x10000u;
268  TDS_PUT_A2BE(buf, 0xd800 + (c >> 10));
269  TDS_PUT_A2BE(buf+2, 0xdc00 + (c & 0x3ffu));
270  return 4;
271 }
272 
273 static int
274 get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
275 {
276  *out = p[0];
277  return 1;
278 }
279 
280 static int
281 put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
282 {
283  if (c >= 0x100u)
284  return -EILSEQ;
285  if (buf_len < 1)
286  return -E2BIG;
287  buf[0] = (unsigned char) c;
288  return 1;
289 }
290 
291 static int
292 get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
293 {
294  if (p[0] >= 0x80)
295  return -EILSEQ;
296  *out = p[0];
297  return 1;
298 }
299 
300 static int
301 put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
302 {
303  if (c >= 0x80u)
304  return -EILSEQ;
305  if (buf_len < 1)
306  return -E2BIG;
307  buf[0] = (unsigned char) c;
308  return 1;
309 }
310 
311 static int
312 get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
313 {
314  return -EILSEQ;
315 }
316 
317 static int
318 put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
319 {
320  return -EILSEQ;
321 }
322 
323 typedef int (*iconv_get_t)(const unsigned char *p, size_t len, ICONV_CHAR *out);
324 typedef int (*iconv_put_t)(unsigned char *buf, size_t buf_len, ICONV_CHAR c);
325 
326 static const iconv_get_t iconv_gets[8] = {
328 };
329 static const iconv_put_t iconv_puts[8] = {
331 };
332 
333 /**
334  * Inputs are FreeTDS canonical names, no other. No alias list is consulted.
335  */
336 iconv_t
337 tds_sys_iconv_open (const char* tocode, const char* fromcode)
338 {
339  int i;
340  unsigned int fromto;
341  const char *enc_name;
342  unsigned char encodings[2];
343 
344  static char first_time = 1;
345 
346  if (TDS_UNLIKELY(first_time)) {
347  first_time = 0;
348  tdsdump_log(TDS_DBG_INFO1, "Using trivial iconv\n");
349  }
350 
351  /* match both inputs to our canonical names */
352  enc_name = fromcode;
353  for (i=0; i < 2; ++i) {
354  unsigned char encoding;
355 
356  if (strcmp(enc_name, "ISO-8859-1") == 0)
357  encoding = 0;
358  else if (strcmp(enc_name, "US-ASCII") == 0)
359  encoding = 1;
360  else if (strcmp(enc_name, "UCS-2LE") == 0 || strcmp(enc_name, "UTF-16LE") == 0)
361  encoding = 2;
362  else if (strcmp(enc_name, "UCS-2BE") == 0 || strcmp(enc_name, "UTF-16BE") == 0)
363  encoding = 3;
364  else if (strcmp(enc_name, "UCS-4LE") == 0)
365  encoding = 4;
366  else if (strcmp(enc_name, "UCS-4BE") == 0)
367  encoding = 5;
368  else if (strcmp(enc_name, "UTF-8") == 0)
369  encoding = 6;
370  else {
371  errno = EINVAL;
372  return (iconv_t)(-1);
373  }
374  encodings[i] = encoding;
375 
376  enc_name = tocode;
377  }
378 
379  fromto = (encodings[0] << 4) | (encodings[1] & 0x0F);
380 
381  /* like to like */
382  if (encodings[0] == encodings[1]) {
383  fromto = Like_to_Like;
384  }
385 
386  return (iconv_t) (TDS_INTPTR) fromto;
387 }
388 
389 int
391 {
392  return 0;
393 }
394 
395 size_t
396 tds_sys_iconv (iconv_t cd, const char* * inbuf, size_t *inbytesleft, char* * outbuf, size_t *outbytesleft)
397 {
398  const unsigned char *ib;
399  unsigned char *ob;
400  size_t il, ol;
401  int local_errno;
402 
403 #undef CD
404 #define CD ((int) (TDS_INTPTR) cd)
405 
406  /* iconv defines valid semantics for NULL inputs, but we don't support them. */
407  if (!inbuf || !*inbuf || !inbytesleft || !outbuf || !*outbuf || !outbytesleft)
408  return 0;
409 
410  /*
411  * some optimizations
412  * - do not use errno directly only assign a time
413  * (some platform define errno as a complex macro)
414  * - some processors have few registers, deference and copy input variable
415  * (this make also compiler optimize more due to removed aliasing)
416  * also we use unsigned to remove required unsigned casts
417  */
418  local_errno = 0;
419  il = *inbytesleft;
420  ol = *outbytesleft;
421  ib = (const unsigned char*) *inbuf;
422  ob = (unsigned char*) *outbuf;
423 
424  if (CD == Like_to_Like) {
425  size_t copybytes = (il < ol)? il : ol;
426 
427  memcpy(ob, ib, copybytes);
428  ob += copybytes;
429  ol -= copybytes;
430  ib += copybytes;
431  il -= copybytes;
432  } else if (CD & ~0x77) {
433  local_errno = EINVAL;
434  } else {
435  iconv_get_t get_func = iconv_gets[(CD>>4) & 7];
436  iconv_put_t put_func = iconv_puts[ CD & 7];
437 
438  while (il) {
439  ICONV_CHAR out_c;
440  int readed = get_func(ib, il, &out_c), written;
441 
442  TDS_EXTRA_CHECK(assert(readed > 0 || readed == -EINVAL || readed == -EILSEQ));
443  if (TDS_UNLIKELY(readed < 0)) {
444  local_errno = -readed;
445  break;
446  }
447 
448  written = put_func(ob, ol, out_c);
449  TDS_EXTRA_CHECK(assert(written > 0 || written == -E2BIG || written == -EILSEQ));
450  if (TDS_UNLIKELY(written < 0)) {
451  local_errno = -written;
452  break;
453  }
454  il -= readed;
455  ib += readed;
456  ol -= written;
457  ob += written;
458  }
459  }
460 
461  /* back to source */
462  *inbytesleft = il;
463  *outbytesleft = ol;
464  *inbuf = (const char*) ib;
465  *outbuf = (char*) ob;
466 
467  if (il && !local_errno)
468  local_errno = E2BIG;
469 
470  if (local_errno) {
471  errno = local_errno;
472  return (size_t)(-1);
473  }
474 
475  return 0;
476 }
477 
478 
479 /** @} */
480 
481 #endif
std::ofstream out("events_result.xml")
main entry point for tests
#define TDS_PUT_A4LE(ptr, val)
Definition: bytes.h:152
#define TDS_GET_A2BE(ptr)
Definition: bytes.h:59
#define TDS_GET_A2LE(ptr)
Definition: bytes.h:140
#define TDS_GET_A4LE(ptr)
Definition: bytes.h:141
#define TDS_GET_A4BE(ptr)
Definition: bytes.h:76
#define TDS_PUT_A2LE(ptr, val)
Definition: bytes.h:151
#define TDS_PUT_A2BE(ptr, val)
Definition: bytes.h:66
#define TDS_PUT_A4BE(ptr, val)
Definition: bytes.h:85
void * iconv_t
Definition: iconv.h:28
#define EILSEQ
Definition: iconv.h:44
#define tdsdump_log
Definition: tds.h:1561
#define TDS_DBG_INFO1
Definition: tds.h:900
#define TDS_EXTRA_CHECK(stmt)
Definition: tds.h:392
tds_sysdep_intptr_type TDS_INTPTR
Definition: tds.h:155
#define TDS_UNLIKELY(x)
Definition: tds.h:372
tds_sysdep_uint32_type TDS_UINT
Definition: tds.h:150
#define CD
#define MASK(n)
uint32_t len
Definition: iconv.c:78
static int put_ascii(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:301
static int get_iso1(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:274
static int get_utf8(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:95
static int put_utf8(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:115
static int get_utf16be(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:234
static const iconv_get_t iconv_gets[8]
Definition: iconv.c:326
static int get_err(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:312
size_t tds_sys_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
Definition: iconv.c:396
static const unsigned char utf8_masks[7]
Definition: iconv.c:79
static const unsigned char utf8_lengths[256]
Definition: iconv.c:60
static int put_err(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:318
static int get_ascii(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:292
iconv_t tds_sys_iconv_open(const char *tocode, const char *fromcode)
Inputs are FreeTDS canonical names, no other.
Definition: iconv.c:337
static int get_ucs4be(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:176
int(* iconv_put_t)(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:324
TDS_UINT ICONV_CHAR
Definition: iconv.c:58
static int get_utf16le(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:194
static int put_ucs4le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:167
static int put_ucs4be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:185
static int put_iso1(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:281
static int put_utf16be(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:255
static const iconv_put_t iconv_puts[8]
Definition: iconv.c:329
static int get_ucs4le(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:158
int tds_sys_iconv_close(iconv_t cd)
Definition: iconv.c:390
static int put_utf16le(unsigned char *buf, size_t buf_len, ICONV_CHAR c)
Definition: iconv.c:215
int(* iconv_get_t)(const unsigned char *p, size_t len, ICONV_CHAR *out)
Definition: iconv.c:323
ICONV_CD_VALUE
Definition: iconv.c:54
@ Like_to_Like
Definition: iconv.c:55
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
char * buf
int i
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define assert(x)
Definition: srv_diag.hpp:58
uchar inbuf[1000000]
Definition: unzcrash.c:40
uchar outbuf[(1000000+1000000)]
Definition: unzcrash.c:41
Modified on Wed Sep 04 15:02:25 2024 by modify_doxy.py rev. 669887