NCBI C++ ToolKit
unicode.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL_UNICODE__H
2 #define UTIL_UNICODE__H
3 
4 /* $Id: unicode.hpp 37166 2008-03-10 14:43:58Z gouriano $
5  * ==========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ==========================================================================
28  *
29  * Author: Aleksey Vinokurov
30  *
31  * File Description:
32  * Unicode transformation library
33  *
34  */
35 
36 #include <corelib/ncbistd.hpp>
37 #include <string>
38 
39 
40 /** @addtogroup utf8
41  *
42  * @{
43  */
44 
45 
47 BEGIN_SCOPE(utf8)
48 
49 /// Types of substitutors.
51 {
52  eSkip = 0, ///< Unicode to be skipped in translation. Usually it is a combining mark.
53  eAsIs, ///< Unicodes which should go into the text as is.
54  eString, ///< String of symbols.
55  eException, ///< Throw exception (CUtilException, with type eWrongData)
56  //
57  eHTML, ///< HTML tag or, for example, HTML entity.
58  ePicture, ///< Path to the picture, or maybe picture itself.
59  eOther ///< Something else.
60 };
61 
63 {
66 };
67 
68 /// Structure to keep substitutions for a particular Unicode character.
69 typedef struct
70 {
71  const char* Subst; ///< Substitutor for unicode.
72  ESubstType Type; ///< Type of the substitutor.
74 
77 typedef unsigned int TUnicode;
78 
79 
80 /// Convert Unicode character into ASCII string.
81 ///
82 /// @param character
83 /// character to translate
84 /// @param table
85 /// Table to use in translation. If Table is not specified,
86 /// the internal default one will be used.
87 /// @return
88 /// Pointer to substitute structure
92  const SUnicodeTranslation* default_translation=NULL);
93 
94 /// Convert UTF8 into Unicode character.
95 ///
96 /// @param utf
97 /// Start of UTF8 character buffer
98 /// @param unicode
99 /// Pointer to Unicode character to store the result in
100 /// @return
101 /// Length of the translated UTF8 or 0 in case of error.
103 size_t UTF8ToUnicode(const char* utf, TUnicode* unicode);
104 
105 /// Convert Unicode character into UTF8.
106 ///
107 /// @param unicode
108 /// Unicode character
109 /// @param buffer
110 /// UTF8 buffer to store the result
111 /// @param buf_length
112 /// UTF8 buffer size
113 /// @return
114 /// Length of the generated UTF8 sequence
116 size_t UnicodeToUTF8(TUnicode unicode, char *buffer, size_t buf_length);
117 
118 /// Convert Unicode character into UTF8.
119 ///
120 /// @param unicode
121 /// Unicode character
122 /// @return
123 /// UTF8 buffer as a string
125 string UnicodeToUTF8(TUnicode unicode);
126 
127 /// Convert UTF8 into ASCII character buffer.
128 ///
129 /// Decode UTF8 buffer and substitute all Unicodes with appropriate
130 /// symbols or words from dictionary.
131 /// @param src
132 /// UTF8 buffer to decode
133 /// @param dst
134 /// Buffer to put the result in
135 /// @param dst_len
136 /// Length of the destination buffer
137 /// @param default_translation
138 /// Default translation of unknown Unicode symbols
139 /// @param table
140 /// Table to use in translation. If Table is not specified,
141 /// the internal default one will be used.
142 /// @param result
143 /// Result of the conversion
144 /// @return
145 /// Length of decoded string or -1 if buffer is too small
147 ssize_t UTF8ToAscii(const char* src, char* dst, size_t dst_len,
148  const SUnicodeTranslation* default_translation,
149  const TUnicodeTable* table=NULL,
151 
152 /// Convert UTF8 into ASCII string.
153 ///
154 /// Decode UTF8 buffer and substitute all Unicodes with appropriate
155 /// symbols or words from dictionary.
156 /// @param src
157 /// UTF8 buffer to decode
158 /// @param default_translation
159 /// Default translation of unknown Unicode symbols
160 /// @param table
161 /// Table to use in translation. If Table is not specified,
162 /// the internal default one will be used.
163 /// @param result
164 /// Result of the conversion
165 /// @return
166 /// String with decoded text
168 string UTF8ToAsciiString(const char* src,
169  const SUnicodeTranslation* default_translation,
170  const TUnicodeTable* table=NULL,
172 
173 
174 END_SCOPE(utf8)
176 
177 /* @} */
178 
179 #endif /* UTIL_UNICODE__H */
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
ESubstType Type
Type of the substitutor.
Definition: unicode.hpp:72
string UTF8ToAsciiString(const char *src, const SUnicodeTranslation *default_translation, const TUnicodeTable *table=NULL, EConversionResult *result=NULL)
Convert UTF8 into ASCII string.
Definition: unicode.cpp:526
SUnicodeTranslation TUnicodePlan[256]
Definition: unicode.hpp:75
size_t UnicodeToUTF8(TUnicode unicode, char *buffer, size_t buf_length)
Convert Unicode character into UTF8.
Definition: unicode.cpp:424
const SUnicodeTranslation * UnicodeToAscii(TUnicode character, const TUnicodeTable *table=NULL, const SUnicodeTranslation *default_translation=NULL)
Convert Unicode character into ASCII string.
Definition: unicode.cpp:324
const char * Subst
Substitutor for unicode.
Definition: unicode.hpp:71
size_t UTF8ToUnicode(const char *utf, TUnicode *unicode)
Convert UTF8 into Unicode character.
Definition: unicode.cpp:382
TUnicodePlan * TUnicodeTable[256]
Definition: unicode.hpp:76
ESubstType
Types of substitutors.
Definition: unicode.hpp:51
unsigned int TUnicode
Definition: unicode.hpp:77
EConversionResult
Definition: unicode.hpp:63
ssize_t UTF8ToAscii(const char *src, char *dst, size_t dst_len, const SUnicodeTranslation *default_translation, const TUnicodeTable *table=NULL, EConversionResult *result=NULL)
Convert UTF8 into ASCII character buffer.
Definition: unicode.cpp:458
@ eHTML
HTML tag or, for example, HTML entity.
Definition: unicode.hpp:57
@ eOther
Something else.
Definition: unicode.hpp:59
@ eSkip
Unicode to be skipped in translation. Usually it is a combining mark.
Definition: unicode.hpp:52
@ eException
Throw exception (CUtilException, with type eWrongData)
Definition: unicode.hpp:55
@ eString
String of symbols.
Definition: unicode.hpp:54
@ eAsIs
Unicodes which should go into the text as is.
Definition: unicode.hpp:53
@ ePicture
Path to the picture, or maybe picture itself.
Definition: unicode.hpp:58
@ eDefaultTranslationUsed
Definition: unicode.hpp:65
@ eConvertedFine
Definition: unicode.hpp:64
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
int ssize_t
Definition: ncbiconf_msvc.h:93
static BOOL utf
Definition: pcre2grep.c:291
static uint8_t * buffer
Definition: pcre2test.c:1016
NCBI_XUTIL_EXPORT
Parameter to control printing diagnostic message about conversion of static array data from a differe...
Definition: static_set.hpp:72
Structure to keep substitutions for a particular Unicode character.
Definition: unicode.hpp:70
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:57:01 2024 by modify_doxy.py rev. 669887