41 #define NCBI_USE_ERRCODE_X Util_Unicode
46 #include "unicode_plans.inl"
52 0, 0, 0, 0, 0, 0, 0, 0,
54 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0,
61 0, 0, 0, 0, 0, 0, 0, 0,
63 0, 0, 0, 0, 0, 0, 0, 0,
64 0, 0, 0, 0, 0, 0, 0, 0,
66 0, 0, 0, 0, 0, 0, 0, 0,
67 0, 0, 0, 0, 0, 0, 0, 0,
69 0, 0, 0, 0, 0, 0, 0, 0,
70 0, 0, 0, 0, 0, 0, 0, 0,
72 0, 0, 0, 0, 0, 0, 0, 0,
73 0, 0, 0, 0, 0, 0, 0, 0,
75 0, 0, 0, 0, 0, 0, 0, 0,
76 0, 0, 0, 0, 0, 0, 0, 0,
78 0, 0, 0, 0, 0, 0, 0, 0,
79 0, 0, 0, 0, 0, 0, 0, 0,
81 0, 0, 0, 0, 0, 0, 0, 0,
82 0, 0, 0, 0, 0, 0, 0, 0,
84 0, 0, 0, 0, 0, 0, 0, 0,
85 0, 0, 0, 0, 0, 0, 0, 0,
87 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 0, 0, 0, 0, 0, 0,
90 0, 0, 0, 0, 0, 0, 0, 0,
91 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0,
138 : m_initialized(
false), m_pool(0)
164 if (!ifs.is_open()) {
165 ERR_POST_X(1,
"UnicodeToAscii table not found: " << name);
168 LOG_POST_X(2,
Info <<
"Loading UnicodeToAscii table at: " << name);
172 size_t poolsize = filelen/2;
175 ERR_POST_X(3,
"UnicodeToAscii table failed to load: not enough memory");
189 if (poolpos + translation.size() + 1 > poolsize) {
190 m_pool = (
char*)realloc(
m_pool, poolsize += filelen/4);
192 ERR_POST_X(3,
"UnicodeToAscii table failed to load: not enough memory");
197 symbolToOffset[symbol] = poolpos;
198 memcpy(
m_pool+poolpos, translation.data(), translation.size());
199 poolpos += translation.size();
205 ERR_POST_X(1,
"UnicodeToAscii table is empty: " << name);
216 sym != symend; ++sym) {
227 string& line,
TUnicode& symbol,
string& translation)
234 string::size_type begin = line.find_first_not_of(
" \t", 0);
235 if (begin == string::npos) {
238 string::size_type end = line.find_first_of(
" \t,#",begin);
242 if (end == string::npos) {
250 if ( end == line.size() || line[end] ==
'#') {
254 end = line.find(
',',end);
255 if (end == string::npos) {
258 begin = line.find_first_not_of(
" \t", ++end);
259 if (begin == string::npos) {
262 if (*(line.data()+begin) !=
'\"') {
265 const char* data = line.data()+begin;
266 const char* dataend = line.data()+line.size();
267 for (++data; data < dataend; ++data) {
274 if (data < dataend) {
278 case 'a': c = 0x7;
break;
279 case 'b': c = 0x8;
break;
280 case 't': c = 0x9;
break;
281 case 'n': c = 0xA;
break;
282 case 'v': c = 0xB;
break;
283 case 'f': c = 0xC;
break;
284 case 'r': c = 0xD;
break;
285 case '0': c = 0x0;
break;
287 if (data + 1 < dataend) {
288 begin = data + 1 - line.data();
289 end = line.find_first_not_of(
"0123456789abcdefABCDEF", begin);
290 if (end == string::npos) {
294 data = line.data() + end;
299 if (data == dataend) {
303 translation.append(1,c);
329 if (
t.IsInitialized()) {
330 return t.GetTranslation(character);
335 if ((character & (~0xFFFF)) == 0) {
336 unsigned int thePlanNo = (character & 0xFF00) >> 8;
337 unsigned int theOffset = character & 0xFF;
340 translation = &((*thePlan)[theOffset]);
344 if (!default_translation) {
349 "UnicodeToAscii: unknown Unicode symbol");
351 translation = default_translation;
359 const char *p = theUTF;
362 if ( ((*theUTF) & 0xC0) != 0xC0 ) {
364 RC |= (
unsigned char)theUTF[0];
370 while ((counter =
Int1(counter << 1)) < 0) {
371 unsigned char c = *p++;
372 if ((c & ~077) != 0200) {
375 acc = (acc << 6) | (c & 077);
384 const char *p = theUTF;
387 if ( (
unsigned char)theUTF[0] < 0x80 ) {
389 *theUnicode = *theUTF;
393 if ( ((*theUTF) & 0xC0) != 0xC0 || ((*theUTF) & 0xFE) == 0xC0) {
399 if ( ((*theUTF) & 0xF8) == 0xF0 ) {
403 while ((counter =
Int1(counter << 1)) < 0) {
404 unsigned char c = *p++;
405 if ((c & ~077) != 0200) {
408 acc = (acc << 6) | (c & 077);
412 return (
size_t)(p - theUTF);
419 size_t theLength =
UnicodeToUTF8( theUnicode, theBuffer, 10 );
420 return string( theBuffer, theLength );
425 size_t theBufLength )
429 if (theUnicode < 0x80) {
431 if ( Length > theBufLength )
return 0;
432 theBuffer[0] = char(theUnicode);
434 else if (theUnicode < 0x800) {
436 if ( Length > theBufLength )
return 0;
437 theBuffer[0] = char( 0xC0 | (theUnicode>>6));
438 theBuffer[1] = char( 0x80 | (theUnicode & 0x3F));
440 else if (theUnicode < 0x10000) {
442 if ( Length > theBufLength )
return 0;
443 theBuffer[0] = char( 0xE0 | (theUnicode>>12));
444 theBuffer[1] = char( 0x80 | ((theUnicode>>6) & 0x3F));
445 theBuffer[2] = char( 0x80 | (theUnicode & 0x3F));
447 else if (theUnicode < 0x200000) {
449 if ( Length > theBufLength )
return 0;
450 theBuffer[0] = char( 0xF0 | (theUnicode>>18));
451 theBuffer[1] = char( 0x80 | ((theUnicode>>12) & 0x3F));
452 theBuffer[2] = char( 0x80 | ((theUnicode>>6) & 0x3F));
453 theBuffer[3] = char( 0x80 | (theUnicode & 0x3F));
466 if ( !src || !dst || dstLen == 0 )
return 0;
469 size_t srcLen = strlen( src );
471 for ( srcPos = 0; srcPos < srcLen; ) {
473 char* pDst = &(dst[dstPos]);
474 const char* pSrc = &(src[srcPos]);
490 if (
result && pSubst == default_translation) {
508 memcpy( pDst, pSrc, utfLen );
514 size_t substLen = strlen( pSubst->
Subst );
515 if ( (dstPos + substLen) > dstLen ) {
520 memcpy( pDst, pSubst->
Subst, substLen );
537 size_t srcLen = strlen( src );
539 for ( srcPos = 0; srcPos < srcLen; ) {
541 const char* pSrc = &(src[srcPos]);
557 if (
result && pSubst == default_translation) {
577 dst +=
string( pSrc, utfLen );
583 dst += pSubst->
Subst;
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
CUnicodeToAsciiTranslation(void)
void x_Initialize(const string &name)
bool IsInitialized(void) const
static int x_ParseLine(string &line, TUnicode &symbol, string &translation)
virtual ~CUnicodeToAsciiTranslation(void)
map< TUnicode, SUnicodeTranslation > m_SymbolToTranslation
const SUnicodeTranslation * GetTranslation(TUnicode symbol) const
container_type::const_iterator const_iterator
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
#define LOG_POST_X(err_subcode, message)
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Info(CExceptionArgs_Base &args)
Int8 GetLength(void) const
Get size of file.
#define NCBI_PARAM_TYPE(section, name)
Generate typename for a parameter from its {section, name} attributes.
int8_t Int1
1-byte (8-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
ESubstType Type
Type of the substitutor.
string UTF8ToAsciiString(const char *src, const SUnicodeTranslation *default_translation, const TUnicodeTable *table, EConversionResult *result)
Convert UTF8 into ASCII string.
SUnicodeTranslation TUnicodePlan[256]
string UnicodeToUTF8(TUnicode theUnicode)
Convert Unicode character into UTF8.
const SUnicodeTranslation * UnicodeToAscii(TUnicode character, const TUnicodeTable *table, const SUnicodeTranslation *default_translation)
Convert Unicode character into ASCII string.
const char * Subst
Substitutor for unicode.
TUnicodePlan * TUnicodeTable[256]
ssize_t UTF8ToAscii(const char *src, char *dst, size_t dstLen, const SUnicodeTranslation *default_translation, const TUnicodeTable *table, EConversionResult *result)
Convert UTF8 into ASCII character buffer.
@ eSkip
Unicode to be skipped in translation. Usually it is a combining mark.
@ eException
Throw exception (CUtilException, with type eWrongData)
@ eAsIs
Unicodes which should go into the text as is.
@ eDefaultTranslationUsed
Definition of all error codes used in util (xutil.lib).
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
Static variables safety - create on demand, destroy on application termination.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
Structure to keep substitutions for the particular unicode character.
CSafeStatic< CUnicodeToAsciiTranslation > g_UnicodeTranslation
NCBI_PARAM_DECL(string, NCBI, UnicodeToAscii)
static string s_FindUnicodeToAscii(void)
static TUnicodeTable g_DefaultUnicodeTable
TUnicode UTF8ToUnicode(const char *theUTF)
NCBI_PARAM_DEF_WITH_INIT(string, NCBI, UnicodeToAscii, "", s_FindUnicodeToAscii)
static TUnicodePlan s_Plan_26h
static TUnicodePlan s_Plan_E4h
static TUnicodePlan s_Plan_30h
static TUnicodePlan s_Plan_04h
static TUnicodePlan s_Plan_FEh
static TUnicodePlan s_Plan_E2h
static TUnicodePlan s_Plan_27h
static TUnicodePlan s_Plan_21h
static TUnicodePlan s_Plan_E5h
static TUnicodePlan s_Plan_E8h
static TUnicodePlan s_Plan_00h
static TUnicodePlan s_Plan_E6h
static TUnicodePlan s_Plan_22h
static TUnicodePlan s_Plan_20h
static TUnicodePlan s_Plan_24h
static TUnicodePlan s_Plan_25h
static TUnicodePlan s_Plan_01h
static TUnicodePlan s_Plan_EAh
static TUnicodePlan s_Plan_EBh
static TUnicodePlan s_Plan_FBh
static TUnicodePlan s_Plan_23h
static TUnicodePlan s_Plan_E0h
static TUnicodePlan s_Plan_E3h
static TUnicodePlan s_Plan_1Eh
static TUnicodePlan s_Plan_02h
static TUnicodePlan s_Plan_03h
static TUnicodePlan s_Plan_E7h
string g_FindDataFile(const CTempString &name, CDirEntry::EType type=CDirEntry::eFile)
Look for an NCBI application data file or directory of the given name and type; in general,...