NCBI C++ ToolKit
htmlhelper.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: htmlhelper.cpp 94625 2021-08-24 15:24:27Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Eugene Vasilchenko, Vladimir Ivanov
27  *
28  */
29 
30 
31 #include <ncbi_pch.hpp>
32 #include <html/html.hpp>
33 #include <html/error_codes.hpp>
34 #include <corelib/ncbisys.hpp>
35 
36 
37 
38 #define NCBI_USE_ERRCODE_X Html_Lib
39 
41 
42 
43 // CHTMLHelper
44 
45 const char* kDefaultNL = "\n";
47 
48 
49 void CHTMLHelper::SetNL(const string& nl)
50 {
51  if (sm_newline != kDefaultNL) {
52  free((char*)sm_newline);
53  }
54  sm_newline = NcbiSysChar_strdup(nl.c_str());
55 }
56 
57 
// Encode the characters of `str` that occur in `set`, writing the result
// to the stream `out`:
//   '"' -> "&quot;",  '<' -> "&lt;",  '>' -> "&gt;",
//   '&' -> "&amp;", unless the text that follows is recognized as an
//          already encoded entity ("&name;" or "&#digits;"), in which case
//          only the '&' itself is written and the rest is copied as is.
// A character of `set` that has no `case` below is dropped from the output,
// because the plain-text copy always resumes at ptr + 1.
//
// NOTE(review): this extract lacks the listing lines with inner numbers
// 59, 61, 83, 93, 104, 117 and 142 -- presumably the third parameter
// (`flags`), the declaration of `out`, the flag tests that open the
// otherwise unbalanced conditions/braces below, and the return statement.
// Confirm against the repository copy before editing the code.
58 static string s_HTMLEncode(const string& str, const string& set,
60 {
62 
63  SIZE_TYPE last = 0;
// Position of the next ';' found so far; NPOS once no ';' remains, which
// disables further entity checks.
64  SIZE_TYPE semicolon = 0;
65 
66  // Find first symbol to encode.
67  SIZE_TYPE ptr = str.find_first_of(set, last);
68  while ( ptr != NPOS ) {
69  // Copy plain part of the input string
70  if ( ptr != last ) {
71  out.write(str.data() + last, ptr - last);
72  }
73  // Append encoded symbol
74  switch (str[ptr]) {
75  case '"':
76  out << "&quot;";
77  break;
78  case '&':
79  {{
// The '&' is always emitted; "amp;" is appended afterwards only when the
// following text is not recognized as an entity.
80  out.put('&');
81  bool is_entity = false;
82  // Check on HTML entity
84  (ptr+2 < str.length()) &&
85  (semicolon != NPOS)) {
// Search for a new ';' only when the cached one lies at or before ptr.
86  if ( ptr >= semicolon )
87  semicolon = str.find(";", ptr+1);
88  if ( semicolon != NPOS ) {
89  SIZE_TYPE len = semicolon - ptr;
90  SIZE_TYPE p = ptr + 1;
91  if (str[ptr+1] == '#') {
92  // Check on numeric character reference encoding
94  p++;
// NOTE(review): `len` is semicolon - ptr and therefore non-zero here, so
// `len || len <= 4` is always true and the "<= 4" bound never applies.
// The literal-entity branch below uses `&&`; confirm the intended limit
// before changing the operator ("&#160;" already has len == 5).
95  if (len || len <= 4) {
96  for (; p < semicolon; ++p) {
97  if (!isdigit((unsigned char)(str[p])))
98  break;
99  }
100  }
101  }
102  } else {
103  // Check on literal entity
105  if (len && len <= 10) {
106  for (; p < semicolon; ++p) {
107  if (!isalpha((unsigned char)(str[p])))
108  break;
109  }
110  }
111  }
112  }
// The scan reached the ';' only if every character in between passed the
// digit/letter test.
113  is_entity = (p == semicolon);
114  }
115  }
116  if ( is_entity ) {
118  ERR_POST_X_ONCE(2, Info << "string \"" << str <<
119  "\" contains HTML encoded entities");
120  }
121  } else {
122  out << "amp;";
123  }
124  }}
125  break;
126 
127  case '<':
128  out << "&lt;";
129  break;
130  case '>':
131  out << "&gt;";
132  break;
133  }
134  // Find next symbol to encode
135  last = ptr + 1;
136  ptr = str.find_first_of(set, last);
137  }
138  // Append last part of the source string
139  if ( last != str.size() ) {
140  out.write(str.data() + last, str.size() - last);
141  }
143 }
144 
145 
// NOTE(review): the signature line (inner number 146) is missing from this
// extract. The body forwards `str` and `flags` to s_HTMLEncode() with the
// character set: double quote, '&', '<' and '>'.
147 {
148  return s_HTMLEncode(str, "\"&<>", flags);
149 }
150 
151 
// NOTE(review): the line carrying the function name and parameters (inner
// number 153) is missing from this extract. The body forwards `str` and
// `flags` to s_HTMLEncode() with a narrower character set: only the double
// quote and '&' are encoded; '<' and '>' are left untouched.
152 string
154 {
155  return s_HTMLEncode(str, "\"&", flags);
156 }
157 
158 
159 string CHTMLHelper::StripTags(const string& str)
160 {
161  SIZE_TYPE pos = 0;
162  string s(str);
163 
164  // First, strip comments
165  while ( (pos = s.find("<!--", pos)) != NPOS ) {
166  SIZE_TYPE pos_end = s.find("-->", pos + 1);
167  if ( pos_end == NPOS ) {
168  break;
169  }
170  s.erase(pos, pos_end - pos + 3);
171  pos++;
172  }
173  // Next, strip mapping tags <@...@>
174  while ( (pos = s.find("<@", pos)) != NPOS ) {
175  SIZE_TYPE pos_end = s.find("@>", pos + 1);
176  if ( pos_end == NPOS ) {
177  break;
178  }
179  s.erase(pos, pos_end - pos + 2);
180  pos++;
181  }
182  // Now, strip balanced "<..>"
183  pos =0;
184  while ( (pos = s.find("<", pos)) != NPOS ) {
185  SIZE_TYPE pos_end = s.find(">", pos + 1);
186  if ( pos_end == NPOS ) {
187  break;
188  }
189  if (pos < s.size() &&
190  (isalpha((unsigned char) s[pos + 1]) || s[pos + 1] == '/' )) {
191  s.erase(pos, pos_end - pos + 1);
192  } else {
193  pos++;
194  }
195  }
196  return s;
197 }
198 
199 
200 string CHTMLHelper::StripSpecialChars(const string& str)
201 {
202  SIZE_TYPE pos = 0;
203  string s(str);
204 
205  // Strip named and numeric character entities "&[#]...;"
206  while ( (pos = s.find("&", pos)) != NPOS ) {
207  SIZE_TYPE pos_end = s.find(";", pos + 1);
208  if ( pos_end == NPOS ) {
209  break;
210  }
211  if ( (pos_end - pos) > 2 && (pos_end - pos) < 8 ) {
212  int (*check)(int c);
213  SIZE_TYPE start = pos + 1;
214  if ( s[start] == '#') {
215  check = &::isdigit;
216  start++;
217  } else {
218  check = &::isalpha;
219  }
220  bool need_delete = true;
221  for (SIZE_TYPE i = start; i < pos_end; i++ ) {
222  if ( !check((int)s[i]) ) {
223  need_delete = false;
224  break;
225  }
226  }
227  if ( need_delete ) {
228  s.erase(pos, pos_end - pos + 1);
229  }
230  }
231  pos++;
232  }
233  return s;
234 }
235 
236 // Character entity references
237 // http://www.w3.org/TR/html4/sgml/entities.html
238 // http://www.w3.org/TR/1998/REC-html40-19980424/charset.html#h-5.3
239 // http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
240 
241 static struct tag_HtmlEntities
242 {
244  const char* s;
245 
246 } const s_HtmlEntities[] =
247 {
248  { 34, "quot" },
249  { 38, "amp" },
250  { 39, "apos" },
251  { 60, "lt" },
252  { 62, "gt" },
253  { 160, "nbsp" },
254  { 161, "iexcl" },
255  { 162, "cent" },
256  { 163, "pound" },
257  { 164, "curren" },
258  { 165, "yen" },
259  { 166, "brvbar" },
260  { 167, "sect" },
261  { 168, "uml" },
262  { 169, "copy" },
263  { 170, "ordf" },
264  { 171, "laquo" },
265  { 172, "not" },
266  { 173, "shy" },
267  { 174, "reg" },
268  { 175, "macr" },
269  { 176, "deg" },
270  { 177, "plusmn" },
271  { 178, "sup2" },
272  { 179, "sup3" },
273  { 180, "acute" },
274  { 181, "micro" },
275  { 182, "para" },
276  { 183, "middot" },
277  { 184, "cedil" },
278  { 185, "sup1" },
279  { 186, "ordm" },
280  { 187, "raquo" },
281  { 188, "frac14" },
282  { 189, "frac12" },
283  { 190, "frac34" },
284  { 191, "iquest" },
285  { 192, "Agrave" },
286  { 193, "Aacute" },
287  { 194, "Acirc" },
288  { 195, "Atilde" },
289  { 196, "Auml" },
290  { 197, "Aring" },
291  { 198, "AElig" },
292  { 199, "Ccedil" },
293  { 200, "Egrave" },
294  { 201, "Eacute" },
295  { 202, "Ecirc" },
296  { 203, "Euml" },
297  { 204, "Igrave" },
298  { 205, "Iacute" },
299  { 206, "Icirc" },
300  { 207, "Iuml" },
301  { 208, "ETH" },
302  { 209, "Ntilde" },
303  { 210, "Ograve" },
304  { 211, "Oacute" },
305  { 212, "Ocirc" },
306  { 213, "Otilde" },
307  { 214, "Ouml" },
308  { 215, "times" },
309  { 216, "Oslash" },
310  { 217, "Ugrave" },
311  { 218, "Uacute" },
312  { 219, "Ucirc" },
313  { 220, "Uuml" },
314  { 221, "Yacute" },
315  { 222, "THORN" },
316  { 223, "szlig" },
317  { 224, "agrave" },
318  { 225, "aacute" },
319  { 226, "acirc" },
320  { 227, "atilde" },
321  { 228, "auml" },
322  { 229, "aring" },
323  { 230, "aelig" },
324  { 231, "ccedil" },
325  { 232, "egrave" },
326  { 233, "eacute" },
327  { 234, "ecirc" },
328  { 235, "euml" },
329  { 236, "igrave" },
330  { 237, "iacute" },
331  { 238, "icirc" },
332  { 239, "iuml" },
333  { 240, "eth" },
334  { 241, "ntilde" },
335  { 242, "ograve" },
336  { 243, "oacute" },
337  { 244, "ocirc" },
338  { 245, "otilde" },
339  { 246, "ouml" },
340  { 247, "divide" },
341  { 248, "oslash" },
342  { 249, "ugrave" },
343  { 250, "uacute" },
344  { 251, "ucirc" },
345  { 252, "uuml" },
346  { 253, "yacute" },
347  { 254, "thorn" },
348  { 255, "yuml" },
349  { 338, "OElig" },
350  { 339, "oelig" },
351  { 352, "Scaron" },
352  { 353, "scaron" },
353  { 376, "Yuml" },
354  { 402, "fnof" },
355  { 710, "circ" },
356  { 732, "tilde" },
357  { 913, "Alpha" },
358  { 914, "Beta" },
359  { 915, "Gamma" },
360  { 916, "Delta" },
361  { 917, "Epsilon" },
362  { 918, "Zeta" },
363  { 919, "Eta" },
364  { 920, "Theta" },
365  { 921, "Iota" },
366  { 922, "Kappa" },
367  { 923, "Lambda" },
368  { 924, "Mu" },
369  { 925, "Nu" },
370  { 926, "Xi" },
371  { 927, "Omicron" },
372  { 928, "Pi" },
373  { 929, "Rho" },
374  { 931, "Sigma" },
375  { 932, "Tau" },
376  { 933, "Upsilon" },
377  { 934, "Phi" },
378  { 935, "Chi" },
379  { 936, "Psi" },
380  { 937, "Omega" },
381  { 945, "alpha" },
382  { 946, "beta" },
383  { 947, "gamma" },
384  { 948, "delta" },
385  { 949, "epsilon" },
386  { 950, "zeta" },
387  { 951, "eta" },
388  { 952, "theta" },
389  { 953, "iota" },
390  { 954, "kappa" },
391  { 955, "lambda" },
392  { 956, "mu" },
393  { 957, "nu" },
394  { 958, "xi" },
395  { 959, "omicron" },
396  { 960, "pi" },
397  { 961, "rho" },
398  { 962, "sigmaf" },
399  { 963, "sigma" },
400  { 964, "tau" },
401  { 965, "upsilon" },
402  { 966, "phi" },
403  { 967, "chi" },
404  { 968, "psi" },
405  { 969, "omega" },
406  { 977, "thetasym" },
407  { 978, "upsih" },
408  { 982, "piv" },
409  { 8194, "ensp" },
410  { 8195, "emsp" },
411  { 8201, "thinsp" },
412  { 8204, "zwnj" },
413  { 8205, "zwj" },
414  { 8206, "lrm" },
415  { 8207, "rlm" },
416  { 8211, "ndash" },
417  { 8212, "mdash" },
418  { 8216, "lsquo" },
419  { 8217, "rsquo" },
420  { 8218, "sbquo" },
421  { 8220, "ldquo" },
422  { 8221, "rdquo" },
423  { 8222, "bdquo" },
424  { 8224, "dagger" },
425  { 8225, "Dagger" },
426  { 8226, "bull" },
427  { 8230, "hellip" },
428  { 8240, "permil" },
429  { 8242, "prime" },
430  { 8243, "Prime" },
431  { 8249, "lsaquo" },
432  { 8250, "rsaquo" },
433  { 8254, "oline" },
434  { 8260, "frasl" },
435  { 8364, "euro" },
436  { 8465, "image" },
437  { 8472, "weierp" },
438  { 8476, "real" },
439  { 8482, "trade" },
440  { 8501, "alefsym" },
441  { 8592, "larr" },
442  { 8593, "uarr" },
443  { 8594, "rarr" },
444  { 8595, "darr" },
445  { 8596, "harr" },
446  { 8629, "crarr" },
447  { 8656, "lArr" },
448  { 8657, "uArr" },
449  { 8658, "rArr" },
450  { 8659, "dArr" },
451  { 8660, "hArr" },
452  { 8704, "forall" },
453  { 8706, "part" },
454  { 8707, "exist" },
455  { 8709, "empty" },
456  { 8711, "nabla" },
457  { 8712, "isin" },
458  { 8713, "notin" },
459  { 8715, "ni" },
460  { 8719, "prod" },
461  { 8721, "sum" },
462  { 8722, "minus" },
463  { 8727, "lowast" },
464  { 8730, "radic" },
465  { 8733, "prop" },
466  { 8734, "infin" },
467  { 8736, "ang" },
468  { 8743, "and" },
469  { 8744, "or" },
470  { 8745, "cap" },
471  { 8746, "cup" },
472  { 8747, "int" },
473  { 8756, "there4" },
474  { 8764, "sim" },
475  { 8773, "cong" },
476  { 8776, "asymp" },
477  { 8800, "ne" },
478  { 8801, "equiv" },
479  { 8804, "le" },
480  { 8805, "ge" },
481  { 8834, "sub" },
482  { 8835, "sup" },
483  { 8836, "nsub" },
484  { 8838, "sube" },
485  { 8839, "supe" },
486  { 8853, "oplus" },
487  { 8855, "otimes" },
488  { 8869, "perp" },
489  { 8901, "sdot" },
490  { 8968, "lceil" },
491  { 8969, "rceil" },
492  { 8970, "lfloor" },
493  { 8971, "rfloor" },
494  { 9001, "lang" },
495  { 9002, "rang" },
496  { 9674, "loz" },
497  { 9824, "spades" },
498  { 9827, "clubs" },
499  { 9829, "hearts" },
500  { 9830, "diams" },
501  { 0, 0 }
502 };
503 
504 
// Decode HTML character references in `str` and build the result in `ustr`
// (a CStringUTF8):
//   "&name;"   - looked up by exact name in s_HtmlEntities,
//   "&#NNN;"   - decimal numeric reference,
//   "&#xHHH;"  - hexadecimal numeric reference.
// Text that is not recognized as a reference is appended unchanged when the
// encoding is UTF-8 or ASCII, and converted with CUtf8::AsUTF8() otherwise.
// If `encoding` is eEncoding_Unknown it is guessed from `str`; a
// CStringException (eBadArgs) is thrown when the guess fails.
// When `result_flags` is non-null, the accumulated `result` is stored there.
//
// NOTE(review): this extract lacks the listing lines with inner numbers
// 505, 509, 522, 561 and 567 -- presumably the first line of the signature,
// the declarations of `result` and `uch`, and two statements in the
// entity / numeric branches. Confirm against the repository copy before
// editing the code.
506  THTMLDecodeFlags* result_flags)
507 {
508  CStringUTF8 ustr;
510  if (encoding == eEncoding_Unknown) {
511  encoding = CUtf8::GuessEncoding(str);
512  if (encoding == eEncoding_Unknown) {
513  NCBI_THROW2(CStringException, eBadArgs,
514  "Unable to guess the source string encoding", 0);
515  }
516  }
517  // wild guess...
518  ustr.reserve(str.size());
519 
520  string::const_iterator i, e = str.end();
521  char ch;
523 
524  for (i = str.begin(); i != e;) {
525  ch = *(i++);
526  //check for HTML entities and character references
527  if (i != e && ch == '&') {
528  string::const_iterator itmp, end_of_entity, start_of_entity;
529  itmp = end_of_entity = start_of_entity = i;
530  bool ent, dec, hex, parsed=false;
// Classify the candidate by its first character(s): a letter starts a named
// entity, "#<digit>" a decimal reference, "#x<hexdigit>" a hex reference.
531  ent = isalpha((unsigned char)(*itmp)) != 0;
532  dec = !ent && *itmp == '#' && ++itmp != e &&
533  isdigit((unsigned char)(*itmp)) != 0;
// NOTE(review): the hex probe is not guarded by `!ent`. For a named entity
// whose first letter is 'x' or 'X' the `++itmp` below still executes, so
// start_of_entity ends up one character too far and the table lookup uses
// the name without its first letter ("&xi;" / "&Xi;" are looked up as "i"
// and are therefore never decoded). Likewise "&x41;" is probed as hex
// although no '#' precedes the 'x'. Confirm and fix in the repository copy.
534  hex = !dec && itmp != e &&
535  (*itmp == 'x' || *itmp == 'X') && ++itmp != e &&
536  isxdigit((unsigned char)(*itmp)) != 0;
537  start_of_entity = itmp;
538  if (itmp != e && (ent || dec || hex)) {
539  // do not look too far
// Scan at most 16 characters for the terminating ';'. A second '&' or a
// '#' aborts the scan; end_of_entity stays equal to `i` when no ';' is
// found, which is what the test after the loop relies on.
540  for (int len=0; len<16 && itmp != e; ++len, ++itmp) {
541  if (*itmp == '&' || *itmp == '#') {
542  break;
543  }
544  if (*itmp == ';') {
545  end_of_entity = itmp;
546  break;
547  }
548  ent = ent && isalnum( (unsigned char)(*itmp)) != 0;
549  dec = dec && isdigit( (unsigned char)(*itmp)) != 0;
550  hex = hex && isxdigit((unsigned char)(*itmp)) != 0;
551  }
552  if (end_of_entity != i && (ent || dec || hex)) {
553  uch = 0;
554  if (ent) {
// Named entity: linear search of the table, which is terminated by the
// { 0, 0 } sentinel. An unknown name leaves `parsed` false, so the text is
// emitted unchanged.
555  string entity(start_of_entity,end_of_entity);
556  const struct tag_HtmlEntities* p = s_HtmlEntities;
557  for ( ; p->u != 0; ++p) {
558  if (entity.compare(p->s) == 0) {
559  uch = p->u;
560  parsed = true;
562  break;
563  }
564  }
565  } else {
// Numeric reference: accumulate the value digit by digit in base 10 or 16.
// NOTE(review): no range check is visible here; a long digit string can
// exceed what `uch` holds -- confirm how out-of-range values should be
// handled.
566  parsed = true;
568  for (itmp = start_of_entity;
569  itmp != end_of_entity; ++itmp) {
570  TUnicodeSymbol ud = *itmp;
571  if (dec) {
572  uch = 10 * uch + (ud - '0');
573  } else if (hex) {
574  if (ud >='0' && ud <= '9') {
575  ud -= '0';
576  } else if (ud >='a' && ud <= 'f') {
577  ud -= 'a';
578  ud += 10;
579  } else if (ud >='A' && ud <= 'F') {
580  ud -= 'A';
581  ud += 10;
582  }
583  uch = 16 * uch + ud;
584  }
585  }
586  }
587  if (parsed) {
// Emit the decoded symbol and resume right after the ';'.
588  ustr += CUtf8::AsUTF8(&uch,1);
589  i = ++end_of_entity;
590  continue;
591  }
592  }
593  }
594  }
595 // no entity - append as is
596  if (encoding == eEncoding_UTF8 || encoding == eEncoding_Ascii) {
597  ustr.append( 1, ch );
598  } else {
599  result |= fEncoding;
600  ustr += CUtf8::AsUTF8(CTempString(&ch,1), encoding);
601  }
602  }
603  if (result_flags) {
604  *result_flags = result;
605  }
606  return ustr;
607 }
608 
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CStringException –.
Definition: ncbistr.hpp:4506
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Definition: set.hpp:45
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
#define check(s)
Definition: describecol2.c:21
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ERR_POST_X_ONCE(err_subcode, message)
Error posting only once during program execution with default error code and given error subcode.
Definition: ncbidiag.hpp:621
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
static void SetNL(const string &nl)
Definition: htmlhelper.cpp:49
static const char * sm_newline
Definition: htmlhelper.hpp:122
static string HTMLEncode(const string &str, THTMLEncodeFlags flags=fEncodeAll)
HTML encodes a string. E.g. "<" is written as "&lt;".
Definition: htmlhelper.cpp:146
static CStringUTF8 HTMLDecode(const string &str, EEncoding encoding=eEncoding_Unknown, THTMLDecodeFlags *result_flags=NULL)
Decode HTML entities and character references.
Definition: htmlhelper.cpp:505
int THTMLEncodeFlags
Definition: htmlhelper.hpp:72
int THTMLDecodeFlags
Definition: htmlhelper.hpp:83
static string StripSpecialChars(const string &str)
Strip all named and numeric character entities from a string.
Definition: htmlhelper.cpp:200
static string StripTags(const string &str)
Strip all HTML tags from a string.
Definition: htmlhelper.cpp:159
static string HTMLAttributeEncode(const string &str, THTMLEncodeFlags flags=fSkipEntities)
HTML encodes a tag attribute ('&' and '"' symbols).
Definition: htmlhelper.cpp:153
@ fSkipLiteralEntities
Skip "&entity;".
Definition: htmlhelper.hpp:66
@ fSkipNumericEntities
Skip "&#NNNN;".
Definition: htmlhelper.hpp:67
@ fCheckPreencoded
Print warning if some preencoded entity found in the string.
Definition: htmlhelper.hpp:69
@ fCharRef_Entity
Character entity reference(s) was found.
Definition: htmlhelper.hpp:79
@ fCharRef_Numeric
Numeric character reference(s) was found.
Definition: htmlhelper.hpp:80
@ fEncoding
Character encoding changed.
Definition: htmlhelper.hpp:81
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
std::string CStringUTF8
Definition: ncbistl.hpp:254
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
EEncoding
Definition: ncbistr.hpp:199
#define NPOS
Definition: ncbistr.hpp:133
Uint4 TUnicodeSymbol
Unicode character.
Definition: ncbistr.hpp:141
static EEncoding GuessEncoding(const CTempString &src)
Guess the encoding of the C/C++ string.
Definition: ncbistr.cpp:6691
static CStringUTF8 AsUTF8(const CTempString &src, EEncoding encoding, EValidate validate=eNoValidate)
Convert into UTF8 from a C/C++ string.
Definition: ncbistr.hpp:3889
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
@ eEncoding_UTF8
Definition: ncbistr.hpp:201
@ eEncoding_Unknown
Definition: ncbistr.hpp:200
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
HTML classes.
static struct tag_HtmlEntities s_HtmlEntities[]
const char * kDefaultNL
Definition: htmlhelper.cpp:45
static string s_HTMLEncode(const string &str, const string &set, CHTMLHelper::THTMLEncodeFlags flags)
Definition: htmlhelper.cpp:58
Definition of all error codes used in html library (xhtml.lib).
int i
int len
static void hex(unsigned char c)
Definition: mdb_dump.c:56
static unsigned int ud(time_t one, time_t two)
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int isxdigit(Uchar c)
Definition: ncbictype.hpp:71
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
#define NcbiSysChar_strdup
Definition: ncbisys.hpp:178
TUnicodeSymbol u
Definition: ncbistr.cpp:4229
const char * s
Definition: ncbistr.cpp:4230
#define uch
else result
Definition: token2.c:20
void free(voidpf ptr)
unsigned char uch
Definition: zutil.h:39
Modified on Tue May 28 05:52:33 2024 by modify_doxy.py rev. 669887