NCBI C++ ToolKit
xmlmisc.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: xmlmisc.cpp 99335 2023-03-13 13:48:10Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: xmlmisc.cpp
27  *
28  * Author: Alexey Dobronadezhdin
29  *
30  * File Description:
31  * XML functionality from C-toolkit.
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "ftacpp.hpp"
37 #include "ftaerr.hpp"
38 #include "xmlmisc.h"
39 
40 #ifdef THIS_FILE
41 # undef THIS_FILE
42 #endif
43 #define THIS_FILE "xmlmisc.cpp"
44 
45 #define XML_START_TAG 1
46 #define XML_END_TAG 2
47 #define XML_ATTRIBUTE 3
48 #define XML_CONTENT 4
49 
50 /* function to decode ampersand-protected symbols */
51 
53 
54 struct XmlTable {
55  const Char* code;
56  size_t len;
57  char letter;
58 };
59 
60 static const XmlTable xmlcodes[] = {
61  { "&amp;", 5, '&' },
62  { "&apos;", 6, '\'' },
63  { "&gt;", 4, '>' },
64  { "&lt;", 4, '<' },
65  { "&quot;", 6, '"' },
66  { nullptr, 0, '\0' }
67 };
68 
69 static char* DecodeXml(char* str)
70 {
71  char ch;
72  char* dst;
73  char* src;
74  short i;
75 
76  if (StringHasNoText(str))
77  return str;
78 
79  src = str;
80  dst = str;
81  ch = *src;
82  while (ch != '\0') {
83  if (ch == '&') {
84  const XmlTable* xtp = nullptr;
85  for (i = 0; xmlcodes[i].code; i++) {
86  if (StringEquNI(src, xmlcodes[i].code, xmlcodes[i].len)) {
87  xtp = &(xmlcodes[i]);
88  break;
89  }
90  }
91  if (xtp) {
92  *dst = xtp->letter;
93  dst++;
94  src += xtp->len;
95  } else {
96  *dst = ch;
97  dst++;
98  src++;
99  }
100  } else {
101  *dst = ch;
102  dst++;
103  src++;
104  }
105  ch = *src;
106  }
107  *dst = '\0';
108 
109  return str;
110 }
111 
112 
/*
 * TrimSpacesAroundString
 *
 * Removes, in place, every leading and trailing character whose unsigned
 * value is not above ' ' (blanks and control characters).  Characters in the
 * middle of the string are kept as they are.
 *
 * A null pointer or an empty string is returned unchanged.
 * Always returns str.
 */
static char* TrimSpacesAroundString(char* str)
{
    if (! str || str[0] == '\0')
        return str;

    /* step over the leading run */
    const char* rd = str;
    while (*rd != '\0' && (unsigned char)*rd <= ' ')
        ++rd;

    /* slide the remainder to the front; 'keep' ends up one past the last
       character that is above ' ' */
    char* wr   = str;
    char* keep = str;
    for (; *rd != '\0'; ++rd) {
        *wr++ = *rd;
        if ((unsigned char)*rd > ' ')
            keep = wr;
    }

    *wr   = '\0'; /* end of the shifted text */
    *keep = '\0'; /* cut off the trailing run, if there is one */

    return str;
}
152 
153 
154 static void TokenizeXmlLine(ValNodePtr* headp, ValNodePtr* tailp, char* str)
155 {
156  char *atr, *fst, *lst, *nxt, *ptr;
157  char ch, cha, chf, chl, quo;
158 
159  bool doStart, doEnd;
160 
161  if (! headp || ! tailp)
162  return;
163  if (StringHasNoText(str))
164  return;
165 
166  ptr = str;
167  ch = *ptr;
168 
169  while (ch != '\0') {
170  if (ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t') {
171  /* ignore whitespace between tags */
172  ptr++;
173  ch = *ptr;
174 
175  } else if (ch == '<') {
176 
177  /* process XML tag */
178  /* skip past left angle bracket */
179  ptr++;
180 
181  /* keep track of pointers to first character after < and last character before > in XML tag */
182  fst = ptr;
183  lst = ptr;
184  ch = *ptr;
185  while (ch != '\0' && ch != '>') {
186  lst = ptr;
187  ptr++;
188  ch = *ptr;
189  }
190  if (ch != '\0') {
191  *ptr = '\0';
192  ptr++;
193  ch = *ptr;
194  }
195 
196  chf = *fst;
197  chl = *lst;
198  if (chf == '?' || chf == '!') {
199  /* skip processing instructions */
200  } else {
201  /* initial default - if no slashes are present, just do start tag */
202  doStart = true;
203  doEnd = false;
204  /* check for slash just after < or just before > symbol */
205  if (chf == '/') {
206  /* slash after <, just do end tag */
207  fst++;
208  doStart = false;
209  doEnd = true;
210  } else if (chl == '/') {
211  /* slash before > - self-closing tag - do start tag and end tag - content will be empty */
212  *lst = '\0';
213  doEnd = true;
214  }
215 
216  /* skip past first space to look for attribute strings before closing > symbol */
217  atr = fst;
218  cha = *atr;
219  while (cha != '\0' && cha != ' ') {
220  atr++;
221  cha = *atr;
222  }
223  if (cha != '\0') {
224  *atr = '\0';
225  atr++;
226  cha = *atr;
227  }
228 
229  /* report start tag */
230  if (doStart) {
232  ValNodeCopyStrEx(headp, tailp, XML_START_TAG, fst);
233  }
234 
235  /* report individual attribute tag="value" clauses */
236  while (cha != '\0') {
237  nxt = atr;
238  cha = *nxt;
239  /* skip to equal sign */
240  while (cha != '\0' && cha != '=') {
241  nxt++;
242  cha = *nxt;
243  }
244  if (cha != '\0') {
245  nxt++;
246  cha = *nxt;
247  }
248  quo = '\0';
249  if (cha == '"' || cha == '\'') {
250  quo = cha;
251  nxt++;
252  cha = *nxt;
253  }
254  while (cha != '\0' && cha != quo) {
255  nxt++;
256  cha = *nxt;
257  }
258  if (cha != '\0') {
259  nxt++;
260  cha = *nxt;
261  }
262  *nxt = '\0';
264  ValNodeCopyStrEx(headp, tailp, XML_ATTRIBUTE, atr);
265  *nxt = cha;
266  atr = nxt;
267  }
268 
269  /* report end tag */
270  if (doEnd) {
272  ValNodeCopyStrEx(headp, tailp, XML_END_TAG, fst);
273  }
274  }
275 
276  } else {
277 
278  /* process content between tags */
279  fst = ptr;
280  ptr++;
281  ch = *ptr;
282  while (ch != '\0' && ch != '<') {
283  ptr++;
284  ch = *ptr;
285  }
286  if (ch != '\0') {
287  *ptr = '\0';
288  }
289 
290  /* report content string */
292  DecodeXml(fst);
293  ValNodeCopyStrEx(headp, tailp, XML_CONTENT, fst);
294  /*
295  if (ch != '\0') {
296  *ptr = ch;
297  }
298  */
299  }
300  }
301 }
302 
304 {
305  ValNodePtr head = nullptr, tail = nullptr;
306 
307  if (StringHasNoText(str))
308  return nullptr;
309 
310  TokenizeXmlLine(&head, &tail, str);
311 
312  return head;
313 }
314 
315 /* second pass - process ValNode chain into hierarchical structure */
316 
318 {
319  XmlObjPtr attr = nullptr;
320  char ch, chf, chl, quo;
321  char * eql, *lst;
322 
323  if (StringHasNoText(str))
324  return nullptr;
325 
326  eql = str;
327  ch = *eql;
328  while (ch != '\0' && ch != '=') {
329  eql++;
330  ch = *eql;
331  }
332  if (ch == '\0')
333  return nullptr;
334 
335  *eql = '\0';
336  eql++;
337  ch = *eql;
338  quo = ch;
339  if (quo == '"' || quo == '\'') {
340  eql++;
341  ch = *eql;
342  }
343  chf = ch;
344  if (chf == '\0')
345  return nullptr;
346 
347  lst = eql;
348  chl = *lst;
349  while (chl != '\0' && chl != quo) {
350  lst++;
351  chl = *lst;
352  }
353  if (chl != '\0') {
354  *lst = '\0';
355  }
356 
357  if (StringHasNoText(str) || StringHasNoText(eql))
358  return nullptr;
359 
360  attr = new XmlObj;
361  if (! attr)
362  return nullptr;
363 
366  DecodeXml(str);
367  DecodeXml(eql);
368  attr->name = StringSave(str);
369  attr->contents = StringSave(eql);
370 
371  return attr;
372 }
373 
374 static XmlObjPtr ProcessStartTag(ValNodePtr* curr, XmlObjPtr parent, const Char* name)
375 {
376  XmlObjPtr attr, child, lastattr = nullptr, lastchild = nullptr, xop = nullptr;
377  unsigned char choice;
378  char* str;
379  ValNodePtr vnp;
380 
381  if (! curr)
382  return nullptr;
383 
384  xop = new XmlObj;
385  if (! xop)
386  return nullptr;
387 
388  xop->name = StringSave(name);
389  xop->parent = parent;
390 
391  while (*curr) {
392 
393  vnp = *curr;
394  str = vnp->data;
395  choice = vnp->choice;
396 
397  /* advance to next token */
398  *curr = vnp->next;
399 
401 
402  if (StringHasNoText(str)) {
403  /* skip */
404  } else if (choice == XML_START_TAG) {
405 
406  /* recursive call to process next level */
407  child = ProcessStartTag(curr, xop, str);
408  /* link into children list */
409  if (child) {
410  if (! xop->children) {
411  xop->children = child;
412  }
413  if (lastchild) {
414  lastchild->next = child;
415  }
416  lastchild = child;
417  }
418 
419  } else if (choice == XML_END_TAG) {
420 
421  /* pop out of recursive call */
422  return xop;
423 
424  } else if (choice == XML_ATTRIBUTE) {
425 
426  /* get attributes within tag */
427  attr = ProcessAttribute(str);
428  if (attr) {
429  if (! xop->attributes) {
430  xop->attributes = attr;
431  }
432  if (lastattr) {
433  lastattr->next = attr;
434  }
435  lastattr = attr;
436  }
437 
438  } else if (choice == XML_CONTENT) {
439 
440  /* get contact between start and end tags */
441  xop->contents = StringSave(str);
442  }
443  }
444 
445  return xop;
446 }
447 
448 static XmlObjPtr SetSuccessors(XmlObjPtr xop, XmlObjPtr prev, short level)
449 {
450  XmlObjPtr tmp;
451 
452  if (! xop)
453  return nullptr;
454  xop->level = level;
455 
456  if (prev) {
457  prev->successor = xop;
458  }
459 
460  prev = xop;
461  for (tmp = xop->children; tmp; tmp = tmp->next) {
462  prev = SetSuccessors(tmp, prev, level + 1);
463  }
464 
465  return prev;
466 }
467 
469 {
470  ValNodePtr curr;
471  XmlObjPtr xop;
472 
473  if (! head)
474  return nullptr;
475 
476  curr = head;
477 
478  xop = ProcessStartTag(&curr, nullptr, "root");
479  if (! xop)
480  return nullptr;
481 
482  SetSuccessors(xop, nullptr, 1);
483 
484  return xop;
485 }
486 
488 {
489  XmlObjPtr curr, next;
490 
491  if (! xop)
492  return nullptr;
493 
494  MemFree(xop->name);
495  MemFree(xop->contents);
496 
497  curr = xop->attributes;
498  while (curr) {
499  next = curr->next;
500  curr->next = nullptr;
501  FreeXmlObject(curr);
502  curr = next;
503  }
504 
505  curr = xop->children;
506  while (curr) {
507  next = curr->next;
508  curr->next = nullptr;
509  FreeXmlObject(curr);
510  curr = next;
511  }
512 
513  delete xop;
514 
515  return nullptr;
516 }
517 
519 {
521  XmlObjPtr root, xop;
522  char* tmp;
523 
524  if (StringHasNoText(str))
525  return nullptr;
526  tmp = StringSave(str);
527  if (! tmp)
528  return nullptr;
529 
531  MemFree(tmp);
532 
533  if (! head)
534  return nullptr;
535 
536  root = ParseXmlTokens(head);
538 
539  if (! root)
540  return nullptr;
541  xop = root->children;
542  root->children = nullptr;
543  FreeXmlObject(root);
544 
545  return xop;
546 }
547 
548 static int VisitXmlNodeProc(
549  XmlObjPtr xop,
550  XmlObjPtr parent,
551  short level,
552  void* userdata,
553  VisitXmlNodeFunc callback,
554  char* nodeFilter,
555  char* parentFilter,
556  char* attrTagFilter,
557  char* attrValFilter,
558  short maxDepth)
559 {
560  XmlObjPtr attr, tmp;
561  int index = 0;
562 
563  bool okay;
564 
565  if (! xop)
566  return index;
567 
568  /* check depth limit */
569  if (level > maxDepth)
570  return index;
571 
572  okay = true;
573 
574  /* check attribute filters */
575  if (StringDoesHaveText(attrTagFilter)) {
576  okay = false;
577  for (attr = xop->attributes; attr; attr = attr->next) {
578  if (NStr::CompareNocase(attr->name, attrTagFilter) == 0) {
579  if (StringHasNoText(attrValFilter) || NStr::CompareNocase(attr->contents, attrValFilter) == 0) {
580  okay = true;
581  break;
582  }
583  }
584  }
585  } else if (StringDoesHaveText(attrValFilter)) {
586  okay = false;
587  for (attr = xop->attributes; attr; attr = attr->next) {
588  if (NStr::CompareNocase(attr->contents, attrValFilter) == 0) {
589  okay = true;
590  break;
591  }
592  }
593  }
594 
595  /* check node name filter */
596  if (StringDoesHaveText(nodeFilter)) {
597  if (NStr::CompareNocase(xop->name, nodeFilter) != 0) {
598  okay = false;
599  }
600  }
601 
602  /* check parent name filter */
603  if (StringDoesHaveText(parentFilter)) {
604  if (parent && NStr::CompareNocase(parent->name, parentFilter) != 0) {
605  okay = false;
606  }
607  }
608 
609  if (okay) {
610  /* call callback for this node if all filter tests pass */
611  if (callback) {
612  callback(xop, parent, level, userdata);
613  }
614  index++;
615  }
616 
617  /* visit children */
618  for (tmp = xop->children; tmp; tmp = tmp->next) {
619  index += VisitXmlNodeProc(tmp, xop, level + 1, userdata, callback, nodeFilter, parentFilter, attrTagFilter, attrValFilter, maxDepth);
620  }
621 
622  return index;
623 }
624 
625 int VisitXmlNodes(XmlObjPtr xop, void* userdata, VisitXmlNodeFunc callback, char* nodeFilter, char* parentFilter, char* attrTagFilter, char* attrValFilter, short maxDepth)
626 {
627  int index = 0;
628 
629  if (! xop)
630  return index;
631 
632  if (maxDepth == 0) {
633  maxDepth = numeric_limits<short>::max();
634  }
635 
636  index += VisitXmlNodeProc(xop, nullptr, 1, userdata, callback, nodeFilter, parentFilter, attrTagFilter, attrValFilter, maxDepth);
637 
638  return index;
639 }
640 
#define head
Definition: ct_nlmzip_i.h:138
bool StringDoesHaveText(const char *s)
Definition: ftacpp.hpp:140
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:125
void MemFree(char *p)
Definition: ftacpp.hpp:55
bool StringHasNoText(const char *s)
Definition: ftacpp.hpp:131
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
#define StringSave
Definition: ncbistr.hpp:326
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
int i
int len
T max(T x_, T y_)
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
unsigned char choice
Definition: valnode.h:47
Definition: xmlmisc.h:42
short level
Definition: xmlmisc.h:45
XmlObj * next
Definition: xmlmisc.h:48
XmlObj * attributes
Definition: xmlmisc.h:46
XmlObj * children
Definition: xmlmisc.h:47
char * name
Definition: xmlmisc.h:43
char * contents
Definition: xmlmisc.h:44
size_t len
Definition: xmlmisc.cpp:56
char letter
Definition: xmlmisc.cpp:57
const Char * code
Definition: xmlmisc.cpp:55
Definition: inftrees.h:24
ValNodePtr ValNodeFreeData(ValNodePtr vnp)
Definition: valnode.cpp:96
ValNodePtr ValNodeCopyStrEx(ValNodePtr *head, ValNodePtr *tail, short choice, const char *str)
Definition: valnode.cpp:199
#define XML_END_TAG
Definition: xmlmisc.cpp:46
static char * DecodeXml(char *str)
Definition: xmlmisc.cpp:69
static ValNodePtr TokenizeXmlString(char *str)
Definition: xmlmisc.cpp:303
static char * TrimSpacesAroundString(char *str)
Definition: xmlmisc.cpp:113
static XmlObjPtr ParseXmlTokens(ValNodePtr head)
Definition: xmlmisc.cpp:468
static int VisitXmlNodeProc(XmlObjPtr xop, XmlObjPtr parent, short level, void *userdata, VisitXmlNodeFunc callback, char *nodeFilter, char *parentFilter, char *attrTagFilter, char *attrValFilter, short maxDepth)
Definition: xmlmisc.cpp:548
#define XML_CONTENT
Definition: xmlmisc.cpp:48
static XmlObjPtr ProcessAttribute(char *str)
Definition: xmlmisc.cpp:317
#define XML_ATTRIBUTE
Definition: xmlmisc.cpp:47
XmlObjPtr ParseXmlString(const Char *str)
Definition: xmlmisc.cpp:518
static void TokenizeXmlLine(ValNodePtr *headp, ValNodePtr *tailp, char *str)
Definition: xmlmisc.cpp:154
XmlObjPtr FreeXmlObject(XmlObjPtr xop)
Definition: xmlmisc.cpp:487
static XmlObjPtr SetSuccessors(XmlObjPtr xop, XmlObjPtr prev, short level)
Definition: xmlmisc.cpp:448
static const XmlTable xmlcodes[]
Definition: xmlmisc.cpp:60
#define XML_START_TAG
Definition: xmlmisc.cpp:45
int VisitXmlNodes(XmlObjPtr xop, void *userdata, VisitXmlNodeFunc callback, char *nodeFilter, char *parentFilter, char *attrTagFilter, char *attrValFilter, short maxDepth)
Definition: xmlmisc.cpp:625
static XmlObjPtr ProcessStartTag(ValNodePtr *curr, XmlObjPtr parent, const Char *name)
Definition: xmlmisc.cpp:374
void(* VisitXmlNodeFunc)(XmlObjPtr xop, XmlObjPtr parent, short level, void *userdata)
Definition: xmlmisc.h:54
Modified on Wed Apr 17 13:10:07 2024 by modify_doxy.py rev. 669887