NCBI C++ ToolKit
lexer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: lexer.cpp 91618 2020-11-20 15:04:21Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Eugene Vasilchenko
27 *
28 * File Description:
29 * ASN.1 lexer
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include "lexer.hpp"
35 #include "tokens.hpp"
36 
38 
// Tell whether c is a letter or a decimal digit according to isalnum().
// The value is converted to unsigned char first so that a negative char
// is never handed to isalnum().
inline bool IsAlNum(char c)
{
    return isalnum(static_cast<unsigned char>(c)) != 0;
}
43 
// Tell whether c is a decimal digit according to isdigit().
// The value is converted to unsigned char first so that a negative char
// is never handed to isdigit().
inline bool IsDigit(char c)
{
    return isdigit(static_cast<unsigned char>(c)) != 0;
}
48 
49 ASNLexer::ASNLexer(CNcbiIstream& in, const string& name)
50  : AbstractLexer(in,name)
51 {
52 }
53 
55 {
56 }
57 
59 {
60  char c = Char();
61  switch ( c ) {
62  case ':':
63  if ( Char(1) == ':' && Char(2) == '=' ) {
64  StartToken();
65  AddChars(3);
66  return T_DEFINE;
67  }
68  return T_SYMBOL;
69  case '-':
70  case '+':
71  if ( IsDigit(Char(1)) ) {
72  StartToken();
73  AddChar();
74  return LookupNumber();
75  }
76  return T_SYMBOL;
77  case '\"':
78  StartToken();
79  AddChar();
80  StartString();
81  LookupString();
82  return T_STRING;
83  case '\'':
84  StartToken();
85  AddChar();
86  return LookupBinHexString();
87 #if 0
88  case '[':
89  StartToken();
90  AddChar();
91  LookupTag();
92  return T_TAG;
93 #else
94  case '[':
95  StartToken();
96  AddChar();
97  return T_TAG_BEGIN;
98  case ']':
99  StartToken();
100  AddChar();
101  return T_TAG_END;
102 #endif
103  default:
104  if ( IsDigit(c) ) {
105  StartToken();
106  AddChar();
107  return LookupNumber();
108  }
109  else if ( c >= 'a' && c <= 'z' ) {
110  StartToken();
111  AddChar();
113  return T_IDENTIFIER;
114  }
115  else if ( c >= 'A' && c <= 'Z' ) {
116  StartToken();
117  AddChar();
119  return LookupKeyword();
120  }
121  return T_SYMBOL;
122  }
123 }
124 
126 {
127  while ( true ) {
128  char c = Char();
129  switch ( c ) {
130  case ' ':
131  case '\t':
132  case '\r':
133  SkipChar();
134  break;
135  case '\n':
136  SkipChar();
137  NextLine();
138  break;
139  case '-':
140  if ( Char(1) == '-' ) {
141  // comments
142  SkipChars(2);
143  SkipComment();
144  break;
145  }
146  return;
147  case '/':
148  if ( Char(1) == '*' ) {
149  // comments
150  SkipChars(2);
151  SkipComment();
152  break;
153  }
154  return;
155  default:
156  return;
157  }
158  }
159 }
160 
162 {
163  CComment& comment = AddComment();
164  while ( true ) {
165  // wait for end of comments
166  char c = Char();
167  if (c == '\r') {
168  SkipChar();
169  continue;
170  }
171  switch ( c ) {
172  case '\n':
173  SkipChar();
174  NextLine();
175  return;
176  case 0:
177  if ( Eof() )
178  return;
179  break;
180  case '-':
181  if ( Char(1) == '-' ) {
182  SkipChars(2);
183  return;
184  }
185  break;
186  case '*':
187  if ( Char(1) == '/' ) {
188  SkipChars(2);
189  return;
190  }
191  break;
192  }
193  comment.AddChar(c);
194  SkipChar();
195  }
196 }
197 
199 {
200  while ( true ) {
201  char c = Char();
202  switch ( c ) {
203  case '\r':
204  case '\n':
205  LexerWarning("unclosed string", 1);
206  return;
207  case 0:
208  if ( Eof() ) {
209  LexerWarning("unclosed string", 2);
210  return;
211  }
212  LexerWarning("illegal character in string: \\0", 3);
213  AddStringChar(c);
214  AddChar();
215  break;
216  case '\"':
217  if ( Char(1) != '\"' ) {
218  AddChar();
219  return;
220  }
221  AddChars(2);
222  break;
223  default:
224  if ( c < ' ' && c > '\0' ) {
225  LexerWarning("illegal character in string: \\...", 4);
226  }
227  else {
228  AddStringChar(c);
229  }
230  AddChar();
231  break;
232  }
233  }
234 }
235 
237 {
238  TToken token = T_BINARY_STRING;
239  while ( true ) {
240  char c = Char();
241  switch ( c ) {
242  case '\r':
243  case '\n':
244  LexerWarning("unclosed bit string", 5);
245  return token;
246  case 0:
247  if ( Eof() ) {
248  LexerWarning("unclosed bit string", 6);
249  return token;
250  }
251  AddChar();
252  LexerWarning("illegal character in bit string", 7);
253  break;
254  case '0':
255  case '1':
256  AddChar();
257  break;
258  case '2': case '3': case '4': case '5': case '6': case '7': case '8':
259  case '9': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
260  AddChar();
261  token = T_HEXADECIMAL_STRING;
262  break;
263  case '\'':
264  switch ( Char(1) ) {
265  case 'B':
266  AddChars(2);
267  if ( token != T_BINARY_STRING )
268  LexerWarning("binary string contains hexadecimal digits", 8);
269  return T_BINARY_STRING;
270  case 'H':
271  AddChars(2);
272  return T_HEXADECIMAL_STRING;
273  default:
274  AddChar();
275  LexerWarning("unknown type of bit string", 9);
276  return token;
277  }
278  default:
279  AddChar();
280  LexerWarning("illegal character in bit string", 10);
281  break;
282  }
283  }
284 }
285 
287 {
288  while ( true ) {
289  char c = Char();
290  if ( IsAlNum(c) )
291  AddChar();
292  else if ( c == '-' ) {
293  if ( IsAlNum(Char(1)) )
294  AddChars(2);
295  else {
296  if ( AllowIDsEndingWithMinus() )
297  AddChar();
298  return;
299  }
300  }
301  else
302  return;
303  }
304 }
305 
307 {
308  while ( IsDigit(Char()) ) {
309  AddChar();
310  }
311  char c = Char();
312  if (c == '.' || c == 'e' || c == 'E' || c == '-' || c == '+') {
313  AddChar();
314  LookupNumber();
315  return T_DOUBLE;
316  }
317  return T_NUMBER;
318 }
319 
321 {
322  while ( true ) {
323  char c = Char();
324  switch ( c ) {
325  case '\r':
326  case '\n':
327  LexerWarning("unclosed tag", 11);
328  return;
329  case 0:
330  if ( Eof() ) {
331  LexerWarning("unclosed tag", 12);
332  return;
333  }
334  AddChar();
335  LexerWarning("illegal character in tag", 13);
336  break;
337  case ']':
338  AddChar();
339  return;
340  case '0': case '1': case '2': case '3': case '4':
341  case '5': case '6': case '7': case '8': case '9':
342  // case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
343  AddChar();
344  break;
345  default:
346  AddChar();
347 // LexerWarning("illegal character in tag", 14);
348  break;
349  }
350  }
351 }
352 
353 #define CHECK(keyword, t, length) \
354  if ( memcmp(token, keyword, length) == 0 ) return t
355 
357 {
358  const char* token = CurrentTokenStart();
359  switch ( CurrentTokenLength() ) {
360  case 2:
361  CHECK("OF", K_OF, 2);
362  break;
363  case 3:
364  CHECK("SET", K_SET, 3);
365  CHECK("BIT", K_BIT, 3);
366  CHECK("END", K_END, 3);
367  break;
368  case 4:
369  CHECK("TRUE", K_TRUE, 4);
370  CHECK("NULL", K_NULL, 4);
371  CHECK("REAL", K_REAL, 4);
372  CHECK("FROM", K_FROM, 4);
373  CHECK("TAGS", K_TAGS, 4);
374  break;
375  case 5:
376  CHECK("OCTET", K_OCTET, 5);
377  CHECK("BEGIN", K_BEGIN, 5);
378  CHECK("FALSE", K_FALSE, 5);
379  break;
380  case 6:
381  CHECK("CHOICE", K_CHOICE, 6);
382  CHECK("STRING", K_STRING, 6);
383  CHECK("BigInt", K_BIGINT, 6);
384  break;
385  case 7:
386  CHECK("INTEGER", K_INTEGER, 7);
387  CHECK("BOOLEAN", K_BOOLEAN, 7);
388  CHECK("DEFAULT", K_DEFAULT, 7);
389  CHECK("IMPORTS", K_IMPORTS, 7);
390  CHECK("EXPORTS", K_EXPORTS, 7);
391  CHECK("PRIVATE", K_PRIVATE, 7);
392  break;
393  case 8:
394  CHECK("SEQUENCE", K_SEQUENCE, 8);
395  CHECK("OPTIONAL", K_OPTIONAL, 8);
396  CHECK("EXPLICIT", K_EXPLICIT, 8);
397  CHECK("IMPLICIT", K_IMPLICIT, 8);
398  break;
399  case 9:
400  CHECK("AUTOMATIC", K_AUTOMATIC, 9);
401  CHECK("UNIVERSAL", K_UNIVERSAL, 9);
402  case 10:
403  CHECK("ENUMERATED", K_ENUMERATED, 10);
404  CHECK("UTF8String", K_UTF8String, 10);
405  CHECK("COMPONENTS", K_COMPONENTS, 10);
406  break;
407  case 11:
408  CHECK("StringStore", K_StringStore, 11);
409  CHECK("DEFINITIONS", K_DEFINITIONS, 11);
410  CHECK("APPLICATION", K_APPLICATION, 11);
411  break;
412  case 13:
413  CHECK("VisibleString", K_VisibleString, 13);
414  break;
415  }
416  return T_TYPE_REFERENCE;
417 }
418 
420 {
422  m_StringValue.erase();
423 }
424 
426 {
428  m_StringValue += c;
429 }
430 
TToken LookupKeyword(void)
Definition: lexer.cpp:356
void AddStringChar(char c)
Definition: lexer.cpp:425
bool AllowIDsEndingWithMinus(void) const
Definition: lexer.hpp:53
TToken LookupBinHexString(void)
Definition: lexer.cpp:236
TToken LookupNumber(void)
Definition: lexer.cpp:306
void LookupString(void)
Definition: lexer.cpp:198
string m_StringValue
Definition: lexer.hpp:77
void SkipComment(void)
Definition: lexer.cpp:161
void LookupTag(void)
Definition: lexer.cpp:320
void LookupIdentifier(void)
Definition: lexer.cpp:286
virtual ~ASNLexer()
Definition: lexer.cpp:54
virtual void LookupComments(void) override
Definition: lexer.cpp:125
ASNLexer(CNcbiIstream &in, const string &name)
Definition: lexer.cpp:49
virtual TToken LookupToken(void) override
Definition: lexer.cpp:58
void StartString(void)
Definition: lexer.cpp:419
void AddChar(char c)
Definition: alexer.cpp:208
bool TokenStarted(void) const
Definition: alexer.hpp:131
virtual void LexerWarning(const char *error, int err_subcode=0)
Definition: alexer.cpp:70
void SkipChars(size_t count)
Definition: alexer.hpp:161
char Char(void)
Definition: alexer.hpp:179
void StartToken(void)
Definition: alexer.hpp:145
void SkipChar(void)
Definition: alexer.hpp:167
size_t CurrentTokenLength(void) const
Definition: alexer.hpp:195
void NextLine(void)
Definition: alexer.hpp:141
CComment & AddComment(void)
Definition: alexer.cpp:197
void AddChars(size_t count)
Definition: alexer.hpp:151
bool Eof(void)
Definition: alexer.hpp:183
const char * CurrentTokenStart(void) const
Definition: alexer.hpp:187
void AddChar(void)
Definition: alexer.hpp:157
#define true
Definition: bool.h:35
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define CHECK(keyword, t, length)
Definition: lexer.cpp:353
bool IsAlNum(char c)
Definition: lexer.cpp:39
bool IsDigit(char c)
Definition: lexer.cpp:44
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
std::istream & in(std::istream &in_, double &x_)
#define _ASSERT
TToken
Definition: tokens.hpp:38
@ K_DEFINITIONS
Definition: tokens.hpp:52
@ K_FALSE
Definition: tokens.hpp:72
@ K_IMPLICIT
Definition: tokens.hpp:82
@ K_BIGINT
Definition: tokens.hpp:76
@ T_IDENTIFIER
Definition: tokens.hpp:42
@ T_TAG
Definition: tokens.hpp:49
@ K_BOOLEAN
Definition: tokens.hpp:59
@ K_APPLICATION
Definition: tokens.hpp:86
@ K_INTEGER
Definition: tokens.hpp:60
@ T_STRING
Definition: tokens.hpp:44
@ T_TAG_BEGIN
Definition: tokens.hpp:79
@ K_CHOICE
Definition: tokens.hpp:71
@ K_PRIVATE
Definition: tokens.hpp:87
@ K_FROM
Definition: tokens.hpp:57
@ T_TYPE_REFERENCE
Definition: tokens.hpp:43
@ K_BIT
Definition: tokens.hpp:65
@ K_COMPONENTS
Definition: tokens.hpp:88
@ K_AUTOMATIC
Definition: tokens.hpp:84
@ K_BEGIN
Definition: tokens.hpp:53
@ K_OPTIONAL
Definition: tokens.hpp:74
@ K_IMPORTS
Definition: tokens.hpp:55
@ K_ENUMERATED
Definition: tokens.hpp:61
@ T_DEFINE
Definition: tokens.hpp:48
@ K_REAL
Definition: tokens.hpp:62
@ T_DOUBLE
Definition: tokens.hpp:50
@ K_StringStore
Definition: tokens.hpp:64
@ K_DEFAULT
Definition: tokens.hpp:75
@ K_EXPLICIT
Definition: tokens.hpp:81
@ T_BINARY_STRING
Definition: tokens.hpp:46
@ K_SEQUENCE
Definition: tokens.hpp:69
@ K_OF
Definition: tokens.hpp:70
@ T_TAG_END
Definition: tokens.hpp:80
@ K_OCTET
Definition: tokens.hpp:66
@ K_NULL
Definition: tokens.hpp:58
@ T_NUMBER
Definition: tokens.hpp:45
@ K_UNIVERSAL
Definition: tokens.hpp:85
@ K_EXPORTS
Definition: tokens.hpp:56
@ T_HEXADECIMAL_STRING
Definition: tokens.hpp:47
@ K_VisibleString
Definition: tokens.hpp:63
@ K_TRUE
Definition: tokens.hpp:73
@ K_END
Definition: tokens.hpp:54
@ K_TAGS
Definition: tokens.hpp:83
@ K_UTF8String
Definition: tokens.hpp:77
@ K_STRING
Definition: tokens.hpp:67
@ K_SET
Definition: tokens.hpp:68
@ T_SYMBOL
Definition: tokens.hpp:40
Modified on Fri Sep 20 14:57:44 2024 by modify_doxy.py rev. 669887