NCBI C++ ToolKit
ncbistr.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ncbistr.cpp 100536 2023-08-10 15:46:01Z lavr $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko, Denis Vakatov
27  *
28  * File Description:
29  * Some helper functions
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/tempstr.hpp>
37 #include <corelib/ncbistr_util.hpp>
38 #include <corelib/error_codes.hpp>
39 #include <corelib/ncbierror.hpp>
40 #include <corelib/ncbifloat.h>
41 #include <corelib/ncbi_base64.h>
42 #include <memory>
43 #include <functional>
44 #include <algorithm>
45 #include <iterator>
46 #include <stdio.h>
47 #include <locale.h>
48 #include <math.h>
49 
50 
51 #define NCBI_USE_ERRCODE_X Corelib_Util
52 
53 
55 
56 
57 // Digits (up to base 36)
58 static const char kDigitUpper[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
59 static const char kDigitLower[] = "0123456789abcdefghijklmnopqrstuvwxyz";
60 
61 
62 static inline
63 SIZE_TYPE s_DiffPtr(const char* end, const char* start)
64 {
65  return end ? (SIZE_TYPE)(end - start) : (SIZE_TYPE) 0;
66 }
67 
68 const char *const kEmptyCStr = "";
69 
70 #if defined(HAVE_WSTRING)
71 const wchar_t *const kEmptyWCStr = L"";
72 #endif
73 
74 
75 extern const char* const kNcbiDevelopmentVersionString;
77  = "NCBI_DEVELOPMENT_VER_" NCBI_AS_STRING(NCBI_DEVELOPMENT_VER);
78 
79 #ifdef NCBI_PRODUCTION_VER
80 extern const char* const kNcbiProductionVersionString;
81 const char* const kNcbiProductionVersionString
82  = "NCBI_PRODUCTION_VER_" NCBI_AS_STRING(NCBI_PRODUCTION_VER);
83 #endif
84 
85 
86 #if !defined(NCBI_OS_MSWIN) && \
87  !(defined(NCBI_OS_LINUX) && \
88  (defined(NCBI_COMPILER_GCC) || defined(NCBI_COMPILER_ANY_CLANG)))
89 const string* CNcbiEmptyString::m_Str = 0;
90 const string& CNcbiEmptyString::FirstGet(void) {
91  static const string s_Str = "";
92  m_Str = &s_Str;
93  return s_Str;
94 }
95 # ifdef HAVE_WSTRING
96 const wstring* CNcbiEmptyWString::m_Str = 0;
97 const wstring& CNcbiEmptyWString::FirstGet(void) {
98  static const wstring s_Str = L"";
99  m_Str = &s_Str;
100  return s_Str;
101 }
102 # endif
103 #endif
104 
105 
107 {
108  SIZE_TYPE len = str.length();
109  for (SIZE_TYPE idx = pos; idx < len; ++idx) {
110  if (!isspace((unsigned char) str[idx])) {
111  return false;
112  }
113  }
114  return true;
115 }
116 
117 
119 {
120  SIZE_TYPE n1 = s1.length();
121  SIZE_TYPE n2 = s2.length();
122  if ( !n1 ) {
123  return n2 ? -1 : 0;
124  }
125  if ( !n2 ) {
126  return 1;
127  }
128  if (int res = memcmp(s1.data(), s2.data(), min(n1, n2))) {
129  return res;
130  }
131  return (n1 == n2) ? 0 : (n1 > n2 ? 1 : -1);
132 }
133 
134 
136  const char* s2)
137 {
138  if (pos == NPOS || !n || s1.length() <= pos) {
139  return *s2 ? -1 : 0;
140  }
141  if ( !*s2 ) {
142  return 1;
143  }
144  if (n == NPOS || n > s1.length() - pos) {
145  n = s1.length() - pos;
146  }
147  const char* s = s1.data() + pos;
148  while (n && *s2 && *s == *s2) {
149  s++; s2++; n--;
150  }
151  if (n == 0) {
152  return *s2 ? -1 : 0;
153  }
154  return *s - *s2;
155 }
156 
157 
159  const CTempString s2)
160 {
161  if (pos == NPOS || !n || s1.length() <= pos) {
162  return s2.empty() ? 0 : -1;
163  }
164  if (s2.empty()) {
165  return 1;
166  }
167  if (n == NPOS || n > s1.length() - pos) {
168  n = s1.length() - pos;
169  }
170  SIZE_TYPE n_cmp = n;
171  if (n_cmp > s2.length()) {
172  n_cmp = s2.length();
173  }
174  const char* s = s1.data() + pos;
175  const char* p = s2.data();
176  while (n_cmp && *s == *p) {
177  s++; p++; n_cmp--;
178  }
179 
180  if (n_cmp == 0) {
181  if (n == s2.length())
182  return 0;
183  return n > s2.length() ? 1 : -1;
184  }
185 
186  return *s - *p;
187 }
188 
189 
191 {
192  SIZE_TYPE n1 = s1.length();
193  SIZE_TYPE n2 = s2.length();
194 
195  if ( !n1 ) {
196  return n2 ? -1 : 0;
197  }
198  if ( !n2 ) {
199  return 1;
200  }
201  SIZE_TYPE n = min(n1, n2);
202  const char* p1 = s1.data();
203  const char* p2 = s2.data();
204 
205  while (n && (*p1 == *p2 ||
206  tolower((unsigned char)(*p1)) == tolower((unsigned char)(*p2))) ) {
207  p1++; p2++; n--;
208  }
209  if ( !n ) {
210  return (n1 == n2) ? 0 : (n1 > n2 ? 1 : -1);
211  }
212  if (*p1 == *p2) {
213  return 0;
214  }
215  return tolower((unsigned char)(*p1)) - tolower((unsigned char)(*p2));
216 }
217 
218 
220  const char* s2)
221 {
222  if (pos == NPOS || !n || s1.length() <= pos) {
223  return *s2 ? -1 : 0;
224  }
225  if ( !*s2 ) {
226  return 1;
227  }
228 
229  if (n == NPOS || n > s1.length() - pos) {
230  n = s1.length() - pos;
231  }
232 
233  const char* s = s1.data() + pos;
234  while (n && *s2 && (*s == *s2 ||
235  tolower((unsigned char)(*s)) == tolower((unsigned char)(*s2))) ) {
236  s++; s2++; n--;
237  }
238  if (n == 0) {
239  return *s2 ? -1 : 0;
240  }
241  if (*s == *s2) {
242  return 0;
243  }
244  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*s2));
245 }
246 
247 
249  const CTempString s2)
250 {
251  if (pos == NPOS || !n || s1.length() <= pos) {
252  return s2.empty() ? 0 : -1;
253  }
254  if (s2.empty()) {
255  return 1;
256  }
257  if (n == NPOS || n > s1.length() - pos) {
258  n = s1.length() - pos;
259  }
260 
261  SIZE_TYPE n_cmp = n;
262  if (n_cmp > s2.length()) {
263  n_cmp = s2.length();
264  }
265  const char* s = s1.data() + pos;
266  const char* p = s2.data();
267  while (n_cmp && (*s == *p ||
268  tolower((unsigned char)(*s)) == tolower((unsigned char)(*p))) ) {
269  s++; p++; n_cmp--;
270  }
271  if (n_cmp == 0) {
272  return (n == s2.length()) ? 0 : (n > s2.length() ? 1 : -1);
273  }
274  if (*s == *p) {
275  return 0;
276  }
277  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*p));
278 }
279 
280 
281 // MatchesMask() tri-state result
283  eMatch = 1, // match
284  eNoMatch = 0, // no match
285  eMismatch = -1 // mismatch, stop search
286 };
287 
288 // Implements the same logic as UTIL_MatchesMask() from 'include/connect/ncbi_util.h',
289 // but for CTempString instead of char*.
290 
292 {
293  char s, m;
294  size_t str_pos = 0, mask_pos = 0;
295 
296  for ( ; (m = mask[mask_pos]); ++str_pos, ++mask_pos) {
297 
298  s = str[str_pos];
299 
300  if (!s && m != '*') {
301  return eMismatch;
302  }
303  // Analyze mask symbol
304  switch ( m ) {
305  case '?':
306  _ASSERT(s);
307  continue;
308  case '*':
309  // Collapse multiple stars
310  while ( (m = mask[mask_pos]) == '*' ) mask_pos++;
311  if ( !m ) {
312  // only stars left in the mask
313  return eMatch;
314  }
315  // General case, use recursion
316  while ( s ) {
317  EMatchesMaskResult res = s_MatchesMask(str.substr(str_pos), mask.substr(mask_pos), ignore_case);
318  if ( res != eNoMatch ) {
319  // match or mismatch
320  return res;
321  }
322  // continue search
323  s = str[str_pos++];
324  }
325  return eMismatch;
326 
327  case '[':
328  if (!(m = mask[++mask_pos]))
329  return eMismatch; // mismatch, pattern error
330  if (m == '!') {
331  m = 1 /*complement*/;
332  ++mask_pos;
333  } else
334  m = 0;
335  if (ignore_case)
336  s = (char) tolower((unsigned char) s);
337  _ASSERT(s);
338  char a, b; // range for [a-b]
339  do {
340  if (!(a = mask[mask_pos++]))
341  return eMismatch; // mismatch, pattern error
342  if (mask[mask_pos] == '-' && mask[mask_pos+1] != ']') {
343  ++mask_pos;
344  if (!(b = mask[mask_pos++]))
345  return eMismatch; // mismatch, pattern error
346  } else
347  b = a;
348  if (s) {
349  if (ignore_case) {
350  a = (char) tolower((unsigned char) a);
351  b = (char) tolower((unsigned char) b);
352  }
353  if (a <= s && s <= b)
354  s = 0 /*mark as found*/;
355  }
356  } while (mask[mask_pos] != ']');
357  if (m == !s)
358  return eNoMatch; // mismatch
359  continue;
360 
361  case '\\':
362  if (!(m = mask[++mask_pos]))
363  return eMismatch; // mismatch, pattern error
364  /*FALLTHRU*/
365 
366  default:
367  // Compare non pattern character in mask and name
368  _ASSERT(s && m);
369  if (ignore_case) {
370  if (s != m && tolower((unsigned char)s) != tolower((unsigned char)m))
371  return eNoMatch;
372  } else {
373  if (s != m)
374  return eNoMatch;
375  }
376  continue;
377  }
378  }
379  // Matches if we reach the end of the string and mask at the same time only
380  if ( str[str_pos] ) {
381  return eNoMatch;
382  }
383  return eMatch;
384 }
385 
386 
387 // NOTE: This code is also used in CDirEntry::MatchesMask().
388 //
390 {
391  return s_MatchesMask(str, mask, use_case == NStr::eNocase) == eMatch;
392 }
393 
394 
395 char* NStr::ToLower(char* str)
396 {
397  char* s;
398  for (s = str; *str; str++) {
399  *str = (char)tolower((unsigned char)(*str));
400  }
401  return s;
402 }
403 
404 
405 string& NStr::ToLower(string& str)
406 {
407  NON_CONST_ITERATE (string, it, str) {
408  *it = (char)tolower((unsigned char)(*it));
409  }
410  return str;
411 }
412 
413 
414 char* NStr::ToUpper(char* str)
415 {
416  char* s;
417  for (s = str; *str; str++) {
418  *str = (char)toupper((unsigned char)(*str));
419  }
420  return s;
421 }
422 
423 
424 string& NStr::ToUpper(string& str)
425 {
426  NON_CONST_ITERATE (string, it, str) {
427  *it = (char)toupper((unsigned char)(*it));
428  }
429  return str;
430 }
431 
432 
434 {
435  SIZE_TYPE len = str.length();
436  for (SIZE_TYPE i = 0; i < len; ++i) {
437  if (isalpha((unsigned char)str[i]) && !islower((unsigned char)str[i])) {
438  return false;
439  }
440  }
441  return true;
442 }
443 
444 
446 {
447  SIZE_TYPE len = str.length();
448  for (SIZE_TYPE i = 0; i < len; ++i) {
449  if (isalpha((unsigned char)str[i]) && !isupper((unsigned char)str[i])) {
450  return false;
451  }
452  }
453  return true;
454 }
455 
456 
458 {
459  int error = 0, ret = -1;
460  size_t len = str.size();
461 
462  if (!len) {
463  error = EINVAL;
464  } else {
465  size_t i = 0;
466  // skip leading '+' if any
467  if (str.data()[0] == '+' && len > 1) {
468  ++i;
469  }
470  unsigned v = 0;
471  for (; i < len; ++i) {
472  unsigned d = str.data()[i] - '0';
473  if (d > 9) {
474  error = EINVAL;
475  break;
476  }
477  unsigned nv = v * 10 + d;
478  const unsigned kOverflowLimit = (INT_MAX - 9) / 10 + 1;
479  if (v >= kOverflowLimit) {
480  // possible overflow
481  if (v > kOverflowLimit || nv > INT_MAX) {
482  error = ERANGE;
483  break;
484  }
485  }
486  v = nv;
487  }
488  if (!error) {
489  ret = static_cast<int>(v);
490  }
491  }
492 /*
493  if (flags & fConvErr_NoErrno) {
494  return ret;
495  }
496 */
497  errno = error;
498  if (error) {
501  } else {
503  }
504  }
505  return ret;
506 }
507 
508 
509 /// @internal
510 // Access to errno is slow on some platforms because it uses TLS to store a value
511 // for each thread. This guard class lets string-to-numeric conversion functions
512 // set errno only once, right before exit, and only when necessary.
514 {
515 public:
516  CS2N_Guard(NStr::TStringToNumFlags, bool skip_if_zero) :
517  m_NoErrno(false), // m_NoErrno((flags & NStr::fConvErr_NoErrno) > 0),
518  m_SkipIfZero(skip_if_zero),
519  m_Errno(0)
520  { }
521  ~CS2N_Guard(void) {
522  if (!m_NoErrno) {
523  // Is the guard used against the code that already set an errno?
524  // If the error code is not defined here, do not even try to check/set it.
525  if (!m_SkipIfZero || m_Errno) {
526  errno = m_Errno;
527  }
528  }
529  }
530  void Set(int errcode) { m_Errno = errcode; }
531  int Errno(void) const { return m_Errno; }
532  // Says that we want to throw an exception, do not set errno in this case
533  void Throw(void) { m_SkipIfZero = true; m_Errno = 0; }
534  // Auxiliary function to create a message about conversion error
535  // to specified type. It doesn't have any relation to the guard itself,
536  // but can help to save on the amount of code in calling macro.
537  string Message(const CTempString str, const char* to_type, const CTempString msg);
538 
539 private:
540  bool m_NoErrno; // do not set errno at all
541  bool m_SkipIfZero; // do not set errno if TRUE and m_Errno == 0
542  int m_Errno; // errno value to set
543 };
544 
545 string CS2N_Guard::Message(const CTempString str, const char* to_type, const CTempString msg)
546 {
547  string s;
548  s.reserve(str.length() + msg.length() + 50);
549  s += "Cannot convert string '";
551  s += "' to ";
552  s += to_type;
553  if ( !msg.empty() ) {
554  s += ", ";
555  s += msg;
556  }
557  return s;
558 }
559 
560 /// Regular guard
561 #define S2N_CONVERT_GUARD(flags) \
562  CS2N_Guard err_guard(flags, false)
563 
564 // This guard can be used with code that has already set errno.
565 // If the error code is not defined, the guard does not even try to check/set it (even to zero).
566 #define S2N_CONVERT_GUARD_EX(flags) \
567  CS2N_Guard err_guard(flags, true)
568 
569 #define S2N_CONVERT_ERROR(to_type, msg, errcode, pos) \
570  do { \
571  err_guard.Set(errcode); \
572  if ( !(flags & NStr::fConvErr_NoThrow) ) { \
573  err_guard.Throw(); \
574  NCBI_THROW2(CStringException, eConvert, \
575  err_guard.Message(str, #to_type, msg), pos); \
576  } else { \
577 /* \
578  if (flags & NStr::fConvErr_NoErrno) { \
579  / Error, but forced to return 0 / \
580  return 0; \
581  } \
582 */ \
583  if (flags & NStr::fConvErr_NoErrMessage) { \
584  CNcbiError::SetErrno(err_guard.Errno()); \
585  } else { \
586  CNcbiError::SetErrno(err_guard.Errno(), \
587  err_guard.Message(str, #to_type, msg)); \
588  } \
589  return 0; \
590  } \
591  } while (false)
592 
593 
594 #define S2N_CONVERT_ERROR_INVAL(to_type) \
595  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos)
596 
597 #define S2N_CONVERT_ERROR_RADIX(to_type, msg) \
598  S2N_CONVERT_ERROR(to_type, msg, EINVAL, pos)
599 
600 #define S2N_CONVERT_ERROR_OVERFLOW(to_type) \
601  S2N_CONVERT_ERROR(to_type, "overflow", ERANGE, pos)
602 
603 #define CHECK_ENDPTR(to_type) \
604  if ( str[pos] ) { \
605  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos); \
606  }
607 
608 #define CHECK_ENDPTR_SIZE(to_type) \
609  if ( pos < size ) { \
610  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos); \
611  }
612 
613 #define CHECK_COMMAS \
614  /* Check on possible commas */ \
615  if (flags & NStr::fAllowCommas) { \
616  if (ch == ',') { \
617  if ((numpos == pos) || \
618  ((comma >= 0) && (comma != 3)) ) { \
619  /* Not first comma, sitting on incorrect place */ \
620  break; \
621  } \
622  /* Skip it */ \
623  comma = 0; \
624  pos++; \
625  continue; \
626  } else { \
627  if (comma >= 0) { \
628  /* Count symbols between commas */ \
629  comma++; \
630  } \
631  } \
632  }
633 
634 
636 {
638  Int8 value = StringToInt8(str, flags, base);
639  if ( value < kMin_Int || value > kMax_Int ) {
640  S2N_CONVERT_ERROR(int, "overflow", ERANGE, 0);
641  }
642  return (int) value;
643 }
644 
645 
646 unsigned int
647 NStr::StringToUInt(const CTempString str, TStringToNumFlags flags, int base)
648 {
650  Uint8 value = StringToUInt8(str, flags, base);
651  if ( value > kMax_UInt ) {
652  S2N_CONVERT_ERROR(unsigned int, "overflow", ERANGE, 0);
653  }
654  return (unsigned int) value;
655 }
656 
657 
658 long NStr::StringToLong(const CTempString str, TStringToNumFlags flags, int base)
659 {
661  Int8 value = StringToInt8(str, flags, base);
662  if ( value < kMin_Long || value > kMax_Long ) {
663  S2N_CONVERT_ERROR(long, "overflow", ERANGE, 0);
664  }
665  return (long) value;
666 }
667 
668 
669 unsigned long
670 NStr::StringToULong(const CTempString str, TStringToNumFlags flags, int base)
671 {
673  Uint8 value = StringToUInt8(str, flags, base);
674  if ( value > kMax_ULong ) {
675  S2N_CONVERT_ERROR(unsigned long, "overflow", ERANGE, 0);
676  }
677  return (unsigned long) value;
678 }
679 
680 
681 /// @internal
682 // Check that symbol 'ch' is good symbol for number with radix 'base'.
/// Check whether 'ch' is a valid digit for a number written in radix 'base'.
///
/// Classification is done with explicit ASCII ranges rather than
/// isalnum()/tolower(): those are locale-dependent, and in a single-byte
/// non-"C" locale a non-ASCII letter (a negative 'char' on signed-char
/// platforms) could pass isalnum(), produce a negative digit value and then
/// satisfy 'delta < base'.
///
/// @param ch     Character to test.
/// @param base   Radix. For base <= 10 only '0'..'9' are considered; for
///               larger bases 'a'..'z' / 'A'..'Z' stand for 10..35.
/// @param value  Optional output for the numeric value of 'ch'.
///               For base <= 10 it is written only when the result is true.
///               For base > 10 it is written for every ASCII digit or letter,
///               even when that value is not below 'base' (result false).
/// @return true if 'ch' is a digit whose value is less than 'base'.
static inline
bool s_IsGoodCharForRadix(char ch, int base, int* value = 0)
{
    if ( base <= 10 ) {
        // shortcut for most frequent case
        int delta = ch - '0';
        // The unsigned comparison also rejects characters below '0'.
        if ( unsigned(delta) < unsigned(base) ) {
            if ( value ) {
                *value = delta;
            }
            return true;
        }
        return false;
    }

    // Numeric value of 'ch' for bases 11..36, ASCII only.
    int delta;
    if ( ch >= '0' && ch <= '9' ) {
        delta = ch - '0';
    } else if ( ch >= 'a' && ch <= 'z' ) {
        delta = ch - 'a' + 10;
    } else if ( ch >= 'A' && ch <= 'Z' ) {
        delta = ch - 'A' + 10;
    } else {
        // Not an ASCII digit or letter: never a valid digit.
        return false;
    }
    if ( value ) {
        *value = delta;
    }
    return delta < base;
}
713 
714 
715 // Skip all allowed chars (all except used for digit composition).
716 // Update 'ptr' to current position in the string.
/// Selects which characters the symbol-skipping helper may pass over
/// before a number starts.
enum ESkipMode {
    /// Skip everything: the position is moved to the end of the string.
    eSkipAll,
    /// Skip any characters up to the first digit, '+', '-' or decimal point.
    eSkipAllAllowed,
    /// Skip whitespace only; stop at the first non-space character.
    eSkipSpacesOnly
};
722 
723 static inline
724 bool s_IsDecimalPoint(unsigned char ch, NStr::TStringToNumFlags flags)
725 {
726  if ( ch != '.' && ch != ',') {
727  return false;
728  }
729  if (flags & NStr::fDecimalPosix) {
730  return ch == '.';
731  }
732  else if (flags & NStr::fDecimalPosixOrLocal) {
733  return ch == '.' || ch == ',';
734  }
735  struct lconv* conv = localeconv();
736  return ch == *(conv->decimal_point);
737 }
738 
739 static inline
741  SIZE_TYPE& pos,
742  ESkipMode skip_mode,
744 {
745  if (skip_mode == eSkipAll) {
746  pos = str.length();
747  return;
748  }
749 
750  for ( SIZE_TYPE len = str.length(); pos < len; ++pos ) {
751  unsigned char ch = str[pos];
752  if ( isdigit(ch) || ch == '+' || ch == '-' || s_IsDecimalPoint(ch,flags) ) {
753  break;
754  }
755  if ( (skip_mode == eSkipSpacesOnly) && !isspace(ch) ) {
756  break;
757  }
758  }
759 }
760 
761 
762 // Check radix base. If it is zero, determine base using first chars
763 // of the string. Update 'base' value.
764 // Update 'ptr' to current position in the string.
765 static inline
766 bool s_CheckRadix(const CTempString str, SIZE_TYPE& pos, int& base)
767 {
768  if ( base == 10 || base == 8 ) {
769  // shortcut for most frequent case
770  return true;
771  }
772  // Check base
773  if ( base < 0 || base == 1 || base > 36 ) {
774  return false;
775  }
776  // Try to determine base using first chars of the string
777  unsigned char ch = str[pos];
778  unsigned char next = str[pos+1];
779  if ( base == 0 ) {
780  if ( ch != '0' ) {
781  base = 10;
782  } else if (next == 'x' || next == 'X') {
783  base = 16;
784  } else {
785  base = 8;
786  }
787  }
788  // Remove leading '0x' for hex numbers
789  if ( base == 16 ) {
790  if (ch == '0' && (next == 'x' || next == 'X')) {
791  pos += 2;
792  }
793  }
794  return true;
795 }
796 
797 
798 Int8 NStr::StringToInt8(const CTempString str, TStringToNumFlags flags, int base)
799 {
801 
802  // Current position in the string
803  SIZE_TYPE pos = 0;
804 
805  // Skip allowed leading symbols
806  if (flags & fAllowLeadingSymbols) {
807  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
810  }
811  // Determine sign
812  bool sign = false;
813  switch (str[pos]) {
814  case '-':
815  sign = true;
816  /*FALLTHRU*/
817  case '+':
818  pos++;
819  break;
820  default:
821  if (flags & fMandatorySign) {
823  }
824  break;
825  }
826  SIZE_TYPE pos0 = pos;
827  // Check radix base
828  if ( !s_CheckRadix(str, pos, base) ) {
829  S2N_CONVERT_ERROR_RADIX(Int8, "bad numeric base '" +
830  NStr::IntToString(base)+ "'");
831  }
832 
833  // Begin conversion
834  Int8 n = 0;
835  Int8 limdiv = base==10? kMax_I8 / 10: kMax_I8 / base;
836  Int8 limoff = (base==10? kMax_I8 % 10: kMax_I8 % base) + (sign ? 1 : 0);
837 
838  // Number of symbols between two commas. '-1' means -- no comma yet.
839  int comma = -1;
840  SIZE_TYPE numpos = pos;
841 
842  while (char ch = str[pos]) {
843  int delta; // corresponding numeric value of 'ch'
844 
845  // Check on possible commas
846  CHECK_COMMAS;
847  // Sanity check
848  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
849  break;
850  }
851  // Overflow check
852  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
854  }
855  n *= base;
856  n += delta;
857  pos++;
858  }
859 
860  // Last checks
861  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
863  }
864  // Skip allowed trailing symbols
866  bool spaces = ((flags & fAllowTrailingSymbols) ==
869  }
870  // Assign sign before the end pointer check
871  n = sign ? -n : n;
873 
874  return n;
875 }
876 
877 
879  TStringToNumFlags flags, int base)
880 {
882 
883  const TStringToNumFlags slow_flags =
885 
886  if ( base == 10 && (flags & slow_flags) == 0 ) {
887  // fast conversion
888 
889  // Current position in the string
890  CTempString::const_iterator ptr = str.begin(), end = str.end();
891 
892  // Determine sign
893  if ( ptr != end && *ptr == '+' ) {
894  ++ptr;
895  }
896  if ( ptr == end ) {
897  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
898  }
899 
900  // Begin conversion
901  Uint8 n = 0;
902 
903  const Uint8 limdiv = kMax_UI8/10;
904  const int limoff = int(kMax_UI8 % 10);
905 
906  do {
907  char ch = *ptr;
908  int delta = ch - '0';
909  if ( unsigned(delta) >= 10 ) {
910  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
911  }
912  // Overflow check
913  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
914  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, ptr-str.begin());
915  }
916  n = n*10+delta;
917  } while ( ++ptr != end );
918 
919  return n;
920  }
921 
922  // Current position in the string
923  SIZE_TYPE pos = 0, size = str.size();
924 
925  // Skip allowed leading symbols
926  if (flags & fAllowLeadingSymbols) {
927  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
930  }
931  // Determine sign
932  if (str[pos] == '+') {
933  pos++;
934  } else {
935  if (flags & fMandatorySign) {
937  }
938  }
939  SIZE_TYPE pos0 = pos;
940 
941  // Begin conversion
942  Uint8 n = 0;
943  // Check radix base
944  if ( !s_CheckRadix(str, pos, base) ) {
945  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
946  NStr::IntToString(base) + "'");
947  }
948 
949  Uint8 limdiv = kMax_UI8 / base;
950  int limoff = int(kMax_UI8 % base);
951 
952  // Number of symbols between two commas. '-1' means -- no comma yet.
953  int comma = -1;
954  SIZE_TYPE numpos = pos;
955 
956  while (char ch = str[pos]) {
957  int delta; // corresponding numeric value of 'ch'
958 
959  // Check on possible commas
960  CHECK_COMMAS;
961  // Sanity check
962  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
963  break;
964  }
965  // Overflow check
966  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
968  }
969  n *= base;
970  n += delta;
971  pos++;
972  }
973 
974  // Last checks
975  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
977  }
978  // Skip allowed trailing symbols
980  bool spaces = ((flags & fAllowTrailingSymbols) ==
983  }
985  return n;
986 }
987 
988 
989 double NStr::StringToDoublePosix(const char* ptr, char** endptr, TStringToNumFlags flags)
990 {
992 
993  const char* start = ptr;
994  char c = *ptr++;
995 
996  // skip leading blanks
997  while ( isspace((unsigned char)c) ) {
998  c = *ptr++;
999  }
1000 
1001  int sign = 0;
1002  if ( c == '-' ) {
1003  sign = -1;
1004  c = *ptr++;
1005  }
1006  else if ( c == '+' ) {
1007  sign = +1;
1008  c = *ptr++;
1009  }
1010 
1011  if (c == 0) {
1012  if (endptr) {
1013  *endptr = (char*)start;
1014  }
1015  err_guard.Set(EINVAL);
1016  return 0.;
1017  }
1018 
1019  // short-cut - single digit
1020  if ( !*ptr && c >= '0' && c <= '9' ) {
1021  if (endptr) {
1022  *endptr = (char*)ptr;
1023  }
1024  double result = c-'0';
1025  // some compilers fail to negate zero
1026  return sign < 0 ? (c == '0' ? -0. : -result) : result;
1027  }
1028 
1029  bool dot = false, expn = false, anydigits = false;
1030  int digits = 0, dot_position = 0;
1031  unsigned int first=0, second=0, first_mul=1;
1032  long double second_mul = NCBI_CONST_LONGDOUBLE(1.),
1033  third = NCBI_CONST_LONGDOUBLE(0.);
1034 
1035  // up to exponent
1036  for ( ; ; c = *ptr++ ) {
1037  if (c >= '0' && c <= '9') {
1038  // digits: accumulate
1039  c = (char)(c - '0');
1040  anydigits = true;
1041  ++digits;
1042  if (first == 0) {
1043  first = c;
1044  if ( first == 0 ) {
1045  // omit leading zeros
1046  --digits;
1047  if (dot) {
1048  --dot_position;
1049  }
1050  }
1051  } else if (digits <= 9) {
1052  // first 9 digits come to 'first'
1053  first = first*10 + c;
1054  } else if (digits <= 18) {
1055  // next 9 digits come to 'second'
1056  first_mul *= 10;
1057  second = second*10 + c;
1058  } else {
1059  // other digits come to 'third'
1060  second_mul *= NCBI_CONST_LONGDOUBLE(10.);
1061  third = third * NCBI_CONST_LONGDOUBLE(10.) + c;
1062  }
1063  }
1064  else if (c == '.') {
1065  // dot
1066  // if second dot, stop
1067  if (dot) {
1068  --ptr;
1069  break;
1070  }
1071  dot_position = digits;
1072  dot = true;
1073  }
1074  else if (c == 'e' || c == 'E') {
1075  // if exponent, stop
1076  if (!anydigits) {
1077  --ptr;
1078  break;
1079  }
1080  expn = true;
1081  break;
1082  }
1083  else {
1084  --ptr;
1085  if (!anydigits) {
1086  if ( !dot && (c == 'n' || c == 'N') &&
1087  NStr::strncasecmp(ptr,"nan",3)==0) {
1088  if (endptr) {
1089  *endptr = (char*)(ptr+3);
1090  }
1091  return HUGE_VAL/HUGE_VAL; /* NCBI_FAKE_WARNING */
1092  }
1093  if ( (c == 'i' || c == 'I') ) {
1094  if ( NStr::strncasecmp(ptr,"inf",3)==0) {
1095  ptr += 3;
1096  if ( NStr::strncasecmp(ptr,"inity",5)==0) {
1097  ptr += 5;
1098  }
1099  if (endptr) {
1100  *endptr = (char*)ptr;
1101  }
1102  return sign < 0 ? -HUGE_VAL : HUGE_VAL;
1103  }
1104  }
1105  }
1106  break;
1107  }
1108  }
1109  // if no digits, stop now - error
1110  if (!anydigits) {
1111  if (endptr) {
1112  *endptr = (char*)start;
1113  }
1114  err_guard.Set(EINVAL);
1115  return 0.;
1116  }
1117  int exponent = dot ? dot_position - digits : 0;
1118 
1119  // read exponent
1120  if (expn && *ptr) {
1121  int expvalue = 0;
1122  bool expsign = false, expnegate= false;
1123  int expdigits= 0;
1124  for( ; ; ++ptr) {
1125  c = *ptr;
1126  // sign: should be no digits at this point
1127  if (c == '-' || c == '+') {
1128  // if there was sign or digits, stop
1129  if (expsign || expdigits) {
1130  break;
1131  }
1132  expsign = true;
1133  expnegate = c == '-';
1134  }
1135  // digits: accumulate
1136  else if (c >= '0' && c <= '9') {
1137  ++expdigits;
1138  int newexpvalue = expvalue*10 + (c-'0');
1139  if (newexpvalue > expvalue) {
1140  expvalue = newexpvalue;
1141  }
1142  }
1143  else {
1144  break;
1145  }
1146  }
1147  // if no digits, rollback
1148  if (!expdigits) {
1149  // rollback sign
1150  if (expsign) {
1151  --ptr;
1152  }
1153  // rollback exponent
1154  if (expn) {
1155  --ptr;
1156  }
1157  }
1158  else {
1159  exponent = expnegate ? exponent - expvalue : exponent + expvalue;
1160  }
1161  }
1162  long double ret;
1163  if ( first_mul > 1 ) {
1164  _ASSERT(first);
1165  ret = ((long double)first * first_mul + second)* second_mul + third;
1166  }
1167  else {
1168  _ASSERT(first_mul == 1);
1169  _ASSERT(second == 0);
1170  _ASSERT(second_mul == 1);
1171  _ASSERT(third == 0);
1172  ret = first;
1173  }
1174  // calculate exponent
1175  if ( first && exponent ) {
1176  // multiply by power of 10 only non-zero mantissa
1177  if (exponent > 2*DBL_MAX_10_EXP) {
1178  ret = (flags & fDecimalPosixFinite) ? DBL_MAX : HUGE_VAL;
1179  err_guard.Set(ERANGE);
1180  } else if (exponent < 2*DBL_MIN_10_EXP) {
1181  ret = (flags & fDecimalPosixFinite) ? DBL_MIN : 0.;
1182  err_guard.Set(ERANGE);
1183  } else {
1184  if ( exponent > 0 ) {
1185  static const double mul1[16] = {
1186  1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
1187  1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15
1188  };
1189  ret *= mul1[exponent&15];
1190  if ( exponent >>= 4 ) {
1191  static const long double mul2[16] = {
1192  NCBI_CONST_LONGDOUBLE(1e0),
1193  NCBI_CONST_LONGDOUBLE(1e16),
1194  NCBI_CONST_LONGDOUBLE(1e32),
1195  NCBI_CONST_LONGDOUBLE(1e48),
1196  NCBI_CONST_LONGDOUBLE(1e64),
1197  NCBI_CONST_LONGDOUBLE(1e80),
1198  NCBI_CONST_LONGDOUBLE(1e96),
1199  NCBI_CONST_LONGDOUBLE(1e112),
1200  NCBI_CONST_LONGDOUBLE(1e128),
1201  NCBI_CONST_LONGDOUBLE(1e144),
1202  NCBI_CONST_LONGDOUBLE(1e160),
1203  NCBI_CONST_LONGDOUBLE(1e176),
1204  NCBI_CONST_LONGDOUBLE(1e192),
1205  NCBI_CONST_LONGDOUBLE(1e208),
1206  NCBI_CONST_LONGDOUBLE(1e224),
1207  NCBI_CONST_LONGDOUBLE(1e240)
1208  };
1209  ret *= mul2[exponent&15];
1210  for ( exponent >>= 4; exponent; --exponent ) {
1211  ret *= NCBI_CONST_LONGDOUBLE(1e256);
1212  }
1213  }
1214  if (!finite(double(ret))) {
1215  if (flags & fDecimalPosixFinite) {
1216  ret = DBL_MAX;
1217  }
1218  err_guard.Set(ERANGE);
1219  }
1220  }
1221  else {
1222  exponent = -exponent;
1223  static const long double mul1[16] = {
1224  NCBI_CONST_LONGDOUBLE(1e-0),
1225  NCBI_CONST_LONGDOUBLE(1e-1),
1226  NCBI_CONST_LONGDOUBLE(1e-2),
1227  NCBI_CONST_LONGDOUBLE(1e-3),
1228  NCBI_CONST_LONGDOUBLE(1e-4),
1229  NCBI_CONST_LONGDOUBLE(1e-5),
1230  NCBI_CONST_LONGDOUBLE(1e-6),
1231  NCBI_CONST_LONGDOUBLE(1e-7),
1232  NCBI_CONST_LONGDOUBLE(1e-8),
1233  NCBI_CONST_LONGDOUBLE(1e-9),
1234  NCBI_CONST_LONGDOUBLE(1e-10),
1235  NCBI_CONST_LONGDOUBLE(1e-11),
1236  NCBI_CONST_LONGDOUBLE(1e-12),
1237  NCBI_CONST_LONGDOUBLE(1e-13),
1238  NCBI_CONST_LONGDOUBLE(1e-14),
1239  NCBI_CONST_LONGDOUBLE(1e-15)
1240  };
1241  ret *= mul1[exponent&15];
1242  if ( exponent >>= 4 ) {
1243  static const long double mul2[16] = {
1244  NCBI_CONST_LONGDOUBLE(1e-0),
1245  NCBI_CONST_LONGDOUBLE(1e-16),
1246  NCBI_CONST_LONGDOUBLE(1e-32),
1247  NCBI_CONST_LONGDOUBLE(1e-48),
1248  NCBI_CONST_LONGDOUBLE(1e-64),
1249  NCBI_CONST_LONGDOUBLE(1e-80),
1250  NCBI_CONST_LONGDOUBLE(1e-96),
1251  NCBI_CONST_LONGDOUBLE(1e-112),
1252  NCBI_CONST_LONGDOUBLE(1e-128),
1253  NCBI_CONST_LONGDOUBLE(1e-144),
1254  NCBI_CONST_LONGDOUBLE(1e-160),
1255  NCBI_CONST_LONGDOUBLE(1e-176),
1256  NCBI_CONST_LONGDOUBLE(1e-192),
1257  NCBI_CONST_LONGDOUBLE(1e-208),
1258  NCBI_CONST_LONGDOUBLE(1e-224),
1259  NCBI_CONST_LONGDOUBLE(1e-240)
1260  };
1261  ret *= mul2[exponent&15];
1262  for ( exponent >>= 4; exponent; --exponent ) {
1263  ret *= NCBI_CONST_LONGDOUBLE(1e-256);
1264  }
1265  }
1266  if ( ret < DBL_MIN ) {
1267  if (flags & fDecimalPosixFinite) {
1268  ret = DBL_MIN;
1269  }
1270  err_guard.Set(ERANGE);
1271  }
1272  }
1273  }
1274  }
1275  if ( sign < 0 ) {
1276  ret = -ret;
1277  }
1278  // done
1279  if (endptr) {
1280  *endptr = (char*)ptr;
1281  }
1282  return (double)ret;
1283 }
1284 
1285 
1286 /// @internal
/// Parse the NUL-terminated buffer "str" as a double and return the value.
/// The assertion below requires str[size] == '\0'.
/// "flags" choose between NStr::StringToDoublePosix() and strtod(), govern
/// which leading/trailing symbols are tolerated, and can demand an explicit
/// sign.  Failures are reported through NCBI_THROW2 and the
/// S2N_CONVERT_ERROR* / CHECK_ENDPTR macros.
1287 static double s_StringToDouble(const char* str, size_t size,
1289 {
1290  _ASSERT(str[size] == '\0');
1292  NCBI_THROW2(CStringException, eBadArgs,
1293  "NStr::StringToDouble(): mutually exclusive flags specified", 0);
1294  }
1296 
1297  // Current position in the string
1298  SIZE_TYPE pos = 0;
1299 
1300  // Skip allowed leading symbols
1302  bool spaces = ((flags & NStr::fAllowLeadingSymbols) ==
1305  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1306  }
1307  // Check mandatory sign
1308  if (flags & NStr::fMandatorySign) {
1309  switch (str[pos]) {
1310  case '-':
1311  case '+':
1312  break;
1313  default:
1314  S2N_CONVERT_ERROR_INVAL(double);
1315  }
1316  }
1317  // For consistency make additional check on incorrect leading symbols.
1318  // Because strtod() may just skip such symbols.
1319  if (!(flags & NStr::fAllowLeadingSymbols)) {
1320  char c = str[pos];
// Only a digit, the decimal point accepted by s_IsDecimalPoint(), or a sign
// may start the number when leading symbols are not allowed.
1321  if ( !isdigit((unsigned char)c) && !s_IsDecimalPoint(c,flags) && c != '-' && c != '+') {
1322  S2N_CONVERT_ERROR_INVAL(double);
1323  }
1324  }
1325 
1326  // Conversion
// errno is reset here and inspected through the same reference after parsing.
1327  int& errno_ref = errno;
1328  errno_ref = 0;
1329 
1330  char* endptr = 0;
1331  const char* begptr = str + pos;
1332 
1333  double n;
1334  if (flags & NStr::fDecimalPosix) {
1335  n = NStr::StringToDoublePosix(begptr, &endptr, flags);
1336  } else {
1337  n = strtod(begptr, &endptr);
1338  }
// Second attempt with the POSIX parser: keep whichever parse consumed
// more of the input (or the POSIX one when the first produced no end pointer).
1340  char* endptr2 = 0;
1341  double n2 = NStr::StringToDoublePosix(begptr, &endptr2, flags);
1342  if (!endptr || (endptr2 && endptr2 > endptr)) {
1343  n = n2;
1344  endptr = endptr2;
1345  }
1346  }
// Nothing was consumed: the text is not a number.
1347  if ( !endptr || endptr == begptr ) {
1348  S2N_CONVERT_ERROR(double, kEmptyStr, EINVAL, s_DiffPtr(endptr, begptr) + pos);
1349  }
1350  // some libs set ERANGE, others do not
1351  // here, we do not consider ERANGE as error
1352  if ( errno_ref && errno_ref != ERANGE ) {
1353  S2N_CONVERT_ERROR(double, kEmptyStr, errno_ref, s_DiffPtr(endptr, begptr) + pos);
1354  }
1355  // special cases
// With fDecimalPosixFinite a non-zero, non-NaN result has its magnitude
// clamped into [DBL_MIN, DBL_MAX]; the sign is removed first and restored after.
1356  if ((flags & NStr::fDecimalPosixFinite) && n != 0. && !isnan(n))
1357  {
1358  bool is_negative = n < 0.;
1359  if (is_negative) {
1360  n = -n;
1361  }
1362  if ( n < DBL_MIN) {
1363  n = DBL_MIN;
1364  } else if (!finite(n)) {
1365  n = DBL_MAX;
1366  }
1367  if (is_negative) {
1368  n = -n;
1369  }
1370  }
1371 
// Advance past the characters consumed by the numeric parser.
1372  pos += s_DiffPtr(endptr, begptr);
1373 
1374  // Skip allowed trailing symbols
1376  bool spaces = ((flags & NStr::fAllowTrailingSymbols) ==
1379  }
1380  CHECK_ENDPTR(double);
1381  return n;
1382 }
1383 
1384 
// Thin public wrapper: forwards the buffer, its length and the flags
// unchanged to s_StringToDouble() and returns its result.
1385 double NStr::StringToDoubleEx(const char* str, size_t size,
1386  TStringToNumFlags flags)
1388  return s_StringToDouble(str, size, flags);
1389 }
1390 
1391 
1392 double NStr::StringToDouble(const CTempStringEx str, TStringToNumFlags flags)
1393 {
1394  size_t size = str.size();
1395  if ( str.HasZeroAtEnd() ) {
1396  // string has zero at the end already
1397  return s_StringToDouble(str.data(), size, flags);
1398  }
1399  char buf[256]; // small temporary buffer on stack for appending zero char
1400  if ( size < sizeof(buf) ) {
1401  memcpy(buf, str.data(), size);
1402  buf[size] = '\0';
1403  return s_StringToDouble(buf, size, flags);
1404  }
1405  else {
1406  // use std::string() to allocate memory for appending zero char
1407  return s_StringToDouble(string(str).c_str(), size, flags);
1408  }
1409 }
1410 
1411 /// @internal
/// Apply an optional size qualifier found at str[pos] to "value".
/// Recognized (case-insensitively): 'K' (x1024), 'M' (x1024^2),
/// 'G' (x1024^3), each optionally followed by 'B'.  "pos" is advanced past
/// whatever was consumed.  If str[pos] is the terminating zero the value is
/// returned untouched.  A product that would not fit into Uint8 is reported
/// through S2N_CONVERT_ERROR_OVERFLOW.
1413  SIZE_TYPE& pos,
1414  Uint8 value,
1416 {
1418 
1419  unsigned char ch = str[pos];
1420  if ( !ch ) {
1421  return value;
1422  }
1423 
1424  ch = (unsigned char)toupper(ch);
1425  Uint8 v = value;
1426  bool err = false;
1427 
// Each case checks for overflow BEFORE multiplying: the value must not
// exceed kMax_UI8 divided by the multiplier.
1428  switch(ch) {
1429  case 'K':
1430  pos++;
1431  if ((kMax_UI8 / 1024) < v) {
1432  err = true;
1433  }
1434  v *= 1024;
1435  break;
1436  case 'M':
1437  pos++;
1438  if ((kMax_UI8 / 1024 / 1024) < v) {
1439  err = true;
1440  }
1441  v *= 1024 * 1024;
1442  break;
1443  case 'G':
1444  pos++;
1445  if ((kMax_UI8 / 1024 / 1024 / 1024) < v) {
1446  err = true;
1447  }
1448  v *= 1024 * 1024 * 1024;
1449  break;
1450  default:
1451  // error -- the "qual" points to the last unprocessed symbol
1453  }
1454  if ( err ) {
1455  S2N_CONVERT_ERROR_OVERFLOW(DataSize);
1456  }
1457 
// Swallow an optional trailing 'B' / 'b' (as in "KB", "MB", "GB").
1458  ch = str[pos];
1459  if ( ch && toupper(ch) == 'B' ) {
1460  pos++;
1461  }
1462  return v;
1463 }
1464 
1465 
// Data-size parser with an explicit radix.
// Steps visible below: validate "base" (2..16), optionally skip leading
// symbols, accept an optional '+', check the radix prefix, locate the run of
// digits (and commas when fAllowCommas is set), convert that run with
// StringToUInt8(), then apply a trailing K/M/G qualifier through
// s_DataSizeConvertQual() and optionally skip trailing symbols.
1467  TStringToNumFlags flags,
1468  int base)
1469 {
1470  // We have a limited base range here
1471  if ( base < 2 || base > 16 ) {
1472  NCBI_THROW2(CStringException, eConvert,
1473  "Bad numeric base '" + NStr::IntToString(base)+ "'", 0);
1474  }
1476 
1477  // Current position in the string
1478  SIZE_TYPE pos = 0;
1479 
1480  // Find end of number representation
1481  {{
1482  // Skip allowed leading symbols
1483  if (flags & fAllowLeadingSymbols) {
1484  bool spaces = ((flags & fAllowLeadingSymbols) ==
1487  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1488  }
1489  // Determine sign
1490  if (str[pos] == '+') {
1491  pos++;
1492  // strip fMandatorySign flag
1493  flags &= ~fMandatorySign;
1494  } else {
1495  if (flags & fMandatorySign) {
1497  }
1498  }
1499  // Check radix base
1500  if ( !s_CheckRadix(str, pos, base) ) {
1501  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
1502  NStr::IntToString(base) + "'");
1503  }
1504  }}
1505 
// Scan forward while characters are valid digits for the radix, or commas
// when fAllowCommas is set; "ch" ends up holding the first rejected character.
1506  SIZE_TYPE numpos = pos;
1507  char ch = str[pos];
1508  while (ch) {
1509  if ( !s_IsGoodCharForRadix(ch, base) &&
1510  ((ch != ',') || !(flags & fAllowCommas)) ) {
1511  break;
1512  }
1513  ch = str[++pos];
1514  }
1515  // If string is empty, just use whole remaining string for conversion
1516  // (for correct error reporting)
1517  if (pos-numpos == 0) {
1518  pos = str.length();
1519  }
1520 
1521  // Convert to number
1522  Uint8 n = StringToUInt8(CTempString(str.data()+numpos, pos-numpos),
1523  flags, base);
1524  if ( !n && errno ) {
1525  // If exceptions are enabled that it has been already thrown.
1526  // The errno is also set, so just return a zero.
1527  return 0;
1528  }
1529  // Check trailer (KB, MB, ...)
1530  if ( ch ) {
1531  n = s_DataSizeConvertQual(str, pos, n, flags);
1532  }
1533  // Skip allowed trailing symbols
1534  if (flags & fAllowTrailingSymbols) {
1535  bool spaces = ((flags & fAllowTrailingSymbols) ==
1538  }
1540  return n;
1541 }
1542 
1543 
// Data-size parser working on decimal text.
// Accepted shape, as implemented below: optional leading symbols, optional
// '+', digits with optional commas and at most one decimal point, an
// optional single space, then an optional suffix K/M/G/T/P/E followed by
// "iB" or "B" (or a bare "B"), then optional trailing symbols.
// An "iB" suffix or fDS_ForceBinary selects powers of 1024; otherwise the
// suffix means powers of 1000.  The multiplication is carried out on arrays
// of decimal digits, and a remaining fractional part rounds up when its
// first digit is >= 5.  Errors go through NCBI_THROW2 / S2N_CONVERT_ERROR.
1545  TStringToNumFlags flags /* = 0 */)
1546 {
1547  TStringToNumFlags allowed_flags = fConvErr_NoThrow +
1548  fMandatorySign +
1549  fAllowCommas +
1552  fDS_ForceBinary +
1555 
1556  if ((flags & allowed_flags) != flags) {
1557  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
1558  }
1560 
1561  const char* str_ptr = str.data();
1562  const char* str_end = str_ptr + str.size();
// Skip leading symbols: whitespace always, anything else only when more
// than plain leading spaces is allowed.  Stop at the first digit, or at a
// sign directly followed by a digit when a sign is mandatory.
1563  if (flags & fAllowLeadingSymbols) {
1564  bool allow_all = (flags & fAllowLeadingSymbols) != fAllowLeadingSpaces;
1565  for (; str_ptr < str_end; ++str_ptr) {
1566  char c = *str_ptr;
1567  if (isdigit(c))
1568  break;
1569  if (isspace(c))
1570  continue;
1571  if ((c == '+' || c == '-') && (flags & fMandatorySign)
1572  && str_ptr + 1 < str_end && isdigit(*(str_ptr + 1)))
1573  {
1574  break;
1575  }
1576  if (!allow_all)
1577  break;
1578  }
1579  }
1580 
// '+' is consumed; '-' (or a missing mandatory sign) is an error.
1581  if (str_ptr < str_end && *str_ptr == '+') {
1582  ++str_ptr;
1583  }
1584  else if ((str_ptr < str_end && *str_ptr == '-')
1585  || (flags & fMandatorySign))
1586  {
1587  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1588  }
1589 
1590  const char* num_start = str_ptr;
1591  bool have_dot = false;
1592  bool allow_commas = (flags & fAllowCommas) != 0;
1593  bool allow_dot = (flags & fDS_ProhibitFractions) == 0;
1594  Uint4 digs_pre_dot = 0, digs_post_dot = 0;
1595 
// Scan the numeric part, counting digits before and after the decimal point.
// A separator may not start the number, follow a comma, or appear after the
// decimal point.
1596  for (; str_ptr < str_end; ++str_ptr) {
1597  char c = *str_ptr;
1598  if (isdigit(c)) {
1599  if (have_dot)
1600  ++digs_post_dot;
1601  else
1602  ++digs_pre_dot;
1603  }
1604  else if (c == '.' && allow_dot) {
1605  if (have_dot || str_ptr == num_start)
1606  break;
1607  if (*(str_ptr - 1) == ',') {
1608  --str_ptr;
1609  break;
1610  }
1611  have_dot = true;
1612  }
1613  else if (c == ',' && allow_commas) {
1614  if (have_dot || str_ptr == num_start)
1615  break;
1616  if (*(str_ptr - 1) == ',') {
1617  --str_ptr;
1618  break;
1619  }
1620  }
1621  else
1622  break;
1623  }
// A dangling '.' or ',' at the end does not belong to the number.
1624  if (have_dot && digs_post_dot == 0)
1625  --str_ptr;
1626  else if (str_ptr > num_start && *(str_ptr - 1) == ',')
1627  --str_ptr;
1628 
1629  const char* num_end = str_ptr;
1630  if (num_start == num_end) {
1631  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1632  }
1633  if (str_ptr < str_end && *str_ptr == ' '
1635  {
1636  ++str_ptr;
1637  }
1638  char suff_c = 0;
1639  if (str_ptr < str_end)
1640  suff_c = (char)toupper(*str_ptr);
1641 
// s_BinCoefs[i] is 1024^(i+1) written in decimal, matching s_Suffixes[i].
1642  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
1643  static const char* const s_BinCoefs[] = {"1024", "1048576", "1073741824",
1644  "1099511627776",
1645  "1125899906842624",
1646  "1152921504606846976"};
1647  static const Uint4 s_NumSuffixes = (Uint4)(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
1648 
1649  bool binary_suff = (flags & fDS_ForceBinary) != 0;
1650  Uint4 suff_idx = 0;
1651  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
1652  if (suff_c == s_Suffixes[suff_idx])
1653  break;
1654  }
// After a recognized suffix letter, "iB" forces the binary meaning and a
// plain "B" is simply consumed.  With no suffix at all, a space consumed
// above is given back.
1655  if (suff_idx < s_NumSuffixes) {
1656  ++str_ptr;
1657  if (str_ptr + 1 < str_end && toupper(*str_ptr) == 'I'
1658  && toupper(*(str_ptr + 1)) == 'B')
1659  {
1660  str_ptr += 2;
1661  binary_suff = true;
1662  }
1663  else if (str_ptr < str_end && toupper(*str_ptr) == 'B')
1664  ++str_ptr;
1665  }
1666  else if (suff_c == 'B') {
1667  ++str_ptr;
1668  }
1669  else if (*(str_ptr - 1) == ' ')
1670  --str_ptr;
1671 
1672  if (flags & fAllowTrailingSymbols) {
1673  bool allow_all = (flags & fAllowTrailingSymbols) != fAllowTrailingSpaces;
1674  for (; str_ptr < str_end; ++str_ptr) {
1675  char c = *str_ptr;
1676  if (isspace(c))
1677  continue;
1678  if (!allow_all)
1679  break;
1680  }
1681  }
// Anything left unconsumed makes the whole input invalid.
1682  if (str_ptr != str_end) {
1683  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1684  }
1685 
// Collect the digits (separators dropped) as numeric values 0..9.
1686  Uint4 orig_digs = digs_pre_dot + digs_post_dot;
1687  AutoArray<Uint1> orig_num(orig_digs);
1688  str_ptr = num_start;
1689  for (Uint4 i = 0; str_ptr < num_end; ++str_ptr) {
1690  if (*str_ptr == ',' || *str_ptr == '.')
1691  continue;
1692  orig_num[i++] = Uint1(*str_ptr - '0');
1693  }
1694 
1695  Uint1* num_to_conv = orig_num.get();
1696  Uint4 digs_to_conv = digs_pre_dot;
1697  AutoArray<Uint1> mul_num;
// Binary suffix: schoolbook long multiplication of the digit array by the
// decimal digits of the coefficient, accumulating into mul_num.
1698  if (binary_suff && suff_idx < s_NumSuffixes) {
1699  const char* coef = s_BinCoefs[suff_idx];
1700  Uint4 coef_size = Uint4(strlen(coef));
1701  mul_num = new Uint1[orig_digs + coef_size];
1702  memset(mul_num.get(), 0, orig_digs + coef_size);
1703  for (Uint4 coef_i = 0; coef_i < coef_size; ++coef_i) {
1704  Uint1 coef_d = Uint1(coef[coef_i] - '0');
1705  Uint1 carry = 0;
1706  Uint4 res_idx = orig_digs + coef_i;
1707  for (int orig_i = orig_digs - 1; orig_i >= 0; --orig_i, --res_idx) {
1708  Uint1 orig_d = orig_num[orig_i];
1709  Uint1 res_d = Uint1(coef_d * orig_d + carry + mul_num[res_idx]);
1710  carry = 0;
1711  while (res_d >= 10) {
1712  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1713  ++carry;
1714  }
1715  mul_num[res_idx] = res_d;
1716  }
1717  _ASSERT(carry <= 9);
1718  for (; carry != 0; --res_idx) {
1719  Uint1 res_d = Uint1(mul_num[res_idx] + carry);
1720  carry = 0;
1721  while (res_d >= 10) {
1722  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1723  ++carry;
1724  }
1725  mul_num[res_idx] = res_d;
1726  }
1727  }
// The last digs_post_dot digits of the product are still fractional.
1728  digs_to_conv = orig_digs + coef_size - digs_post_dot;
1729  num_to_conv = mul_num.get();
// Drop leading zeros of the product.
1730  while (digs_to_conv > 1 && *num_to_conv == 0) {
1731  --digs_to_conv;
1732  ++num_to_conv;
1733  }
1734  }
// Decimal suffix: multiplying by 1000^(suff_idx+1) just moves the decimal
// point right, padding with zeros when the fraction is too short.
1735  else if (suff_idx < s_NumSuffixes) {
1736  Uint4 coef_size = (suff_idx + 1) * 3;
1737  if (coef_size <= digs_post_dot) {
1738  digs_to_conv += coef_size;
1739  digs_post_dot -= coef_size;
1740  }
1741  else {
1742  digs_to_conv += digs_post_dot;
1743  coef_size -= digs_post_dot;
1744  digs_post_dot = 0;
1745  mul_num = new Uint1[digs_to_conv + coef_size];
1746  memmove(mul_num.get(), num_to_conv, digs_to_conv);
1747  memset(mul_num.get() + digs_to_conv, 0, coef_size);
1748  num_to_conv = mul_num.get();
1749  digs_to_conv += coef_size;
1750  }
1751  }
1752 
// Fold the integer digits into a Uint8, rejecting the digit that would
// push the value past kMax_UI8.
1753  const Uint8 limdiv = kMax_UI8/10;
1754  const int limoff = int(kMax_UI8 % 10);
1755  Uint8 n = 0;
1756  for (Uint4 i = 0; i < digs_to_conv; ++i) {
1757  Uint1 d = num_to_conv[i];
1758  if (n >= limdiv && (n > limdiv || d > limoff)) {
1759  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, i);
1760  }
1761  n *= 10;
1762  n += d;
1763  }
// Round up when the first remaining fractional digit is 5 or more.
1764  if (digs_post_dot != 0 && num_to_conv[digs_to_conv] >= 5) {
1765  if (n == kMax_UI8) {
1766  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, digs_to_conv);
1767  }
1768  ++n;
1769  }
1770  return n;
1771 }
1772 
1773 
// Convert a string to size_t by delegating to the unsigned conversion that
// matches the platform's size_t width: StringToUInt8() when
// SIZEOF_SIZE_T > 4, StringToUInt() otherwise.
1774 size_t NStr::StringToSizet(const CTempString str,
1775  TStringToNumFlags flags, int base)
1776 {
1777 #if (SIZEOF_SIZE_T > 4)
1778  return StringToUInt8(str, flags, base);
1779 #else
1780  return StringToUInt(str, flags, base);
1781 #endif
1783 
1784 
1785 /// @internal
/// Render the unsigned "value" in a radix other than 10 into "out_str".
/// Digits are produced least-significant first into the tail of a local
/// buffer and appended to the (cleared) output at the end.
/// With NStr::fWithRadix, base 16 gets a "0x" prefix and base 8 a "0" prefix;
/// NStr::fUseLowercase selects the lowercase digit table.
1786 template <typename T>
1787 static void s_UnsignedOtherBaseToString(string& out_str,
1788  T value,
1790  int base)
1791 {
1792  _ASSERT(base != 10);
1793 
// One character per bit of T is enough for the smallest radix (base 2).
1794  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1795  char buffer[kBufSize + 2]; // +2 for fWithRadix
1796  char* pos = buffer + kBufSize;
1797  const char* kDigit = (flags & NStr::fUseLowercase) ? kDigitLower : kDigitUpper;
1798 
1799  out_str.erase();
1800 
1801  if ( base == 16 ) {
1802  if ( flags & NStr::fWithRadix ) {
1803  out_str.append("0x");
1804  }
1805 
1806  do {
1807  *--pos = kDigit[value % 16];
1808  value /= 16;
1809  } while ( value );
1810  }
1811  else if ( base == 8 ) {
1812  if ( flags & NStr::fWithRadix ) {
1813  out_str.append("0");
1814  if ( value == 0 ) {
1815  // to prevent "00"
1816  return;
1817  }
1818  }
1819  do {
1820  *--pos = kDigit[value % 8];
1821  value /= 8;
1822  } while ( value );
1823  }
1824  else {
// Generic radix: same digit extraction with a run-time divisor.
1825  do {
1826  *--pos = kDigit[value % base];
1827  value /= base;
1828  } while ( value );
1829  }
// Append the digits written between pos and the end of the digit area.
1830  out_str.append(pos, buffer + kBufSize - pos);
1831 }
1832 
1833 
1834 /// @internal
/// Render a signed number in base 10 into "out_str".
/// "svalue" supplies the sign; "value" supplies the magnitude when svalue is
/// non-negative and is replaced by -svalue otherwise.
/// NStr::fWithCommas inserts ',' between groups of three digits;
/// NStr::fWithSign adds '+' in front of non-negative numbers.
1835 static void s_SignedBase10ToString(string& out_str,
1836  unsigned long value,
1837  long svalue,
1839  int base)
1840 {
1841  _ASSERT(base == 10);
1842 
1843  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1844  char buffer[kBufSize+2];
1845  char* pos = buffer + kBufSize;
1846 
// NOTE(review): "-svalue" is evaluated in signed arithmetic, which overflows
// when svalue is the most negative long -- confirm callers never pass it.
1847  if (svalue < 0) {
1848  value = static_cast<unsigned long>(-svalue);
1849  }
1850  if ((flags & NStr::fWithCommas)) {
// cnt counts digits written since the last comma; a comma is written just
// before the 4th, 7th, ... digit, so the text never starts with one.
1851  int cnt = -1;
1852  do {
1853  if (++cnt == 3) {
1854  *--pos = ',';
1855  cnt = 0;
1856  }
1857  *--pos = '0' + value % 10;
1858  value /= 10;
1859  } while (value);
1860  }
1861  else {
1862  do {
1863  *--pos = '0' + value % 10;
1864  value /= 10;
1865  } while (value);
1866  }
1867 
1868  if (svalue < 0)
1869  *--pos = '-';
1870  else if (flags & NStr::fWithSign)
1871  *--pos = '+';
1873  out_str.assign(pos, buffer + kBufSize - pos);
1874 }
1875 
1876 
1877 void NStr::IntToString(string& out_str, int svalue,
1878  TNumToStringFlags flags, int base)
1879 {
1880  if ( base < 2 || base > 36 ) {
1881  CNcbiError::SetErrno(errno = EINVAL);
1882  return;
1883  }
1884  unsigned int value = static_cast<unsigned int>(svalue);
1885  if ( base == 10 ) {
1886  s_SignedBase10ToString(out_str, value, svalue, flags, base);
1887  } else {
1888  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1889  }
1890  errno = 0;
1891 }
1892 
1893 
1894 void NStr::LongToString(string& out_str, long svalue,
1895  TNumToStringFlags flags, int base)
1896 {
1897  if ( base < 2 || base > 36 ) {
1898  CNcbiError::SetErrno(errno = EINVAL);
1899  return;
1900  }
1901  unsigned long value = static_cast<unsigned long>(svalue);
1902  if ( base == 10 ) {
1903  s_SignedBase10ToString(out_str, value, svalue, flags, base);
1904  } else {
1905  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1906  }
1907  errno = 0;
1908 }
1909 
1910 
1911 void NStr::ULongToString(string& out_str,
1912  unsigned long value,
1913  TNumToStringFlags flags,
1914  int base)
1915 {
1916  if ( base < 2 || base > 36 ) {
1917  CNcbiError::SetErrno(errno = EINVAL);
1918  return;
1919  }
1920  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1921  char buffer[kBufSize];
1922  char* pos = buffer + kBufSize;
1923  out_str.erase();
1924 
1925  if ( base == 10 ) {
1926  if ( (flags & fWithCommas) ) {
1927  int cnt = -1;
1928  do {
1929  if (++cnt == 3) {
1930  *--pos = ',';
1931  cnt = 0;
1932  }
1933  *--pos = '0' + value % 10;
1934  value /= 10;
1935  } while ( value );
1936  }
1937  else {
1938  do {
1939  *--pos = '0' + value % 10;
1940  value /= 10;
1941  } while ( value );
1942  }
1943 
1944  if ( (flags & fWithSign) ) {
1945  *--pos = '+';
1946  }
1947  out_str.assign(pos, buffer + kBufSize - pos);
1948  }
1949  else {
1950  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1951  }
1952  errno = 0;
1953 }
1954 
1955 
1957 // On some platforms 64-bit (Int8) division is very slow,
1958 // so base-10 printing is optimized by processing the number in chunks.
1959 // This works only for radix base == 10.
// PRINT_INT8_CHUNK is 10^9; PRINT_INT8_CHUNK_SIZE is the number of decimal
// digits one such chunk yields.
1961 #define PRINT_INT8_CHUNK 1000000000
1962 #define PRINT_INT8_CHUNK_SIZE 9
1963 
1964 /// @internal
/// Write the decimal digits of "value" backwards, ending just before "pos",
/// and return the pointer to the first character written.
/// With NStr::fWithCommas a ',' is written between groups of three digits.
/// When PRINT_INT8_CHUNK is defined, values that do not fit into 32 bits are
/// emitted in 9-digit chunks handled with 32-bit arithmetic.
/// The caller must provide enough room before "pos"; no bounds are checked here.
1965 static char* s_PrintBase10Uint8(char* pos,
1966  Uint8 value,
1968 {
1969  if ( (flags & NStr::fWithCommas) ) {
// cnt = digits written since the last comma; it carries over between chunks.
1970  int cnt = -1;
1971 #ifdef PRINT_INT8_CHUNK
1972  // while n doesn't fit in Uint4 process the number
1973  // by 9-digit chunks within 32-bit Uint4
1974  while ( value & ~Uint8(Uint4(~0)) ) {
1975  Uint4 chunk = Uint4(value);
1977  chunk -= PRINT_INT8_CHUNK*Uint4(value);
1978  char* end = pos - PRINT_INT8_CHUNK_SIZE - 2; // 9-digit chunk should have 2 commas
1979  do {
1980  if (++cnt == 3) {
1981  *--pos = ',';
1982  cnt = 0;
1983  }
1984  *--pos = '0' + chunk % 10;
1985  chunk /= 10;
1986  } while ( pos != end );
1987  }
1988  // process all remaining digits in 32-bit number
1989  Uint4 chunk = Uint4(value);
1990  do {
1991  if (++cnt == 3) {
1992  *--pos = ',';
1993  cnt = 0;
1994  }
1995  *--pos = '0' + chunk % 10;
1996  chunk /= 10;
1997  } while ( chunk );
1998 #else
1999  do {
2000  if (++cnt == 3) {
2001  *--pos = ',';
2002  cnt = 0;
2003  }
2004  *--pos = '0' + value % 10;
2005  value /= 10;
2006  } while ( value );
2007 #endif
2008  }
2009  else {
2010 #ifdef PRINT_INT8_CHUNK
2011  // while n doesn't fit in Uint4 process the number
2012  // by 9-digit chunks within 32-bit Uint4
2013  while ( value & ~Uint8(Uint4(~0)) ) {
2014  Uint4 chunk = Uint4(value);
2016  chunk -= PRINT_INT8_CHUNK*Uint4(value);
// Always emit exactly 9 digits for a full chunk, including leading zeros.
2017  char* end = pos - PRINT_INT8_CHUNK_SIZE;
2018  do {
2019  *--pos = '0' + chunk % 10;
2020  chunk /= 10;
2021  } while ( pos != end );
2022  }
2023  // process all remaining digits in 32-bit number
2024  Uint4 chunk = Uint4(value);
2025  do {
2026  *--pos = '0' + chunk % 10;
2027  chunk /= 10;
2028  } while ( chunk );
2029 #else
2030  do {
2031  *--pos = '0' + value % 10;
2032  value /= 10;
2033  } while ( value );
2034 #endif
2035  }
2036  return pos;
2037 }
2038 
2039 
2040 void NStr::Int8ToString(string& out_str, Int8 svalue,
2041  TNumToStringFlags flags, int base)
2042 {
2043  if ( base < 2 || base > 36 ) {
2044  CNcbiError::SetErrno(errno = EINVAL);
2045  return;
2046  }
2047  Uint8 value;
2048  if (base == 10) {
2049  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
2050  char buffer[kBufSize];
2051 
2052  value = static_cast<Uint8>(svalue<0?-svalue:svalue);
2053  char* pos = s_PrintBase10Uint8(buffer + kBufSize, value, flags);
2054  if (svalue < 0)
2055  *--pos = '-';
2056  else if (flags & fWithSign)
2057  *--pos = '+';
2058  out_str.assign(pos, buffer + kBufSize - pos);
2059  } else {
2060  value = static_cast<Uint8>(svalue);
2061  s_UnsignedOtherBaseToString(out_str, value, flags, base);
2062  }
2063  errno = 0;
2064 }
2065 
2066 
2067 void NStr::UInt8ToString(string& out_str, Uint8 value,
2068  TNumToStringFlags flags, int base)
2069 {
2070  if ( base < 2 || base > 36 ) {
2071  CNcbiError::SetErrno(errno = EINVAL);
2072  return;
2073  }
2074  if (base == 10) {
2075  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
2076  char buffer[kBufSize];
2077 
2078  char* pos = s_PrintBase10Uint8(buffer + kBufSize, value, flags);
2079  if ( flags & fWithSign ) {
2080  *--pos = '+';
2081  }
2082  out_str.assign(pos, buffer + kBufSize - pos);
2083  } else {
2084  s_UnsignedOtherBaseToString(out_str, value, flags, base);
2085  }
2086  errno = 0;
2087 }
2088 
2089 
// Format "value" as a human-readable data size: up to "max_digits"
// significant digits (at least 3), rounded, followed by a K/M/G/T/P/E suffix.
// Without fDS_Binary the suffixes are powers of 1000; with it, powers of 1024
// (and the long suffix form contains 'i', e.g. "KiB").
// fDS_NoDecimalPoint suppresses the fractional part, fDS_ShortSuffix drops
// the trailing "B"/"iB", and fWithSign / fWithCommas add '+' and digit
// grouping.  errno is reset to 0 at the end.
2090 void NStr::UInt8ToString_DataSize(string& out_str,
2091  Uint8 value,
2092  TNumToStringFlags flags /* = 0 */,
2093  unsigned int max_digits /* = 3 */)
2094 {
2095  TNumToStringFlags allowed_flags = fWithSign +
2096  fWithCommas +
2097  fDS_Binary +
2100  fDS_ShortSuffix +
2102 
2103  if ((flags & allowed_flags) != flags) {
2104  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
2105  }
2106 
2107  if (max_digits < 3)
2108  max_digits = 3;
2109 
2110  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
2111  static const Uint4 s_NumSuffixes = Uint4(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
2112 
// Scratch buffer.  [num_start, dot_ptr) holds the integer digits and
// [dot_ptr, num_end) the fractional digits of the final text.
2113  static const SIZE_TYPE kBufSize = 50;
2114  char buffer[kBufSize];
2115  char* num_start;
2116  char* dot_ptr;
2117  char* num_end;
2118  Uint4 digs_pre_dot, suff_idx;
2119 
2120  if (!(flags &fDS_Binary)) {
// Decimal suffixes: print the whole number once; the suffix only decides
// where the decimal point falls inside the printed digits.
2121  static const Uint8 s_Coefs[] = {1000, 1000000, 1000000000,
2122  NCBI_CONST_UINT8(1000000000000),
2123  NCBI_CONST_UINT8(1000000000000000),
2124  NCBI_CONST_UINT8(1000000000000000000)};
2125  suff_idx = 0;
2126  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2127  if (value < s_Coefs[suff_idx])
2128  break;
2129  }
2130  num_start = s_PrintBase10Uint8(buffer + kBufSize, value, 0);
// Spare '0' in front of the number to absorb a carry produced by rounding.
2131  num_start[-1] = '0';
2132  dot_ptr = buffer + kBufSize - 3 * suff_idx;
2133  digs_pre_dot = Uint4(dot_ptr - num_start);
2134  if (!(flags & fDS_NoDecimalPoint)) {
2135  num_end = min(buffer + kBufSize, dot_ptr + (max_digits - digs_pre_dot));
2136  }
2137  else {
// No fraction allowed: use a smaller suffix while the extra three integer
// digits still fit into max_digits.
2138  while (suff_idx > 0 && max_digits - digs_pre_dot >= 3) {
2139  --suff_idx;
2140  digs_pre_dot += 3;
2141  dot_ptr += 3;
2142  }
2143  num_end = dot_ptr;
2144  }
// Round on the first dropped digit and propagate the carry leftwards.
2145  char* round_dig = num_end - 1;
2146  if (num_end < buffer + kBufSize && *num_end >= '5')
2147  ++(*round_dig);
2148  while (*round_dig == '0' + 10) {
2149  *round_dig = '0';
2150  --round_dig;
2151  ++(*round_dig);
2152  }
// The carry produced a new leading digit: re-balance suffix and digit counts.
2153  if (round_dig < num_start) {
2154  _ASSERT(num_start - round_dig == 1);
2155  num_start = round_dig;
2156  ++digs_pre_dot;
2157  if (!(flags & fDS_NoDecimalPoint)) {
2158  if (digs_pre_dot > 3) {
2159  ++suff_idx;
2160  digs_pre_dot -= 3;
2161  dot_ptr -= 3;
2162  }
2163  --num_end;
2164  }
2165  else {
2166  if (digs_pre_dot > max_digits) {
2167  ++suff_idx;
2168  digs_pre_dot -= 3;
2169  dot_ptr -= 3;
2170  num_end = dot_ptr;
2171  }
2172  }
2173  }
2174  }
2175  else {
// Binary suffixes: here suff_idx is 1-based while choosing the divisor
// (s_Coefs[suff_idx - 1]) and is decremented once at the end of this branch.
2176  static const Uint8 s_Coefs[] = {1, 1024, 1048576, 1073741824,
2177  NCBI_CONST_UINT8(1099511627776),
2178  NCBI_CONST_UINT8(1125899906842624),
2179  NCBI_CONST_UINT8(1152921504606846976)};
2180 
2181  suff_idx = 1;
2182  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2183  if (value < s_Coefs[suff_idx])
2184  break;
2185  }
2186  bool can_try_another = true;
2187 try_another_suffix:
2188  Uint8 mul_coef = s_Coefs[suff_idx - 1];
2189  Uint8 whole_num = value / mul_coef;
// 1000..1023 units would need four digits: switch to the next larger suffix.
2190  if (max_digits == 3 && whole_num >= 1000) {
2191  ++suff_idx;
2192  goto try_another_suffix;
2193  }
2194  num_start = s_PrintBase10Uint8(buffer + kBufSize, whole_num, 0);
2195  num_start[-1] = '0';
2196  digs_pre_dot = Uint4(buffer + kBufSize - num_start);
// Without a fraction, try smaller suffixes as long as the integer part
// still fits into max_digits; the trial print uses the middle of the buffer.
2197  if (max_digits - digs_pre_dot >= 3 && (flags & fDS_NoDecimalPoint)
2198  && suff_idx != 1 && can_try_another)
2199  {
2200  Uint4 new_suff = suff_idx - 1;
2201 try_even_more_suffix:
2202  Uint8 new_num = value / s_Coefs[new_suff - 1];
2203  char* new_start = s_PrintBase10Uint8(buffer + kBufSize / 2, new_num, 0);
2204  Uint4 new_digs = Uint4(buffer + kBufSize / 2 - new_start);
2205  if (new_digs <= max_digits) {
2206  if (max_digits - digs_pre_dot >= 3 && new_suff != 1) {
2207  --new_suff;
2208  goto try_even_more_suffix;
2209  }
2210  suff_idx = new_suff;
2211  can_try_another = false;
2212  goto try_another_suffix;
2213  }
2214  if (new_suff != suff_idx - 1) {
2215  suff_idx = new_suff + 1;
2216  can_try_another = false;
2217  goto try_another_suffix;
2218  }
2219  }
// Move the integer digits (with the spare leading '0') to the buffer start
// so the fractional digits can be generated right after them.
2220  memcpy(buffer, num_start - 1, digs_pre_dot + 1);
2221  num_start = buffer + 1;
2222  dot_ptr = num_start + digs_pre_dot;
// One extra digit is always generated; it is used only for rounding.
2223  Uint4 cnt_more_digs = 1;
2224  if (!(flags & fDS_NoDecimalPoint))
2225  cnt_more_digs += min(max_digits - digs_pre_dot, 3 * (suff_idx - 1));
2226  num_end = dot_ptr;
// Long division of the remainder produces the fractional digits one by one.
2227  Uint8 left_val = value - whole_num * mul_coef;
2228  do {
2229  left_val *= 10;
2230  Uint1 d = Uint1(left_val / mul_coef);
2231  *num_end = char(d + '0');
2232  ++num_end;
2233  left_val -= d * mul_coef;
2234  --cnt_more_digs;
2235  }
2236  while (cnt_more_digs != 0);
2237  --num_end;
2238 
2239  char* round_dig = num_end - 1;
2240  if (*num_end >= '5')
2241  ++(*round_dig);
2242  while (*round_dig == '0' + 10) {
2243  *round_dig = '0';
2244  --round_dig;
2245  ++(*round_dig);
2246  }
2247  if (round_dig < num_start) {
2248  _ASSERT(round_dig == buffer);
2249  num_start = round_dig;
2250  ++digs_pre_dot;
2251  if (digs_pre_dot > max_digits) {
2252  ++suff_idx;
2253  goto try_another_suffix;
2254  }
2255  if (num_end != dot_ptr)
2256  --num_end;
2257  }
// Rounding reached exactly 1024 units: express it with the next suffix.
2258  if (!(flags & fDS_NoDecimalPoint) && digs_pre_dot == 4
2259  && num_start[0] == '1' && num_start[1] == '0'
2260  && num_start[2] == '2' && num_start[3] == '4')
2261  {
2262  ++suff_idx;
2263  goto try_another_suffix;
2264  }
2265 
2266  --suff_idx;
2267  }
2268 
// Assemble the output: sign, integer digits (grouped if requested),
// fraction, suffix.
2269  out_str.erase();
2270  if (flags & fWithSign)
2271  out_str.append(1, '+');
2272  if (!(flags & fWithCommas) || digs_pre_dot <= 3) {
2273  out_str.append(num_start, digs_pre_dot);
2274  }
2275  else {
2276  Uint4 digs_first = digs_pre_dot % 3;
2277  out_str.append(num_start, digs_first);
2278  char* left_ptr = num_start + digs_first;
2279  Uint4 digs_left = digs_pre_dot - digs_first;
2280  while (digs_left != 0) {
2281  out_str.append(1, ',');
2282  out_str.append(left_ptr, 3);
2283  left_ptr += 3;
2284  digs_left -= 3;
2285  }
2286  }
2287  if (num_end != dot_ptr) {
2288  out_str.append(1, '.');
2289  out_str.append(dot_ptr, num_end - dot_ptr);
2290  }
2291 
// suff_idx == 0 means plain bytes; otherwise suff_idx - 1 indexes s_Suffixes.
2292  if (suff_idx == 0) {
2293  if (flags & fDS_PutBSuffixToo) {
2295  out_str.append(1, ' ');
2296  out_str.append(1, 'B');
2297  }
2298  }
2299  else {
2300  --suff_idx;
2302  out_str.append(1, ' ');
2303  out_str.append(1, s_Suffixes[suff_idx]);
2304  if (!(flags & fDS_ShortSuffix)) {
2305  if (flags & fDS_Binary)
2306  out_str.append(1, 'i');
2307  out_str.append(1, 'B');
2308  }
2309  }
2310  errno = 0;
2312 
2313 
2314 // The largest precision used in the double-to-string conversion
2315 #if defined(NCBI_OS_MSWIN)
2316  const int kMaxDoublePrecision = 200;
2317 #else
2318  const int kMaxDoublePrecision = 308;
2319 #endif
2320 // The maximum size of a double value in string form:
2321 // exponent size + sign + dot + ending '\0' + max. precision
2322 const int kMaxDoubleStringSize = 308 + 3 + kMaxDoublePrecision;
2323 
2324 
// Convert "value" to text and store it in "out_str".
// When no explicit precision is requested (and the POSIX special cases do
// not apply) the number is printed with snprintf() using "%f", "%e" or "%g",
// chosen by the fDoubleFixed / fDoubleScientific / fDoubleGeneral flags.
// With fDoublePosix the locale's decimal point is replaced by '.'.
// errno is reset to 0 on return.
2325 void NStr::DoubleToString(string& out_str, double value,
2327 {
2328  char buffer[kMaxDoubleStringSize]; // includes ending '\0'
2329  int n = 0;
2330  if (precision >= 0 ||
2331  ((flags & fDoublePosix) && (!finite(value) || value == 0.))) {
2333  buffer[n] = '\0';
2334  } else {
2335  const char* format;
2336  switch (flags & fDoubleGeneral) {
2337  case fDoubleFixed:
2338  format = "%f";
2339  break;
2340  case fDoubleScientific:
2341  format = "%e";
2342  break;
2343  case fDoubleGeneral: // default
2344  default:
2345  format = "%g";
2346  break;
2347  }
2348  n = ::snprintf(buffer, kMaxDoubleStringSize, format, value);
// A formatting failure yields an empty string.
2349  if (n < 0) {
2350  buffer[0] = '\0';
2351  }
2352  if (flags & fDoublePosix) {
// Only the first occurrence of the locale's decimal point is replaced.
2353  struct lconv* conv = localeconv();
2354  if ('.' != *(conv->decimal_point)) {
2355  char* pos = strchr(buffer, *(conv->decimal_point));
2356  if (pos) {
2357  *pos = '.';
2358  }
2359  }
2360  }
2361  }
2362  out_str = buffer;
2363  errno = 0;
2364 }
2365 
2366 
2367 SIZE_TYPE NStr::DoubleToString(double value, unsigned int precision,
2368  char* buf, SIZE_TYPE buf_size,
2369  TNumToStringFlags flags)
2370 {
2371  char buffer[kMaxDoubleStringSize]; // inludes ending '\0'
2372  int n = 0;
2373  if ((flags & fDoublePosix) && (!finite(value) || value == 0.)) {
2374  if (value == 0.) {
2375  double zero = 0.;
2376  if (memcmp(&value, &zero, sizeof(double)) == 0) {
2377  strcpy(buffer, "0");
2378  n = 2;
2379  } else {
2380  strcpy(buffer, "-0");
2381  n = 3;
2382  }
2383  } else if (isnan(value)) {
2384  strcpy(buffer, "NaN");
2385  n = 4;
2386  } else if (value > 0.) {
2387  strcpy(buffer, "INF");
2388  n = 4;
2389  } else {
2390  strcpy(buffer, "-INF");
2391  n = 5;
2392  }
2393  } else {
2394  if (precision > (unsigned int)kMaxDoublePrecision) {
2395  precision = (unsigned int)kMaxDoublePrecision;
2396  }
2397  const char* format;
2398  switch (flags & fDoubleGeneral) {
2399  case fDoubleScientific:
2400  format = "%.*e";
2401  break;
2402  case fDoubleGeneral:
2403  format = "%.*g";
2404  break;
2405  case fDoubleFixed: // default
2406  default:
2407  format = "%.*f";
2408  break;
2409  }
2410  n = ::snprintf(buffer, kMaxDoubleStringSize, format, (int)precision, value);
2411  if (n < 0) {
2412  n = 0;
2413  }
2414  if (flags & fDoublePosix) {
2415  struct lconv* conv = localeconv();
2416  if ('.' != *(conv->decimal_point)) {
2417  char* pos = strchr(buffer, *(conv->decimal_point));
2418  if (pos) {
2419  *pos = '.';
2420  }
2421  }
2422  }
2423  }
2424  SIZE_TYPE n_copy = min((SIZE_TYPE) n, buf_size);
2425  memcpy(buf, buffer, n_copy);
2426  errno = 0;
2427  return n_copy;
2428 }
2429 
2430 
// Write the decimal form of "value" at "buffer" and return the pointer just
// past the last character written.
//  - zeros == true : exactly "digits" characters are written, padded on the
//    left with '0' (only the low-order "digits" digits of value are kept).
//  - zeros == false: the digits are first written right-aligned in a field
//    of "digits" characters and then moved to the start of the buffer, so no
//    padding remains.
// No terminating '\0' is written.
2431 static char* s_ncbi_append_int2str(char* buffer, unsigned int value, size_t digits, bool zeros)
2432 {
2433  char* buffer_start = buffer;
// Start at the last position of the field and fill it backwards.
2434  char* buffer_end = (buffer += digits-1);
2435  if (zeros) {
2436  do {
2437  *buffer-- = (char)('0' + (value % 10));
2438  value /= 10;
2439  } while (--digits);
2440  } else {
2441  do {
2442  *buffer-- = (char)('0' + (value % 10));
2443  } while (value /= 10);
2444 
// Fewer digits than the field width: shift them left and adjust the end.
2445  if (++buffer != buffer_start) {
2446  memmove(buffer_start, buffer, buffer_end-buffer+1);
2447  buffer_end -= buffer - buffer_start;
2448  }
2449  }
2450  return ++buffer_end;
2452 
2453 
2454 #define __NLG NCBI_CONST_LONGDOUBLE
2455 
/// Convert a double into its significant decimal digits (ecvt()-like).
///
/// The digits of |val| are written to 'buffer' without a decimal point,
/// exponent or terminating NUL; trailing zeros are removed.
/// Zero is special-cased: "0" is written for +0.0 and "-0" for -0.0.
///
/// @param val        Value to convert.
/// @param precision  Requested number of significant digits; 0 yields no
///                   output, values above DBL_DIG are reduced to DBL_DIG.
/// @param buffer     Destination for the digits.
/// @param bufsize    Size of 'buffer'. If it is 20 or less, digits are built
///                   in a local array and copied out only if they fit.
/// @param dec        Receives the decimal exponent of the first digit
///                   (negative for values below 1).
/// @param sign       Receives -1 for negative values (including -0.0),
///                   1 for positive values, and 0 for +0.0 or precision 0.
/// @return           Number of characters stored in 'buffer'.
/// @throw CStringException (eConvert) if the digits do not fit into a
///                   buffer of 20 bytes or less.
2456 SIZE_TYPE NStr::DoubleToString_Ecvt(double val, unsigned int precision,
2457  char* buffer, SIZE_TYPE bufsize,
2458  int* dec, int* sign)
2459 {
2460  //errno = 0;
2461  *dec = *sign = 0;
2462  if (precision==0) {
2463  return 0;
2464  }
2465  if (precision > DBL_DIG) {
2466  precision = DBL_DIG;
2467  }
// +0.0 and -0.0 compare equal, so the bit pattern is compared against +0.0
// to tell them apart.
2468  if (val == 0.) {
2469  double zero = 0.;
2470  if (memcmp(&val, &zero, sizeof(double)) == 0) {
2471  *buffer='0';
2472  return 1;
2473  }
2474  *buffer++='-';
2475  *buffer='0';
2476  *sign = -1;
2477  return 2;
2478  }
2479  *sign = val < 0. ? -1 : 1;
2480  if (*sign < 0) {
2481  val = -val;
2482  }
// More than 9 digits requires the second (8-digit) group computed below.
2483  bool high_precision = precision > 9;
2484 
2485 // calculate exponent
// 'value' is scaled in steps of 1e256 / 1e16 towards the range handled by the
// final table, which selects 'mult' so that value*mult has 9 integer digits;
// 'exp' accumulates the magnitude of the decimal exponent.
2486  unsigned int exp=0;
2487  bool exp_positive = val >= 1.;
2488  unsigned int first, second=0;
2489  long double mult = __NLG(1.);
2490  long double value = val;
2491 
2492  if (exp_positive) {
2493  while (value>=__NLG(1.e256))
2494  {value*=__NLG(1.e-256); exp+=256;}
2495  if (value >= __NLG(1.e16)) {
2496  if (value>=__NLG(1.e240)) {value*=__NLG(1.e-240); exp+=240;}
2497  else if (value>=__NLG(1.e224)) {value*=__NLG(1.e-224); exp+=224;}
2498  else if (value>=__NLG(1.e208)) {value*=__NLG(1.e-208); exp+=208;}
2499  else if (value>=__NLG(1.e192)) {value*=__NLG(1.e-192); exp+=192;}
2500  else if (value>=__NLG(1.e176)) {value*=__NLG(1.e-176); exp+=176;}
2501  else if (value>=__NLG(1.e160)) {value*=__NLG(1.e-160); exp+=160;}
2502  else if (value>=__NLG(1.e144)) {value*=__NLG(1.e-144); exp+=144;}
2503  else if (value>=__NLG(1.e128)) {value*=__NLG(1.e-128); exp+=128;}
2504  else if (value>=__NLG(1.e112)) {value*=__NLG(1.e-112); exp+=112;}
2505  else if (value>=__NLG(1.e96)) {value*=__NLG(1.e-96); exp+=96;}
2506  else if (value>=__NLG(1.e80)) {value*=__NLG(1.e-80); exp+=80;}
2507  else if (value>=__NLG(1.e64)) {value*=__NLG(1.e-64); exp+=64;}
2508  else if (value>=__NLG(1.e48)) {value*=__NLG(1.e-48); exp+=48;}
2509  else if (value>=__NLG(1.e32)) {value*=__NLG(1.e-32); exp+=32;}
2510  else if (value>=__NLG(1.e16)) {value*=__NLG(1.e-16); exp+=16;}
2511  }
2512  if (value< __NLG(1.)) {mult=__NLG(1.e+9); exp-= 1;}
2513  else if (value< __NLG(10.)) {mult=__NLG(1.e+8); }
2514  else if (value< __NLG(1.e2)) {mult=__NLG(1.e+7); exp+= 1;}
2515  else if (value< __NLG(1.e3)) {mult=__NLG(1.e+6); exp+= 2;}
2516  else if (value< __NLG(1.e4)) {mult=__NLG(1.e+5); exp+= 3;}
2517  else if (value< __NLG(1.e5)) {mult=__NLG(1.e+4); exp+= 4;}
2518  else if (value< __NLG(1.e6)) {mult=__NLG(1.e+3); exp+= 5;}
2519  else if (value< __NLG(1.e7)) {mult=__NLG(1.e+2); exp+= 6;}
2520  else if (value< __NLG(1.e8)) {mult= __NLG(10.); exp+= 7;}
2521  else if (value< __NLG(1.e9)) {mult= __NLG(1.); exp+= 8;}
2522  else if (value<__NLG(1.e10)) {mult= __NLG(0.1); exp+= 9;}
2523  else if (value<__NLG(1.e11)) {mult=__NLG(1.e-2); exp+=10;}
2524  else if (value<__NLG(1.e12)) {mult=__NLG(1.e-3); exp+=11;}
2525  else if (value<__NLG(1.e13)) {mult=__NLG(1.e-4); exp+=12;}
2526  else if (value<__NLG(1.e14)) {mult=__NLG(1.e-5); exp+=13;}
2527  else if (value<__NLG(1.e15)) {mult=__NLG(1.e-6); exp+=14;}
2528  else if (value<__NLG(1.e16)) {mult=__NLG(1.e-7); exp+=15;}
2529  else {mult=__NLG(1.e-8); exp+=16;}
2530  } else {
2531  while (value<=__NLG(1.e-256))
2532  {value*=__NLG(1.e256); exp+=256;}
2533  if (value <= __NLG(1.e-16)) {
2534  if (value<=__NLG(1.e-240)) {value*=__NLG(1.e240); exp+=240;}
2535  else if (value<=__NLG(1.e-224)) {value*=__NLG(1.e224); exp+=224;}
2536  else if (value<=__NLG(1.e-208)) {value*=__NLG(1.e208); exp+=208;}
2537  else if (value<=__NLG(1.e-192)) {value*=__NLG(1.e192); exp+=192;}
2538  else if (value<=__NLG(1.e-176)) {value*=__NLG(1.e176); exp+=176;}
2539  else if (value<=__NLG(1.e-160)) {value*=__NLG(1.e160); exp+=160;}
2540  else if (value<=__NLG(1.e-144)) {value*=__NLG(1.e144); exp+=144;}
2541  else if (value<=__NLG(1.e-128)) {value*=__NLG(1.e128); exp+=128;}
2542  else if (value<=__NLG(1.e-112)) {value*=__NLG(1.e112); exp+=112;}
2543  else if (value<=__NLG(1.e-96)) {value*=__NLG(1.e96); exp+=96;}
2544  else if (value<=__NLG(1.e-80)) {value*=__NLG(1.e80); exp+=80;}
2545  else if (value<=__NLG(1.e-64)) {value*=__NLG(1.e64); exp+=64;}
2546  else if (value<=__NLG(1.e-48)) {value*=__NLG(1.e48); exp+=48;}
2547  else if (value<=__NLG(1.e-32)) {value*=__NLG(1.e32); exp+=32;}
2548  else if (value<=__NLG(1.e-16)) {value*=__NLG(1.e16); exp+=16;}
2549  }
2550  if (value<__NLG(1.e-15)) {mult=__NLG(1.e24); exp+=16;}
2551  else if (value<__NLG(1.e-14)) {mult=__NLG(1.e23); exp+=15;}
2552  else if (value<__NLG(1.e-13)) {mult=__NLG(1.e22); exp+=14;}
2553  else if (value<__NLG(1.e-12)) {mult=__NLG(1.e21); exp+=13;}
2554  else if (value<__NLG(1.e-11)) {mult=__NLG(1.e20); exp+=12;}
2555  else if (value<__NLG(1.e-10)) {mult=__NLG(1.e19); exp+=11;}
2556  else if (value<__NLG(1.e-9)) {mult=__NLG(1.e18); exp+=10;}
2557  else if (value<__NLG(1.e-8)) {mult=__NLG(1.e17); exp+=9;}
2558  else if (value<__NLG(1.e-7)) {mult=__NLG(1.e16); exp+=8;}
2559  else if (value<__NLG(1.e-6)) {mult=__NLG(1.e15); exp+=7;}
2560  else if (value<__NLG(1.e-5)) {mult=__NLG(1.e14); exp+=6;}
2561  else if (value<__NLG(1.e-4)) {mult=__NLG(1.e13); exp+=5;}
2562  else if (value<__NLG(1.e-3)) {mult=__NLG(1.e12); exp+=4;}
2563  else if (value<__NLG(1.e-2)) {mult=__NLG(1.e11); exp+=3;}
2564  else if (value<__NLG(1.e-1)) {mult=__NLG(1.e10); exp+=2;}
2565  else if (value<__NLG(1.)) {mult=__NLG(1.e9); exp+=1;}
2566  else {mult=__NLG(1.e8); }
2567  }
2568 
2569 // get all digits
// 'first' is forced into the 9-digit range [100000000, 999999999];
// 'second' holds the next 8 digits and is computed only for high precision.
2570  long double t1 = value * mult;
2571  if (t1 >= __NLG(1.e9)) {
2572  first = 999999999;
2573  } else if (t1 < __NLG(1.e8)) {
2574  first = 100000000;
2575  t1 = first;
2576  } else {
2577  first = (unsigned int)t1;
2578  }
2579  if (high_precision) {
2580  long double t2 = (t1-first) * __NLG(1.e8);
2581  if (t2 >= __NLG(1.e8)) {
2582  second = 99999999;
2583  } else {
2584  second = (unsigned int)t2;
2585  }
2586  }
2587 
2588 // convert them into string
// The caller's buffer is used directly only when it is larger than 20 bytes;
// otherwise the digits are assembled in 'tmp' and copied out at the end.
2589  bool use_ext_buffer = bufsize > 20;
2590  char tmp[32];
2591  char *digits = use_ext_buffer ? buffer : tmp;
2592  char *digits_end = s_ncbi_append_int2str(digits,first,9,false);
2593  if (high_precision) {
2594  digits_end = s_ncbi_append_int2str(digits_end,second,8,true);
2595  }
2596  size_t digits_len = digits_end - digits;
2597  size_t digits_got = digits_len;
2598  size_t digits_expected = high_precision ? 17 : 9;
2599 
2600 // get significant digits according to requested precision
2601  size_t pos = precision;
2602  if (digits_len > precision) {
2603  digits_len = precision;
2604 
2605  // this is questionable, but in fact,
2606  // improves the result (on average)
2607 #if 1
2608  if (high_precision) {
2609  if (digits[pos] == '4') {
2610  size_t pt = pos-1;
2611  while (pt != 0 && digits[--pt] == '9')
2612  ;
2613  if (pt != 0 && (pos-pt) > precision/2)
2614  digits[pos]='5';
2615  } else if (digits[pos] == '5') {
2616  size_t pt = pos;
2617  while (pt != 0 && digits[--pt] == '0')
2618  ;
2619  if (pt != 0 && (pos-pt) > precision/2)
2620  digits[pos]='4';
2621  }
2622  }
2623 #endif
2624 
// Round half up at the cut position, propagating the carry to the left.
2625  if (digits[pos] >= '5') {
2626  do {
2627  if (digits[--pos] < '9') {
2628  ++digits[pos++];
2629  break;
2630  }
2631  digits[pos]='0';
2632  } while (pos > 0);
// pos == 0 here means every kept digit was '9': the result collapses to
// a single '1' and the exponent is adjusted.
2633  if (pos == 0) {
2634  if (digits_expected <= digits_got) {
2635  if (exp_positive) {
2636  ++exp;
2637  } else {
2638 // exp cannot be 0, by design
2639  exp_positive = --exp == 0;
2640  }
2641  }
2642  *digits = '1';
2643  digits_len = 1;
2644  }
2645  }
2646  }
2647 
2648 // truncate trailing zeros
2649  for (pos = digits_len; pos-- > 0 && digits[pos] == '0';)
2650  --digits_len;
2651 
2652  *dec = exp_positive ? int(exp) : -int(exp);
2653 
2654  if (!use_ext_buffer) {
2655  if (digits_len <= bufsize) {
2656  strncpy(buffer,digits,digits_len);
2657  } else {
2658  NCBI_THROW2(CStringException, eConvert,
2659  "Destination buffer too small", 0);
2660  }
2661  }
2662  return digits_len;
2663 }
2664 #undef __NLG
2665 
2666 
2667 SIZE_TYPE NStr::DoubleToStringPosix(double val, unsigned int precision,
2668  char* buffer, SIZE_TYPE bufsize)
2669 {
2670  if (bufsize < precision+8) {
2671  NCBI_THROW2(CStringException, eConvert,
2672  "Destination buffer too small", 0);
2673  }
2674  int dec=0, sign=0;
2675  char digits[32];
2676  size_t digits_len = DoubleToString_Ecvt(
2677  val, precision, digits, sizeof(digits), &dec, &sign);
2678  if (digits_len == 0) {
2679  errno = 0;
2680  return 0;
2681  }
2682  if (val == 0.) {
2683  strncpy(buffer,digits, digits_len);
2684  return digits_len;
2685  }
2686  if (digits_len == 1 && dec == 0 && sign >=0) {
2687  *buffer = digits[0];
2688  errno = 0;
2689  return 1;
2690  }
2691  bool exp_positive = dec >= 0;
2692  unsigned int exp= (unsigned int)(exp_positive ? dec : (-dec));
2693 
2694  // assemble the result
2695  char *buffer_pos = buffer;
2696 // char *buffer_end = buffer + bufsize;
2697  char *digits_pos = digits;
2698 
2699  if (sign < 0) {
2700  *buffer_pos++ = '-';
2701  }
2702  // The 'e' format is used when the exponent of the value is less than -4
2703  // or greater than or equal to the precision argument
2704  if ((exp_positive && exp >= precision) || (!exp_positive && exp > 4)) {
2705  *buffer_pos++ = *digits_pos++;
2706  --digits_len;
2707  if (digits_len != 0) {
2708  *buffer_pos++ = '.';
2709  strncpy(buffer_pos,digits_pos,digits_len);
2710  buffer_pos += digits_len;
2711  }
2712  *buffer_pos++ = 'e';
2713  *buffer_pos++ = exp_positive ? '+' : '-';
2714 
2715 //#if defined(NCBI_OS_MSWIN)
2716 #if NCBI_COMPILER_MSVC && _MSC_VER < 1900
2717  bool need_zeros = true;
2718  size_t need_digits = 3;
2719 #else
2720  bool need_zeros = exp < 10 ? true : false;
2721  size_t need_digits = exp < 100 ? 2 : 3;
2722 #endif
2723  // assuming exp < 1000
2724  buffer_pos = s_ncbi_append_int2str(buffer_pos, exp, need_digits,need_zeros);
2725  } else if (exp_positive) {
2726  *buffer_pos++ = *digits_pos++;
2727  --digits_len;
2728  if (digits_len > exp) {
2729  strncpy(buffer_pos,digits_pos,exp);
2730  buffer_pos += exp;
2731  *buffer_pos++ = '.';
2732  strncpy(buffer_pos,digits_pos+exp,digits_len-exp);
2733  buffer_pos += digits_len-exp;
2734  } else {
2735  strncpy(buffer_pos,digits_pos,digits_len);
2736  buffer_pos += digits_len;
2737  exp -= (unsigned int)digits_len;
2738  while (exp--) {
2739  *buffer_pos++ = '0';
2740  }
2741  }
2742  } else {
2743  *buffer_pos++ = '0';
2744  *buffer_pos++ = '.';
2745  for (--exp; exp--;) {
2746  *buffer_pos++ = '0';
2747  }
2748  strncpy(buffer_pos,digits_pos, digits_len);
2749  buffer_pos += digits_len;
2750  }
2751  errno = 0;
2752  return buffer_pos - buffer;
2753 }
2754 
2755 
2756 string NStr::SizetToString(size_t value, TNumToStringFlags flags, int base)
2757 {
2758 #if (SIZEOF_SIZE_T > 4)
2759  return UInt8ToString(value, flags, base);
2760 #else
2761  return UIntToString(static_cast<unsigned int>(value), flags, base);
2762 #endif
2763 }
2764 
2765 
2766 string NStr::PtrToString(const void* value)
2767 {
2768  errno = 0;
2769  const int kBufSize = 64;
2770  char buffer[kBufSize];
2771  ::snprintf(buffer, kBufSize, "%p", value);
2772  return buffer;
2773 }
2774 
2775 
2776 void NStr::PtrToString(string& out_str, const void* value)
2777 {
2778  errno = 0;
2779  const int kBufSize = 64;
2780  char buffer[kBufSize];
2781  ::snprintf(buffer, kBufSize, "%p", value);
2782  out_str = buffer;
2783 }
2784 
2785 
2786 const void* NStr::StringToPtr(const CTempStringEx str, TStringToNumFlags flags)
2787 {
2788  errno = 0;
2789  void *ptr = NULL;
2790  int res;
2791  if ( str.HasZeroAtEnd() ) {
2792  res = ::sscanf(str.data(), "%p", &ptr);
2793  } else {
2794  res = ::sscanf(string(str).c_str(), "%p", &ptr);
2795  }
2796  if (res != 1) {
2797  if (flags & fConvErr_NoErrMessage) {
2798  CNcbiError::SetErrno(errno = EINVAL);
2799  } else {
2800  CNcbiError::SetErrno(errno = EINVAL, str);
2801  }
2802  return NULL;
2803  }
2804  return ptr;
// Textual spellings of boolean values.  s_kTrueString / s_kFalseString are
// what BoolToString() returns; the short and yes/no forms are alternative
// spellings of the same two values.
2808 static const char* s_kTrueString = "true";
2809 static const char* s_kFalseString = "false";
2810 static const char* s_kTString = "t";
2811 static const char* s_kFString = "f";
2812 static const char* s_kYesString = "yes";
2813 static const char* s_kNoString = "no";
2814 static const char* s_kYString = "y";
2815 static const char* s_kNString = "n";
2816 
2817 
2818 const string NStr::BoolToString(bool value)
2820  return value ? s_kTrueString : s_kFalseString;
2821 }
2822 
2823 
2825 {
2826  if ( str == "1" ||
2830  AStrEquiv(str, s_kYString, PNocase()) ) {
2831  errno = 0;
2832  return true;
2833  }
2834  if ( str == "0" ||
2838  AStrEquiv(str, s_kNString, PNocase()) ) {
2839  errno = 0;
2840  return false;
2841  }
2843  "String cannot be converted to bool", 0);
2844 }
2845 
2846 
2847 string NStr::FormatVarargs(const char* format, va_list args)
2848 {
2849 #ifdef HAVE_VASPRINTF
2850  char* s;
2851  int n = vasprintf(&s, format, args);
2852  if (n >= 0) {
2853  string str(s, n);
2854  free(s);
2855  return str;
2856  } else {
2857  return kEmptyStr;
2858  }
2859 
2860 #elif defined(HAVE_VSNPRINTF)
2861  // deal with implementation quirks
2862  SIZE_TYPE size = 1024;
2864  buf.get()[size-1] = buf.get()[size-2] = 0;
2865  SIZE_TYPE n = vsnprintf(buf.get(), size, format, args);
2866  while (n >= size || buf.get()[size-2]) {
2867  if (buf.get()[size-1]) {
2868  ERR_POST_X(1, Warning << "Buffer overrun by buggy vsnprintf");
2869  }
2870  size = max(size << 1, n);
2871  buf.reset(new char[size]);
2872  buf.get()[size-1] = buf.get()[size-2] = 0;
2873  n = vsnprintf(buf.get(), size, format, args);
2874  }
2875  return (n > 0) ? string(buf.get(), n) : kEmptyStr;
2876 
2877 #elif defined(HAVE_VPRINTF)
2878  char buf[1024];
2879  buf[sizeof(buf) - 1] = 0;
2880  vsprintf(buf, format, args);
2881  if (buf[sizeof(buf) - 1]) {
2882  ERR_POST_X(2, Warning << "Buffer overrun by vsprintf");
2883  }
2884  return buf;
2885 
2886 #else
2887 # error Please port this code to your system.
2888 #endif
2889 }
2890 
2891 
2893  const CTempString pattern,
2894  ECase use_case,
2895  EDirection direction,
2896  SIZE_TYPE occurence)
2897 {
2898  const SIZE_TYPE slen = str.length();
2899  const SIZE_TYPE plen = pattern.length();
2900  SIZE_TYPE current_occurence = 0;
2901  SIZE_TYPE pos = 0;
2902  SIZE_TYPE current_pos = 0; // saved position of last search
2903  SIZE_TYPE search_pos = 0; // next search position
2904 
2905  if (plen > slen) {
2906  return NPOS;
2907  }
2908 
2909  if (use_case == eCase) {
2910 
2911  if (direction == eForwardSearch) {
2912  do {
2913  pos = str.find(pattern, search_pos);
2914  if (pos == NPOS) {
2915  return NPOS;
2916  }
2917  current_pos = pos;
2918  search_pos = pos + plen;
2919  ++current_occurence;
2920  }
2921  while (current_occurence <= occurence);
2922 
2923  } else {
2924  _ASSERT(direction == eReverseSearch);
2925  search_pos = slen - plen;
2926  do {
2927  pos = str.rfind(pattern, search_pos);
2928  if (pos == NPOS) {
2929  return NPOS;
2930  }
2931  current_pos = pos;
2932  search_pos = (pos < plen) ? 0 : pos - plen;
2933  ++current_occurence;
2934  }
2935  while (current_occurence <= occurence);
2936  }
2937 
2938  } else {
2939  _ASSERT(use_case == eNocase);
2940 
2941  // A set of lower/upper characters for pattern[0].
2942  string x_first(pattern, 0, 1);
2943  if (isupper((unsigned char)x_first[0])) {
2944  x_first += (char)tolower((unsigned char)x_first[0]);
2945  } else if (islower((unsigned char)x_first[0])) {
2946  x_first += (char)toupper((unsigned char)x_first[0]);
2947  }
2948 
2949  if (direction == eForwardSearch) {
2950  do {
2951  pos = str.find_first_of(x_first, search_pos);
2952  while (pos != NPOS) {
2953  if ( (pos + plen) > slen ) {
2954  return NPOS;
2955  }
2956  if ( CompareNocase(str, pos, plen, pattern) == 0 ) {
2957  break;
2958  }
2959  pos = str.find_first_of(x_first, pos + 1);
2960  }
2961  if (pos > slen) {
2962  return NPOS;
2963  }
2964  current_pos = pos;
2965  search_pos = pos + plen;
2966  ++current_occurence;
2967  }
2968  while (current_occurence <= occurence);
2969 
2970  } else {
2971  _ASSERT(direction == eReverseSearch);
2972  search_pos = slen - plen;
2973  do {
2974  pos = str.find_last_of(x_first, search_pos);
2975  while (pos != NPOS && pos
2976  && CompareNocase(str, pos, plen, pattern) != 0) {
2977  if (pos == 0) {
2978  return NPOS;
2979  }
2980  pos = str.find_last_of(x_first, pos - 1);
2981  }
2982  current_pos = pos;
2983  search_pos = (pos < plen) ? 0 : pos - plen;
2984  ++current_occurence;
2985  }
2986  while (current_occurence <= occurence);
2987  }
2988  }
2989  return current_pos;
2990 }
2991 
2992 
2993 // @deprecated
2995  SIZE_TYPE start, SIZE_TYPE end, EOccurrence where)
2996 {
2997  string pat(pattern, 0, 1);
2998  SIZE_TYPE l = pattern.size();
2999  if (isupper((unsigned char) pat[0])) {
3000  pat += (char) tolower((unsigned char) pat[0]);
3001  } else if (islower((unsigned char) pat[0])) {
3002  pat += (char) toupper((unsigned char) pat[0]);
3003  }
3004 
3005  if (where == eFirst) {
3006  SIZE_TYPE pos = str.find_first_of(pat, start);
3007  while (pos != NPOS && (pos + l) <= end
3008  && CompareNocase(str, pos, l, pattern) != 0) {
3009  pos = str.find_first_of(pat, pos + 1);
3010  }
3011  return pos > end ? NPOS : pos;
3012 
3013  } else { // eLast
3014  SIZE_TYPE pos = str.find_last_of(pat, end);
3015  while (pos != NPOS && pos >= start
3016  && CompareNocase(str, pos, l, pattern) != 0) {
3017  if (pos == 0) {
3018  return NPOS;
3019  }
3020  pos = str.find_last_of(pat, pos - 1);
3021  }
3022  return pos < start ? NPOS : pos;
3023  }
3024 }
3025 
3026 
3027 const string* NStr::Find(const list <string>& lst, const CTempString val,
3028  ECase use_case)
3029 {
3030  if (lst.empty()) return NULL;
3031  ITERATE (list<string>, st_itr, lst) {
3032  if (Equal(*st_itr, val, use_case)) {
3033  return &*st_itr;
3034  }
3035  }
3036  return NULL;
3037 }
3038 
3039 const string* NStr::Find(const vector <string>& vec, const CTempString val,
3040  ECase use_case)
3041 {
3042  if (vec.empty()) return NULL;
3043  ITERATE (vector<string>, st_itr, vec) {
3044  if (Equal(*st_itr, val, use_case)) {
3045  return &*st_itr;
3046  }
3047  }
3048  return NULL;
3049 }
3051 
/// @internal
// Check whether 'ch' is a word-boundary character, i.e. one that does not
// match [a-zA-Z0-9_].
static inline
bool s_IsWordBoundaryChar(char ch)
{
    return !(ch == '_' || isalnum((unsigned char)ch));
}
3059 
3060 
3062  const CTempString word,
3063  ECase use_case,
3064  EDirection direction)
3065 {
3066  const SIZE_TYPE slen = str.length();
3067  const SIZE_TYPE plen = word.length();
3068 
3069  SIZE_TYPE start = 0;
3070  SIZE_TYPE end = slen;
3071 
3072  SIZE_TYPE pos = Find(str, word, use_case, direction);
3073 
3074  while (pos != NPOS) {
3075  // Check word boundaries
3076  if ( ((pos == 0) || s_IsWordBoundaryChar(str[pos-1])) &&
3077  ((pos + plen == slen) || s_IsWordBoundaryChar(str[pos+plen])) ) {
3078  return pos;
3079  }
3080  // Find next occurrence
3081  if (direction == eForwardSearch) {
3082  if (pos + plen == slen) {
3083  return NPOS;
3084  }
3085  ++start;
3086  } else {
3087  if (pos == 0) {
3088  return NPOS;
3089  }
3090  --end;
3091  }
3092  pos = Find(CTempString(str, start, end - start), word, use_case, direction);
3093  if (pos != NPOS) {
3094  // update position: from start of the string "str"
3095  pos += start;
3096  }
3097  }
3098  return pos;
3099 }
3100 
3101 
3103 {
3104  const SIZE_TYPE len1 = s1.length();
3105  const SIZE_TYPE len2 = s2.length();
3106 
3107  // Eliminate the null case
3108  if (len1 == 0 || len2 == 0) {
3109  return 0;
3110  }
3111  SIZE_TYPE len = min(len1, len2);
3112 
3113  // Truncate the longer string
3114  CTempString t1, t2;
3115  if (len1 > len2) {
3116  t1 = s1.substr(len1-len, len);
3117  t2 = s2;
3118  } else {
3119  t1 = s1;
3120  t2 = s2.substr(0, len);
3121  }
3122  // Quick check for the worst case
3123  if (memcmp(t1.data(), t2.data(), len) == 0) {
3124  return len;
3125  }
3126 
3127  // Start by looking for a single character match
3128  // and increase length until no match is found.
3129  // Performance analysis: http://neil.fraser.name/news/2010/11/04/
3130  SIZE_TYPE best = 0;
3131  SIZE_TYPE n = 1;
3132  for (;;) {
3133  // Right 'n' symbols of 't1'
3134  CTempString pattern(t1.data() + len - n, n);
3135  SIZE_TYPE pos = t2.find(pattern);
3136  if (pos == NPOS) {
3137  return best;
3138  }
3139  n += pos;
3140  if (pos == 0 || memcmp(t1.data() + len - n, t2.data(), n) == 0) {
3141  best = n;
3142  n++;
3143  }
3144  }
3145  // Unreachable
3146  return best;
3147 }
3148 
3149 
3150 template <class TStr>
3151 TStr s_TruncateSpaces(const TStr& str, NStr::ETrunc where,
3152  const TStr& empty_str)
3153 {
3154  SIZE_TYPE length = str.length();
3155  if (length == 0) {
3156  return empty_str;
3157  }
3158  SIZE_TYPE beg = 0;
3159  if (where == NStr::eTrunc_Begin || where == NStr::eTrunc_Both) {
3160  _ASSERT(beg < length);
3161  while ( isspace((unsigned char) str[beg]) ) {
3162  if (++beg == length) {
3163  return empty_str;
3164  }
3165  }
3166  }
3167  SIZE_TYPE end = length;
3168  if ( where == NStr::eTrunc_End || where == NStr::eTrunc_Both ) {
3169  _ASSERT(beg < end);
3170  while (isspace((unsigned char) str[--end])) {
3171  if (beg == end) {
3172  return empty_str;
3173  }
3174  }
3175  _ASSERT(beg <= end && !isspace((unsigned char) str[end]));
3176  ++end;
3177  }
3178  _ASSERT(beg < end && end <= length);
3179  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3180  return str.substr(beg, end - beg);
3181  }
3182  else {
3183  return str;
3184  }
3185 }
3186 
3187 string NStr::TruncateSpaces(const string& str, ETrunc where)
3188 {
3189  return s_TruncateSpaces(str, where, kEmptyStr);
3190 }
3191 
3193 {
3194  return s_TruncateSpaces(str, where, CTempString());
3195 }
3196 
3198 {
3199  str = s_TruncateSpaces(str, where, CTempString());
3200 }
3201 
3202 void NStr::TruncateSpacesInPlace(string& str, ETrunc where)
3203 {
3204  SIZE_TYPE length = str.length();
3205  if (length == 0) {
3206  return;
3207  }
3208  SIZE_TYPE beg = 0;
3209  if ( where == eTrunc_Begin || where == eTrunc_Both ) {
3210  // It's better to use str.data()[] to check string characters
3211  // to avoid implicit modification of the string by non-const operator[]
3212  _ASSERT(beg < length);
3213  while ( isspace((unsigned char) str.data()[beg]) ) {
3214  if (++beg == length) {
3215  str.erase();
3216  return;
3217  }
3218  }
3219  }
3220 
3221  SIZE_TYPE end = length;
3222  if ( where == eTrunc_End || where == eTrunc_Both ) {
3223  // It's better to use str.data()[] to check string characters
3224  // to avoid implicit modification of the string by non-const operator[]
3225  _ASSERT(beg < end);
3226  while (isspace((unsigned char) str.data()[--end])) {
3227  if (beg == end) {
3228  str.erase();
3229  return;
3230  }
3231  }
3232  _ASSERT(beg <= end && !isspace((unsigned char) str.data()[end]));
3233  ++end;
3234  }
3235  _ASSERT(beg < end && end <= length);
3236 
3237  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3238  str.replace(0, length, str, beg, end - beg);
3239  }
3240 }
3241 
3242 
3243 void NStr::TrimPrefixInPlace(string& str, const CTempString prefix,
3244  ECase use_case)
3245 {
3246  if (!str.length() ||
3247  !prefix.length() ||
3248  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3249  return;
3250  }
3251  str.erase(0, prefix.length());
3252 }
3253 
3254 
3256  ECase use_case)
3257 {
3258  if (!str.length() ||
3259  !prefix.length() ||
3260  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3261  return;
3262  }
3263  str.assign(str.data() + prefix.length(), str.length() - prefix.length());
3264 }
3265 
3266 
3268  ECase use_case)
3269 {
3270  if (!str.length() ||
3271  !prefix.length() ||
3272  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3273  return str;
3274  }
3275  return CTempString(str.data() + prefix.length(), str.length() - prefix.length());
3276 }
3277 
3278 
3279 void NStr::TrimSuffixInPlace(string& str, const CTempString suffix,
3280  ECase use_case)
3281 {
3282  if (!str.length() ||
3283  !suffix.length() ||
3284  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3285  return;
3286  }
3287  str.erase(str.length() - suffix.length());
3288 }
3289 
3290 
3292  ECase use_case)
3293 {
3294  if (!str.length() ||
3295  !suffix.length() ||
3296  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3297  return;
3298  }
3299  str.erase(str.length() - suffix.length());
3300 }
3301 
3302 
3304  ECase use_case)
3305 {
3306  if (!str.length() ||
3307  !suffix.length() ||
3308  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3309  return str;
3310  }
3311  return CTempString(str.data(), str.length() - suffix.length());
3312 }
3313 
3314 
3315 string& NStr::Replace(const string& src,
3316  const string& search, const string& replace,
3317  string& dst, SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3318  SIZE_TYPE* num_replace)
3319 {
3320  // source and destination should not be the same
3321  if (&src == &dst) {
3322  NCBI_THROW2(CStringException, eBadArgs,
3323  "NStr::Replace(): source and destination are the same", 0);
3324  }
3325  if (num_replace)
3326  *num_replace = 0;
3327  if (start_pos + search.size() > src.size() || search == replace) {
3328  dst = src;
3329  return dst;
3330  }
3331 
3332  // Use different algorithms depending on size or 'search' and 'replace'
3333  // for better performance (and for big strings only! > 16KB).
3334 
3335  if (replace.size() > search.size() && src.size() > 16*1024) {
3336  // Replacing string is longer -- worst case.
3337  // Try to avoid memory reallocations inside std::string.
3338  // Count replacing strings first
3339  SIZE_TYPE n = 0;
3340  SIZE_TYPE start_orig = start_pos;
3341  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3342  start_pos = src.find(search, start_pos);
3343  if (start_pos == NPOS)
3344  break;
3345  n++;
3346  start_pos += search.size();
3347  }
3348  // Reallocate memory for destination string
3349  dst.resize(src.size() - n*search.size() + n*replace.size());
3350 
3351  // Use copy() to create destination string
3352  start_pos = start_orig;
3353  string::const_iterator src_start = src.begin();
3354  string::const_iterator src_end = src.begin();
3355  string::iterator dst_pos = dst.begin();
3356 
3357  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3358  start_pos = src.find(search, start_pos);
3359  if (start_pos == NPOS)
3360  break;
3361  // Copy from source string up to 'search'
3362  src_end = src.begin() + start_pos;
3363  copy(src_start, src_end, dst_pos);
3364  dst_pos += (src_end - src_start);
3365  // Append 'replace'
3366  copy(replace.begin(), replace.end(), dst_pos);
3367  dst_pos += replace.size();
3368  start_pos += search.size();
3369  src_start = src.begin() + start_pos;
3370  }
3371  // Copy source's string tail to the place
3372  copy(src_start, src.end(), dst_pos);
3373  if (num_replace)
3374  *num_replace = n;
3375 
3376  } else {
3377  // Replacing string is shorter or have the same length.
3378  // ReplaceInPlace() can be faster on some platform, but not much,
3379  // so we use regular algorithm even for equal lengths here.
3380  dst = src;
3381  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3382  start_pos = dst.find(search, start_pos);
3383  if (start_pos == NPOS)
3384  break;
3385  dst.replace(start_pos, search.size(), replace);
3386  start_pos += replace.size();
3387  if (num_replace)
3388  (*num_replace)++;
3389  }
3390  }
3391  return dst;
3392 }
3393 
3394 
3395 string NStr::Replace(const string& src,
3396  const string& search, const string& replace,
3397  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3398  SIZE_TYPE* num_replace)
3399 {
3400  string dst;
3401  Replace(src, search, replace, dst, start_pos, max_replace, num_replace);
3402  return dst;
3403 }
3404 
3405 
3406 string& NStr::ReplaceInPlace(string& src,
3407  const string& search, const string& replace,
3408  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3409  SIZE_TYPE* num_replace)
3410 {
3411  if ( num_replace )
3412  *num_replace = 0;
3413  if ( start_pos + search.size() > src.size() || search == replace )
3414  return src;
3415 
3416  bool equal_len = (search.size() == replace.size());
3417  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3418  start_pos = src.find(search, start_pos);
3419  if (start_pos == NPOS)
3420  break;
3421  // On some platforms string's replace() implementation
3422  // is not optimal if size of search and replace strings are equal
3423  if ( equal_len ) {
3424  copy(replace.begin(), replace.end(), src.begin() + start_pos);
3425  } else {
3426  src.replace(start_pos, search.size(), replace);
3427  }
3428  start_pos += replace.size();
3429  if (num_replace)
3430  (*num_replace)++;
3431  }
3432  return src;
3433 }
3434 
3435 
/// Common implementation behind the NStr::Split*() overloads.
///
/// Builds a CStrTokenize-based splitter over 'str' / 'delim' and lets it
/// fill 'arr' (and, through a position adapter, 'token_pos').
///
/// @param str        String to split.
/// @param delim      Delimiter specification, passed to the splitter as is.
/// @param arr        Container that receives the tokens.
/// @param flags      Split flags, passed to the splitter as is.
/// @param token_pos  Optional vector wrapped by the position adapter.
/// @param storage    Optional storage object handed to the splitter.
/// @return           Reference to 'arr'.
///
/// NOTE(review): 'TReserve' is used below but its typedef is not present in
/// this listing -- a line appears to be missing after TPosArray; confirm
/// against the original source.
3436 template<typename TString, typename TContainer>
3437 TContainer& s_Split(const TString& str, const TString& delim,
3438  TContainer& arr, NStr::TSplitFlags flags,
3439  vector<SIZE_TYPE>* token_pos,
3440  CTempString_Storage* storage = NULL)
3441 {
3442  typedef CStrTokenPosAdapter<vector<SIZE_TYPE> > TPosArray;
3444  typedef CStrTokenize<TString, TContainer, TPosArray,
3445  CStrDummyTokenCount, TReserve> TSplitter;
3446 
3447  TPosArray token_pos_proxy(token_pos);
3448  TSplitter splitter(str, delim, flags, storage);
3449  splitter.Do(arr, token_pos_proxy, kEmptyStr);
3450  return arr;
3451 }
3452 
// Argument check for split functions that take a 'storage' pointer:
// expects 'flags' and 'storage' to be in scope at the point of use and
// throws CStringException (eBadArgs) if fSplit_CanEscape or fSplit_CanQuote
// is requested while 'storage' is NULL.  'where' is the function name used
// in the error message.
3453 #define CHECK_SPLIT_TEMPSTRING_FLAGS(where) \
3454  { \
3455  if ((flags & (NStr::fSplit_CanEscape | NStr::fSplit_CanQuote)) && !storage) { \
3456  NCBI_THROW2(CStringException, eBadArgs, \
3457  "NStr::" #where "(): the selected flags require non-NULL storage", 0); \
3458  } \
3459 }
3460 
3461 
3462 list<string>& NStr::Split(const CTempString str, const CTempString delim,
3463  list<string>& arr, TSplitFlags flags,
3464  vector<SIZE_TYPE>* token_pos)
3465 {
3466  return s_Split(str, delim, arr, flags, token_pos);
3467 }
3468 
3469 vector<string>& NStr::Split(const CTempString str, const CTempString delim,
3470  vector<string>& arr, TSplitFlags flags,
3471  vector<SIZE_TYPE>* token_pos)
3472 {
3473  return s_Split(str, delim, arr, flags, token_pos);
3474 }
3475 
3476 list<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3477  list<CTempString>& arr, TSplitFlags flags,
3478  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3481  return s_Split(str, delim, arr, flags, token_pos, storage);
3482 }
3483 
3484 vector<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3485  vector<CTempString>& arr, TSplitFlags flags,
3486  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3489  return s_Split(str, delim, arr, flags, token_pos, storage);
3490 }
3491 
3492 list<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3493  list<CTempStringEx>& arr, TSplitFlags flags,
3494  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3497  return s_Split(str, delim, arr, flags, token_pos, storage);
3498 }
3499 
3500 vector<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3501  vector<CTempStringEx>& arr, TSplitFlags flags,
3502  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3505  return s_Split(str, delim, arr, flags, token_pos, storage);
3506 }
3507 
3508 list<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3509  list<string>& arr, TSplitFlags flags,
3510  vector<SIZE_TYPE>* token_pos)
3511 {
3512  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3513 }
3514 
3515 vector<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3516  vector<string>& arr, TSplitFlags flags,
3517  vector<SIZE_TYPE>* token_pos)
3518 {
3519  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3520 }
3521 
3522 list<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3523  list<CTempString>& arr, TSplitFlags flags,
3524  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3527  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3528 }
3529 
3530 vector<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3531  vector<CTempString>& arr, TSplitFlags flags,
3532  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3535  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3536 }
3537 
3538 list<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3539  list<CTempStringEx>& arr, TSplitFlags flags,
3540  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3543  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3544 }
3545 
3546 vector<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3547  vector<CTempStringEx>& arr, TSplitFlags flags,
3548  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3549 {
3551  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3552 }
3553 
3554 
3555 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3556  string& str1, string& str2, TSplitFlags flags)
3557 {
3558  CTempStringEx ts1, ts2;
3559  CTempString_Storage storage;
3560  bool result = SplitInTwo(str, delim, ts1, ts2, flags, &storage);
3561  str1 = ts1;
3562  str2 = ts2;
3563  return result;
3564 }
3565 
3566 
3567 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3568  CTempString& str1, CTempString& str2, TSplitFlags flags,
3569  CTempString_Storage* storage)
3570 {
3571  CTempStringEx ts1, ts2;
3572  bool result = SplitInTwo(str, delim, ts1, ts2, flags, storage);
3573  str1 = ts1;
3574  str2 = ts2;
3575  return result;
3576 }
3577 
3578 
// NStr::SplitInTwo -- core overload producing CTempStringEx halves.
// The splitter is advanced once to collect the first part into str1; the
// delimiter is then cleared and the splitter advanced again, so the second
// Advance() only does quote/escape parsing and its output is joined into str2.
// Returns true when the first Advance() stored a position into delim_pos
// (i.e. delim_pos != NPOS afterwards).
// NOTE(review): listing lines 3583-3586 and 3601 are missing from this view;
// TSplitter, used below, is not declared in the visible lines -- restore the
// missing lines from the repository copy.
3579 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3580  CTempStringEx& str1, CTempStringEx& str2,
3581  TSplitFlags flags, CTempString_Storage* storage)
3582 {
3587
3588  CTempStringList part_collector(storage);
3589  TSplitter splitter(str, delim, flags, storage);
3590  SIZE_TYPE delim_pos = NPOS; // stays NPOS unless Advance() overwrites it
3591
3592  // get first part
3593  splitter.Advance(&part_collector, NULL, &delim_pos);
3594  part_collector.Join(&str1);
3595  part_collector.Clear(); // reuse the collector for the second half
3596
3597  // don't need further splitting, just quote and escape parsing
3598  splitter.SetDelim(kEmptyStr);
3599  splitter.Advance(&part_collector);
3600  part_collector.Join(&str2);
3602  return delim_pos != NPOS;
3603 }
3604 
// Helper for Sanitize(): append `c` to the local `out` and remember it in the
// local `last`. Expands to two statements, so it must only be used inside a
// braced block (as every use below is).
3606 #define SS_ADD_CHAR(c) \
3607  out.push_back(c); \
3608  last = c;
3609
// NStr::Sanitize -- build a filtered copy of `str`.
//
// Each character is classified as allowed or rejected:
//   * character-class flags (fSS_alpha, fSS_digit, fSS_alnum, fSS_print,
//     fSS_cntrl, fSS_punct) select classes; fSS_Reject inverts the class match;
//   * a character found in `allow_chars` becomes allowed;
//   * a character found in `reject_chars` becomes rejected (checked last, so
//     it wins over the two rules above).
// If no class flag is set and both character lists are empty, fSS_print is
// used.
//
// Allowed spaces are dropped at the start of the output unless
// fSS_NoTruncate_Begin is set, and runs of them are merged unless fSS_NoMerge
// is set. Rejected characters are dropped when fSS_Remove is set, otherwise
// replaced by `reject_replacement` (consecutive replacements are merged unless
// fSS_NoMerge is set). Trailing spaces are removed unless fSS_NoTruncate_End
// is set.
//
// Returns the sanitized string.
// NOTE(review): the function's closing brace (listing line 3727) is missing
// from this view -- restore it from the repository copy.
3610 string NStr::Sanitize(CTempString str, CTempString allow_chars, CTempString reject_chars,
3611  char reject_replacement, TSS_Flags flags)
3612 {
3613  string out;
3614  out.reserve(str.size());
3615
3616  // Use fSS_print by default if no any other filter, including custom
3617  bool have_class = (flags & (fSS_alpha | fSS_digit | fSS_alnum | fSS_print | fSS_cntrl | fSS_punct)) > 0;
3618  if ( allow_chars.empty() && reject_chars.empty() && !have_class ) {
3619  flags |= fSS_print;
3620  have_class = true;
3621  }
3622
// have_allowed: set once a non-space allowed character or a replacement
//               character has been written; gates leading-space skipping.
// last:         last character appended to `out` (used for merging).
3623  bool have_allowed = false;
3624  char last = '\0';
3625
3626  for (char c : str) {
3627
3628  // Check against filters: character classes via flags, allowed chars, rejected chars.
3629  bool allowed = false;
3630  if ( have_class ) {
// Start from the "no class matched" answer; flipped below on a match.
3631  allowed = ((flags & fSS_Reject) != 0);
3632  if (((flags & fSS_print) && isprint((unsigned char)c)) ||
3633  ((flags & fSS_alnum) && isalnum((unsigned char)c)) ||
3634  ((flags & fSS_alpha) && isalpha((unsigned char)c)) ||
3635  ((flags & fSS_digit) && isdigit((unsigned char)c)) ||
3636  ((flags & fSS_cntrl) && iscntrl((unsigned char)c)) ||
3637  ((flags & fSS_punct) && ispunct((unsigned char)c)) ) {
3638
3639  // If matched and reverse logic -- treat char as rejected
3640  allowed = ((flags & fSS_Reject) == 0);
3641  }
3642  }
3643  else {
3644  // Special case: no any character class specified in flags
3645
3646  // If <allow_chars> and fSS_Reject flag, then no any character allowed except <allow_chars>
3647  // -- "allow" already FALSE, no need to check this;
3648  // -- <allow_chars> will be checked below.
3649
3650  // If <reject_chars> and no fSS_Reject flag, then all characters allowed except <reject_chars>.
3651  if (!reject_chars.empty() && ((flags & fSS_Reject) == 0)) {
3652  allowed = true;
3653  }
3654  // -- <reject_chars> will be checked below.
3655  }
// Explicit allow list can only promote, explicit reject list can only demote.
3656  if (!allowed && !allow_chars.empty() && allow_chars.find(c) != NPOS ) {
3657  allowed = true;
3658  }
3659  if (allowed && !reject_chars.empty() && reject_chars.find(c) != NPOS ) {
3660  allowed = false;
3661  }
3662
3663  // Good character?
3664  if ( allowed ) {
3665  // Special processing for allowed spaces.
3666  // Truncate leading spaces and merge if necessary
3667  if ( c == ' ' ) {
3668  if (!have_allowed && !(flags & fSS_NoTruncate_Begin)) {
3669  // Skip spaces at start of the string
3670  continue;
3671  }
3672  if (flags & fSS_NoMerge) {
3673  SS_ADD_CHAR(c);
3674  }
3675  else {
3676  // Merge spaces
3677  if (last != ' ') {
3678  SS_ADD_CHAR(c);
3679  }
3680  }
3681  }
3682  else {
3683  // Some other allowed character
3684  SS_ADD_CHAR(c);
3685  have_allowed = true;
3686  }
3687  continue;
3688  }
3689
3690  // Rejected
3691  if ( flags & fSS_Remove ) {
3692  continue;
3693  }
3694  // Special check on leading spaces, if <reject_replacement> is a space
3695  if (reject_replacement == ' ') {
3696  if (!have_allowed && !(flags & fSS_NoTruncate_Begin)) {
3697  // Skip spaces at start of the string
3698  continue;
3699  }
3700  }
3701  // Replace rejected character
3702  if (flags & fSS_NoMerge) {
3703  SS_ADD_CHAR(reject_replacement);
3704  have_allowed = true;
3705  }
3706  else {
3707  // Merge rejected
3708  if (last != reject_replacement) {
3709  SS_ADD_CHAR(reject_replacement);
3710  have_allowed = true;
3711  }
3712  }
3713  }
3714
3715  // Truncate trailing spaces if necessary
3716  if (last == ' ' && !(flags & fSS_NoTruncate_End)) {
// `last` is ' ' here, so this finds the last non-space character.
3717  SIZE_TYPE pos = out.find_last_not_of(last);
3718  if (pos == NPOS) {
3719  out.clear();
3720  }
3721  else {
3722  out.resize(pos+1);
3723  }
3724  }
3725
3726  return out;
3728 
3730 
// Target-language selector for s_PrintableString().
// NOTE(review): the enumerator lines (listing lines 3732-3733) are missing
// from this view; s_PrintableString() refers to eLanguage_C and
// eLanguage_Javascript, which are presumably declared here -- restore them
// from the repository copy.
3731 enum ELanguage {
3734 };
3735 
3736 
// s_PrintableString -- return `str` with special characters written as
// backslash escapes.
//
// The input is scanned once. Characters that need no escaping are skipped
// with `continue`; they are copied later in bulk as the run [j, i). A
// character that leaves the switch via `break` is escaped: the pending run is
// flushed, a backslash is written, followed by a mnemonic letter, the
// character itself, or an octal code.
//
// The output stream is created lazily on the first escape; if nothing needed
// escaping, a copy of the original string is returned.
//
// NOTE(review): listing line 3738 is missing from this view; the body uses a
// `mode` variable (tested against NStr::fNewLine_Passthru,
// NStr::fNonAscii_Quote and NStr::fPrintable_Full) that is presumably the
// parameter declared on that line -- restore it from the repository copy.
3737 static string s_PrintableString(const CTempString str,
3739  ELanguage lang)
3740 {
3741  unique_ptr<CNcbiOstrstream> out;
// i: current index; j: start of the not-yet-written run of plain characters.
3742  SIZE_TYPE i, j = 0;
3743
3744  for (i = 0; i < str.size(); ++i) {
3745  bool octal = false;
3746  char c = str[i];
3747  switch (c) {
3748  case '\a':
// "\a" is only written for C; other languages get an octal code.
3749  if (lang == eLanguage_C)
3750  c = 'a';
3751  else
3752  octal = true;
3753  break;
3754  case '\b':
3755  c = 'b';
3756  break;
3757  case '\f':
3758  c = 'f';
3759  break;
3760  case '\r':
3761  c = 'r';
3762  break;
3763  case '\t':
3764  c = 't';
3765  break;
3766  case '\v':
3767  c = 'v';
3768  break;
3769  case '\n':
// Without fNewLine_Passthru the newline becomes "\n"; with it, `c` stays
// '\n' and is handled by the dedicated branch after the switch.
3770  if (!(mode & NStr::fNewLine_Passthru))
3771  c = 'n';
3772  /*FALLTHRU*/
3773  case '\\':
3774  case '\'':
3775  case '"':
3776  break;
3777  case '&':
// '&' is escaped only for JavaScript output.
3778  if (lang == eLanguage_Javascript)
3779  break;
3780  continue;
3781  case '?':
// For C output, a '?' adjacent to another '?' is escaped
// (presumably to avoid forming trigraphs).
3782  if (lang == eLanguage_C) {
3783  if (i && str[i - 1] == '?')
3784  break;
3785  if (i < str.size() - 1 && str[i + 1] == '?')
3786  break;
3787  }
3788  continue;
3789  default:
3790  if (!isascii((unsigned char) c)) {
3791  if (mode & NStr::fNonAscii_Quote) {
3792  octal = true;
3793  break;
3794  }
3795  }
3796  if (!isprint((unsigned char) c)) {
3797  octal = true;
3798  break;
3799  }
3800  continue;
3801  }
// From here on the current character is being escaped.
3802  if (!out.get()) {
3803  out.reset(new CNcbiOstrstream);
3804  }
3805  if (i > j) {
3806  out->write(str.data() + j, i - j);
3807  }
3808  out->put('\\');
3809  if (c == '\n') {
// Pass-through newline: writes 'n', a backslash and a real newline after the
// backslash already emitted above.
3810  out->write("n\\\n", 3);
3811  } else if (octal) {
// `reduce` permits dropping leading zero digits; it is only allowed when the
// next input character is not an octal digit (or there is none) and
// fPrintable_Full is not requested.
3812  bool reduce;
3813  if (!(mode & NStr::fPrintable_Full)) {
3814  reduce = (i == str.size() - 1 ||
3815  str[i + 1] < '0' || '7' < str[i + 1] ? true : false);
3816  } else {
3817  reduce = false;
3818  }
3819  unsigned char v;
3820  char val[3];
3821  int k = 0;
3822  v = (unsigned char) c >> 6;
3823  if (v || !reduce) {
3824  val[k++] = char('0' + v);
// Once a leading digit is written, the following digits must be kept.
3825  reduce = false;
3826  }
3827  v = ((unsigned char) c >> 3) & 7;
3828  if (v || !reduce) {
3829  val[k++] = char('0' + v);
3830  }
3831  v = (unsigned char) c & 7;
3832  val[k++] = char('0' + v);
3833  out->write(val, k);
3834  } else {
3835  out->put(c);
3836  }
3837  j = i + 1;
3838  }
// Flush the trailing plain run; j != 0 implies at least one escape was
// written, hence the stream exists.
3839  if (j && i > j) {
3840  _ASSERT(out.get());
3841  out->write(str.data() + j, i - j);
3842  }
3843  if (out.get()) {
3844  // Return encoded string
3845  return CNcbiOstrstreamToString(*out);
3846  }
3847
3848  // All characters are good - return (a copy of) the original string
3849  return str;
3850 }
3851 
3852 
3853 string NStr::Escape(const CTempString str, const CTempString metacharacters, char escape_char)
3854 {
3855  string out;
3856  if ( str.empty() ) {
3857  return out;
3858  }
3859  out.reserve(str.size() * 2); // maximum size for a new string (have all metacharacters)
3860 
3861  for (char c : str) {
3862  if (c == escape_char || metacharacters.find(c) != NPOS) {
3863  out += escape_char;
3864  }
3865  out += c;
3866  }
3867  return out;
3868 }
3869 
3870 
3871 string NStr::Unescape(const CTempString str, char escape_char)
3872 {
3873  string out;
3874  if ( str.empty() ) {
3875  return out;
3876  }
3877  out.reserve(str.size());
3878  bool escaped = false;
3879 
3880  for (char c : str) {
3881  if (escaped) {
3882  out += c;
3883  escaped = false;
3884  }
3885  else {
3886  if (c == escape_char) {
3887  escaped = true;
3888  }
3889  else {
3890  out += c;
3891  }
3892  }
3893  }
3894  return out;
3895 }
3896 
3897 
// NStr::Quote -- wrap `str` in `quote_char`, prefixing every embedded
// `quote_char` or `escape_char` with `escape_char`.
// An empty input yields an empty string (no quotes are added).
// NOTE(review): listing line 3914 is missing from this view -- check the
// repository copy for its content.
3898 string NStr::Quote(const CTempString str, char quote_char, char escape_char)
3899 {
3900  string out;
3901  if (str.empty()) {
3902  return out;
3903  }
// NOTE(review): the worst case is 2 * size + 2 once the two enclosing quote
// characters are counted, so this reservation can fall two characters short.
3904  out.reserve(str.size() * 2); // maximum size for a new string
3905
3906  out.push_back(quote_char);
3907  for (char c : str) {
3908  if (c == quote_char || c == escape_char) {
3909  out += escape_char;
3910  }
3911  out += c;
3912  }
3913  out.push_back(quote_char);
3915  return out;
3916 }
3917 
3918 
3919 string NStr::Unquote(const CTempString str, char escape_char)
3920 {
3921  string out;
3922  if (str.empty()) {
3923  return out;
3924  }
3925  out.reserve(str.size());
3926  bool escaped = false;
3927  char quote_char = str[0];
3928 
3929  if (str.length() < 2 || str[str.length()-1] != quote_char) {
3930  NCBI_THROW2(CStringException, eFormat,
3931  "The source string must start and finish with the same character", 0);
3932  }
3933  // Remove first and last characters ("quotes")
3934  CTempString s(str, 1, str.length() - 2);
3935 
3936  for (char c : s) {
3937  if (escaped) {
3938  out += c;
3939  escaped = false;
3940  }
3941  else {
3942  if (c == escape_char) {
3943  escaped = true;
3944  }
3945  else {
3946  out += c;
3947  }
3948  }
3949  }
3950  return out;
3951 }
3952 
3953 
3957 }
3958 
3959 
3961 {
3962  return s_PrintableString(str,
3965 }
3966 
3967 
3968 string NStr::CEncode(const CTempString str, EQuoted quoted)
3969 {
3970  switch (quoted) {
3971  case eNotQuoted:
3972  return PrintableString(str);
3973  case eQuoted:
3974  return '"' + PrintableString(str) + '"';
3975  }
3976  _TROUBLE;
3977  // Unreachable
3978  return str;
3979 }
3980 
3981 
3982 string NStr::CParse(const CTempString str, EQuoted quoted)
3983 {
3984  if (quoted == eNotQuoted) {
3985  return ParseEscapes(str);
3986  }
3987  _ASSERT(quoted == eQuoted);
3988 
3989  SIZE_TYPE pos;
3990  SIZE_TYPE len = str.length();
3991  const char quote_char = '"';
3992 
3993  if (len < 2 || str[0] != quote_char || str[len-1] != quote_char) {
3994  NCBI_THROW2(CStringException, eFormat,
3995  "The source string must start and finish with a double quote", 0);
3996  }
3997 
3998  // Flag that next char is escaped, ignore it
3999  bool escaped = false;
4000  // We have a quote mark, start collect string chars
4001  bool collect = true;
4002  // Position of last quote
4003  SIZE_TYPE last_quote = 0;
4004 
4005  string out;
4006  out.reserve(str.size());
4007 
4008  for (pos = 1; pos < len; ++pos) {
4009  unsigned char ch = str[pos];
4010  if (ch == quote_char && !escaped) {
4011  // Have a substring
4012  CTempString sub(str.data() + last_quote + 1, pos - last_quote - 1);
4013  if (collect) {
4014  // Parse escape sequences and add it to result
4015  out += ParseEscapes(sub);
4016  } else {
4017  // Possible we have adjacent strings ("A""B").
4018  if (pos != last_quote + 1) {
4019  NCBI_THROW2(CStringException, eFormat,
4020  "Quoted string format error", pos);
4021  }
4022  }
4023  last_quote = pos;
4024  collect = !collect;
4025  } else {
4026  escaped = ch == '\\' ? !escaped : false;
4027  }
4028  }
4029  if (escaped || last_quote != len-1) {
4030  NCBI_THROW2(CStringException, eFormat,
4031  "Unterminated quoted string", str.length());
4032  }
4033  return out;
4034 }
4035 
4036 
// NStr::XmlEncode -- encode `str` for inclusion in XML text.
//
// The five predefined entities are substituted for & < > ' and ". With
// eXmlEnc_CommentSafe a trailing '-' and the second '-' of a "--" pair are
// written as "&#x2d;". Characters below 0x20 are written as hexadecimal
// character references; everything else is copied unchanged.
// Characters in the "unsafe" ranges listed below are skipped when
// eXmlEnc_Unsafe_Skip is set, otherwise a CStringException (eConvert) is
// thrown.
//
// NOTE(review): listing line 4083 is missing from this view; judging by the
// unmatched closing brace at line 4101 it presumably opened an `if` guarding
// the unsafe-character check -- restore it from the repository copy.
4037 string NStr::XmlEncode(const CTempString str, TXmlEncode flags)
4038 // http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent
4039 {
4040  string result;
4041  SIZE_TYPE i;
4042
4043  // wild guess...
4044  result.reserve(str.size());
4045
4046  for (i = 0; i < str.size(); i++) {
4047  char c = str[i];
4048  switch ( c ) {
4049  case '&':
4050  result.append("&amp;");
4051  break;
4052  case '<':
4053  result.append("&lt;");
4054  break;
4055  case '>':
4056  result.append("&gt;");
4057  break;
4058  case '\'':
4059  result.append("&apos;");
4060  break;
4061  case '"':
4062  result.append("&quot;");
4063  break;
4064  case '-':
4065  // translate double hyphen and ending hyphen
4066  // http://www.w3.org/TR/xml11/#sec-comments
4067  if (flags & eXmlEnc_CommentSafe) {
4068  if (i+1 == str.size()) {
4069  result.append("&#x2d;");
4070  break;
4071  } else if (str[i+1] == '-') {
// Consume both hyphens: the first is kept, the second is encoded.
4072  ++i;
4073  result.append(1, c).append("&#x2d;");
4074  break;
4075  }
4076  }
4077  result.append(1, c);
4078  break;
4079
4080  default:
// NOTE(review): `c` is a plain char. Where char is signed, bytes >= 0x80
// convert to very large unsigned values here, so of the two upper ranges
// tested below only 0x7F can match -- confirm whether a cast through
// unsigned char was intended.
4081  unsigned int uc = (unsigned int)(c);
4082
4084  // Optional check on non-safe characters:
4085  // [#x1-#x8], [#xB-#xC], [#xE-#x1F], [#x7F-#x84], [#x86-#x9F]
4086  // https://www.w3.org/TR/xml11/#NT-Char
4087
// NOTE(review): the comment above lists #x1-#x8, while the test below uses
// `uc < 0x8`, i.e. 0x0-0x7 -- confirm which bound is intended.
4088  if ((uc < 0x8) || (uc == 0xB) || (uc == 0xC) ||
4089  (uc >= 0x0E && uc <=0x1F) ||
4090  (uc >= 0x7F && uc <=0x84) ||
4091  (uc >= 0x86 && uc <=0x9F) )
4092  {
4093  // Skip unsafe characters
4094  if (flags & eXmlEnc_Unsafe_Skip) {
4095  continue;
4096  }
4097  // else, throw
4098  NCBI_THROW2(CStringException, eConvert,
4099  "NStr::XmlEncode -- Unsafe character '0x" + NStr::NumericToString(c, 0, 16) + "'", i);
4100  }
4101  }
4102  // Default behavior
4103  if (uc < 0x20) {
// Write "&#xH;" or "&#xHH;" -- the high nibble is omitted when zero.
4104  const char* charmap = "0123456789abcdef";
4105  result.append("&#x");
4106  Uint1 ch = c;
4107  unsigned hi = ch >> 4;
4108  unsigned lo = ch & 0xF;
4109  if ( hi ) {
4110  result.append(1, charmap[hi]);
4111  }
4112  result.append(1, charmap[lo]).append(1, ';');
4113  } else {
4114  result.append(1, c);
4115  }
4116  break;
4117  }
4118  }
4119  return result;
4120 }
4121 
4122 
// NStr::HtmlEncode -- encode `str` for inclusion in HTML text.
//
// The input is read symbol by symbol through CUtf8::Decode(). < > ' and "
// become named entities, symbols below 0x20 and above 0x7F become hexadecimal
// character references, other symbols are copied. '&' becomes "&amp;" unless
// fHtmlEnc_SkipEntities is set and the '&' already starts something that
// looks like an entity ("&name;" or "&#digits;"), in which case only the '&'
// itself is written and the rest of the entity is copied by later iterations.
//
// NOTE(review): listing lines 4154, 4165 and 4178 are missing from this view;
// the closing braces at lines 4162, 4172 and 4181 have no visible opener, so
// those lines presumably opened `if` blocks -- restore them from the
// repository copy.
4123 string NStr::HtmlEncode(const CTempString str, THtmlEncode flags)
4124 {
4125  string result;
4126  SIZE_TYPE i;
// Position of the next ';' found so far; NPOS once no further ';' exists, so
// the search is not repeated for every '&'.
4127  SIZE_TYPE semicolon = 0;
4128
4129  // wild guess...
4130  result.reserve(str.size());
4131
4132  const char* begin = str.data();
4133  const char* end = begin + str.size();
4134  for ( const char* curr = begin; curr < end; ++curr ) {
// presumably Decode() advances `curr` over the continuation bytes of a
// multi-byte sequence -- TODO confirm against CUtf8::Decode
4135  TUnicodeSymbol c = CUtf8::Decode(curr);
4136  switch ( c ) {
4137  case '&':
4138  {{
4139  i = curr - begin;
4140  result.append("&");
4141  // Check on HTML entity
4142  bool is_entity = false;
4143  if ((flags & fHtmlEnc_SkipEntities) &&
4144  (i+2 < str.size()) && (semicolon != NPOS)) {
4145
4146  if ( i >= semicolon ) {
4147  semicolon = str.find(";", i+1);
4148  }
4149  if ( semicolon != NPOS ) {
4150  SIZE_TYPE len = semicolon - i;
4151  SIZE_TYPE p = i + 1;
4152  if (str[i+1] == '#') {
4153  // Check on numeric character reference encoding
4155  p++;
// NOTE(review): `len || len <= 4` is true for every value of len, so this
// length limit has no effect. `len && len <= 4` may have been intended, but
// that would reject references such as "&#8212;" -- confirm before changing.
4156  if (len || len <= 4) {
4157  for (; p < semicolon; ++p) {
4158  if (!isdigit((unsigned char)(str[p])))
4159  break;
4160  }
4161  }
4162  }
4163  } else {
4164  // Check on literal entity
4166  if (len && len <= 10) {
4167  for (; p < semicolon; ++p) {
4168  if (!isalpha((unsigned char)(str[p])))
4169  break;
4170  }
4171  }
4172  }
4173  }
// The scan reached the ';' only if every character in between qualified.
4174  is_entity = (p == semicolon);
4175  }
4176  }
4177  if ( is_entity ) {
4179  ERR_POST_X_ONCE(5, Info << "string \"" << str <<
4180  "\" contains HTML encoded entities");
4181  }
4182  } else {
// Plain ampersand: complete the "&" written above to "&amp;".
4183  result.append("amp;");
4184  }
4185  }}
4186  break;
4187  case '<':
4188  result.append("&lt;");
4189  break;
4190  case '>':
4191  result.append("&gt;");
4192  break;
4193  case '\'':
4194  result.append("&apos;");
4195  break;
4196  case '"':
4197  result.append("&quot;");
4198  break;
4199  default:
4200  if ((unsigned int)c < 0x20) {
// Write "&#xH;" or "&#xHH;" -- the high nibble is omitted when zero.
4201  const char* charmap = "0123456789abcdef";
4202  result.append("&#x");
4203  Uint1 ch = c;
4204  unsigned hi = ch >> 4;
4205  unsigned lo = ch & 0xF;
4206  if ( hi ) {
4207  result.append(1, charmap[hi]);
4208  }
4209  result.append(1, charmap[lo]).append(1, ';');
4210  } else if (c > 0x7F) {
4211  result.append("&#x").append( NStr::NumericToString(c, 0, 16)).append(1, ';');;
4212  } else {
4213  result.append(1, c);
4214  }
4215  break;
4216  }
4217  }
4218  return result;
4219 }
4220 
4221 
4222 // Character entity references
4223 // http://www.w3.org/TR/html4/sgml/entities.html
4224 // http://www.w3.org/TR/1998/REC-html40-19980424/charset.html#h-5.3
4225 // only some entities from here were added (those shifted to right):
4226 // http://dev.w3.org/html5/html-author/charref
4227 
// Table mapping Unicode code points (`u`) to HTML entity names (`s`, without
// the leading '&' and trailing ';').
// The table is terminated by a { 0, 0 } sentinel; the code in this file that
// walks it stops at the first entry whose `u` is 0.
// Entries are listed in ascending code-point order except for 8472 ("weierp")
// preceding 8465 ("image"); the visible users scan linearly, so ordering is
// not relied upon there.
4228 static struct tag_HtmlEntities
4229 {
4230  TUnicodeSymbol u; // Unicode code point
4231  const char* s; // entity name
4232 }
4233 const s_HtmlEntities[] = {
4234  { 9, "Tab" },
4235  { 10, "NewLine" },
4236  { 33, "excl" },
4237  { 34, "quot" },
4238  { 35, "num" },
4239  { 36, "dollar" },
4240  { 37, "percnt" },
4241  { 38, "amp" },
4242  { 39, "apos" },
4243  { 40, "lpar" },
4244  { 41, "rpar" },
4245  { 42, "ast" },
4246  { 43, "plus" },
4247  { 44, "comma" },
4248  { 46, "period" },
4249  { 47, "sol" },
4250  { 58, "colon" },
4251  { 59, "semi" },
4252  { 60, "lt" },
4253  { 61, "equals" },
4254  { 62, "gt" },
4255  { 63, "quest" },
4256  { 64, "commat" },
4257  { 91, "lsqb" },
4258  { 92, "bsol" },
4259  { 93, "rsqb" },
4260  { 94, "Hat" },
4261  { 95, "lowbar" },
4262  { 96, "grave" },
4263  { 123, "lcub" },
4264  { 124, "verbar" },
4265  { 125, "rcub" },
4266  { 160, "nbsp" },
4267  { 161, "iexcl" },
4268  { 162, "cent" },
4269  { 163, "pound" },
4270  { 164, "curren" },
4271  { 165, "yen" },
4272  { 166, "brvbar" },
4273  { 167, "sect" },
4274  { 168, "uml" },
4275  { 169, "copy" },
4276  { 170, "ordf" },
4277  { 171, "laquo" },
4278  { 172, "not" },
4279  { 173, "shy" },
4280  { 174, "reg" },
4281  { 175, "macr" },
4282  { 176, "deg" },
4283  { 177, "plusmn" },
4284  { 178, "sup2" },
4285  { 179, "sup3" },
4286  { 180, "acute" },
4287  { 181, "micro" },
4288  { 182, "para" },
4289  { 183, "middot" },
4290  { 184, "cedil" },
4291  { 185, "sup1" },
4292  { 186, "ordm" },
4293  { 187, "raquo" },
4294  { 188, "frac14" },
4295  { 189, "frac12" },
4296  { 190, "frac34" },
4297  { 191, "iquest" },
4298  { 192, "Agrave" },
4299  { 193, "Aacute" },
4300  { 194, "Acirc" },
4301  { 195, "Atilde" },
4302  { 196, "Auml" },
4303  { 197, "Aring" },
4304  { 198, "AElig" },
4305  { 199, "Ccedil" },
4306  { 200, "Egrave" },
4307  { 201, "Eacute" },
4308  { 202, "Ecirc" },
4309  { 203, "Euml" },
4310  { 204, "Igrave" },
4311  { 205, "Iacute" },
4312  { 206, "Icirc" },
4313  { 207, "Iuml" },
4314  { 208, "ETH" },
4315  { 209, "Ntilde" },
4316  { 210, "Ograve" },
4317  { 211, "Oacute" },
4318  { 212, "Ocirc" },
4319  { 213, "Otilde" },
4320  { 214, "Ouml" },
4321  { 215, "times" },
4322  { 216, "Oslash" },
4323  { 217, "Ugrave" },
4324  { 218, "Uacute" },
4325  { 219, "Ucirc" },
4326  { 220, "Uuml" },
4327  { 221, "Yacute" },
4328  { 222, "THORN" },
4329  { 223, "szlig" },
4330  { 224, "agrave" },
4331  { 225, "aacute" },
4332  { 226, "acirc" },
4333  { 227, "atilde" },
4334  { 228, "auml" },
4335  { 229, "aring" },
4336  { 230, "aelig" },
4337  { 231, "ccedil" },
4338  { 232, "egrave" },
4339  { 233, "eacute" },
4340  { 234, "ecirc" },
4341  { 235, "euml" },
4342  { 236, "igrave" },
4343  { 237, "iacute" },
4344  { 238, "icirc" },
4345  { 239, "iuml" },
4346  { 240, "eth" },
4347  { 241, "ntilde" },
4348  { 242, "ograve" },
4349  { 243, "oacute" },
4350  { 244, "ocirc" },
4351  { 245, "otilde" },
4352  { 246, "ouml" },
4353  { 247, "divide" },
4354  { 248, "oslash" },
4355  { 249, "ugrave" },
4356  { 250, "uacute" },
4357  { 251, "ucirc" },
4358  { 252, "uuml" },
4359  { 253, "yacute" },
4360  { 254, "thorn" },
4361  { 255, "yuml" },
4362  { 338, "OElig" },
4363  { 339, "oelig" },
4364  { 352, "Scaron" },
4365  { 353, "scaron" },
4366  { 376, "Yuml" },
4367  { 402, "fnof" },
4368  { 710, "circ" },
4369  { 732, "tilde" },
4370  { 913, "Alpha" },
4371  { 914, "Beta" },
4372  { 915, "Gamma" },
4373  { 916, "Delta" },
4374  { 917, "Epsilon" },
4375  { 918, "Zeta" },
4376  { 919, "Eta" },
4377  { 920, "Theta" },
4378  { 921, "Iota" },
4379  { 922, "Kappa" },
4380  { 923, "Lambda" },
4381  { 924, "Mu" },
4382  { 925, "Nu" },
4383  { 926, "Xi" },
4384  { 927, "Omicron" },
4385  { 928, "Pi" },
4386  { 929, "Rho" },
4387  { 931, "Sigma" },
4388  { 932, "Tau" },
4389  { 933, "Upsilon" },
4390  { 934, "Phi" },
4391  { 935, "Chi" },
4392  { 936, "Psi" },
4393  { 937, "Omega" },
4394  { 945, "alpha" },
4395  { 946, "beta" },
4396  { 947, "gamma" },
4397  { 948, "delta" },
4398  { 949, "epsilon" },
4399  { 950, "zeta" },
4400  { 951, "eta" },
4401  { 952, "theta" },
4402  { 953, "iota" },
4403  { 954, "kappa" },
4404  { 955, "lambda" },
4405  { 956, "mu" },
4406  { 957, "nu" },
4407  { 958, "xi" },
4408  { 959, "omicron" },
4409  { 960, "pi" },
4410  { 961, "rho" },
4411  { 962, "sigmaf" },
4412  { 963, "sigma" },
4413  { 964, "tau" },
4414  { 965, "upsilon" },
4415  { 966, "phi" },
4416  { 967, "chi" },
4417  { 968, "psi" },
4418  { 969, "omega" },
4419  { 977, "thetasym" },
4420  { 978, "upsih" },
4421  { 982, "piv" },
4422  { 8194, "ensp" },
4423  { 8195, "emsp" },
4424  { 8201, "thinsp" },
4425  { 8204, "zwnj" },
4426  { 8205, "zwj" },
4427  { 8206, "lrm" },
4428  { 8207, "rlm" },
4429  { 8211, "ndash" },
4430  { 8212, "mdash" },
4431  { 8216, "lsquo" },
4432  { 8217, "rsquo" },
4433  { 8218, "sbquo" },
4434  { 8220, "ldquo" },
4435  { 8221, "rdquo" },
4436  { 8222, "bdquo" },
4437  { 8224, "dagger" },
4438  { 8225, "Dagger" },
4439  { 8226, "bull" },
4440  { 8230, "hellip" },
4441  { 8240, "permil" },
4442  { 8242, "prime" },
4443  { 8243, "Prime" },
4444  { 8249, "lsaquo" },
4445  { 8250, "rsaquo" },
4446  { 8254, "oline" },
4447  { 8260, "frasl" },
4448  { 8364, "euro" },
4449  { 8472, "weierp" },
4450  { 8465, "image" },
4451  { 8476, "real" },
4452  { 8482, "trade" },
4453  { 8501, "alefsym" },
4454  { 8592, "larr" },
4455  { 8593, "uarr" },
4456  { 8594, "rarr" },
4457  { 8595, "darr" },
4458  { 8596, "harr" },
4459  { 8629, "crarr" },
4460  { 8656, "lArr" },
4461  { 8657, "uArr" },
4462  { 8658, "rArr" },
4463  { 8659, "dArr" },
4464  { 8660, "hArr" },
4465  { 8704, "forall" },
4466  { 8706, "part" },
4467  { 8707, "exist" },
4468  { 8709, "empty" },
4469  { 8711, "nabla" },
4470  { 8712, "isin" },
4471  { 8713, "notin" },
4472  { 8715, "ni" },
4473  { 8719, "prod" },
4474  { 8721, "sum" },
4475  { 8722, "minus" },
4476  { 8727, "lowast" },
4477  { 8730, "radic" },
4478  { 8733, "prop" },
4479  { 8734, "infin" },
4480  { 8736, "ang" },
4481  { 8743, "and" },
4482  { 8744, "or" },
4483  { 8745, "cap" },
4484  { 8746, "cup" },
4485  { 8747, "int" },
4486  { 8756, "there4" },
4487  { 8764, "sim" },
4488  { 8773, "cong" },
4489  { 8776, "asymp" },
4490  { 8800, "ne" },
4491  { 8801, "equiv" },
4492  { 8804, "le" },
4493  { 8805, "ge" },
4494  { 8834, "sub" },
4495  { 8835, "sup" },
4496  { 8836, "nsub" },
4497  { 8838, "sube" },
4498  { 8839, "supe" },
4499  { 8853, "oplus" },
4500  { 8855, "otimes" },
4501  { 8869, "perp" },
4502  { 8901, "sdot" },
4503  { 8968, "lceil" },
4504  { 8969, "rceil" },
4505  { 8970, "lfloor" },
4506  { 8971, "rfloor" },
4507  { 9001, "lang" },
4508  { 9002, "rang" },
4509  { 9674, "loz" },
4510  { 9824, "spades" },
4511  { 9827, "clubs" },
4512  { 9829, "hearts" },
4513  { 9830, "diams" },
4514  { 0, 0 } // sentinel -- must stay last
4515 };
4516 
// Body of the entity-name lookup: scans s_HtmlEntities linearly for the code
// point `uch` and returns the matching entity name, or kEmptyStr when the
// table has no entry for it.
// NOTE(review): the function's signature (listing line 4517) is missing from
// this view; the body implies a parameter named `uch` -- restore the
// signature from the repository copy.
4518 {
4519  const struct tag_HtmlEntities* p = s_HtmlEntities;
4520  for ( ; p->u != 0; ++p) { // stop at the { 0, 0 } sentinel
4521  if (uch == p->u) {
4522  return p->s;
4523  }
4524  }
4525  return kEmptyStr;
4526 }
4527 
// NStr::HtmlDecode -- replace HTML entities ("&name;") and numeric character
// references ("&#NNN;", "&#xHHH;") in `str` with their UTF-8 encoding.
//
// If `encoding` is eEncoding_Unknown it is guessed with CUtf8::GuessEncoding();
// a failed guess throws CStringException (eBadArgs). Bytes that are not part
// of a recognized reference are copied unchanged for UTF-8 / ASCII input and
// converted with CUtf8::AsUTF8() for any other encoding.
// If `result_flags` is non-null it receives the value of `result`.
//
// NOTE(review): listing lines 4545, 4585, 4591, 4622 and 4630 are missing
// from this view. `uch` is used below without a visible declaration, `result`
// is never modified in the visible lines, and the function's closing brace is
// absent -- restore the missing lines from the repository copy.
4528 string NStr::HtmlDecode(const CTempString str, EEncoding encoding, THtmlDecode* result_flags)
4529 {
4530  string ustr;
4531  THtmlDecode result = 0;
4532
4533  if (encoding == eEncoding_Unknown) {
4534  encoding = CUtf8::GuessEncoding(str);
4535  if (encoding == eEncoding_Unknown) {
4536  NCBI_THROW2(CStringException, eBadArgs,
4537  "Unable to guess the source string encoding", 0);
4538  }
4539  }
4540  // wild guess...
4541  ustr.reserve(str.size());
4542
4543  CTempString::const_iterator i, e = str.end();
4544  char ch;
4546
4547  for (i = str.begin(); i != e;) {
4548  ch = *(i++);
4549  //check for HTML entities and character references
4550  if (i != e && ch == '&') {
4551  CTempString::const_iterator start_of_entity, end_of_entity, itmp;
// end_of_entity == i is used below as the "no terminating ';' found" marker.
4552  end_of_entity = itmp = i;
4553  bool ent, dec, hex, parsed=false;
// Classify what follows the '&': a name (ent), "#digits" (dec) or
// "#x" + hex digits (hex). The probes advance `itmp` as a side effect.
4554  ent = isalpha((unsigned char)(*itmp)) != 0;
4555  dec = !ent && *itmp == '#' && ++itmp != e &&
4556  isdigit((unsigned char)(*itmp)) != 0;
// NOTE(review): this probe is not guarded by `!ent`. For a name starting
// with 'x' or 'X' (e.g. "&xi;", "&Xi;") `itmp` is advanced past that letter
// even though `hex` ends up false, so start_of_entity below points at the
// second letter and the table lookup is done with a truncated name --
// confirm and consider adding `!ent &&`.
4557  hex = !dec && itmp != e &&
4558  (*itmp == 'x' || *itmp == 'X') && ++itmp != e &&
4559  isxdigit((unsigned char)(*itmp)) != 0;
4560  start_of_entity = itmp;
4561
4562  if (itmp != e && (ent || dec || hex)) {
4563  // do not look too far
4564  for (int len=0; len<16 && itmp != e; ++len, ++itmp) {
4565  if (*itmp == '&' || *itmp == '#') {
4566  break;
4567  }
4568  if (*itmp == ';') {
4569  end_of_entity = itmp;
4570  break;
4571  }
// Each kind stays valid only while every character fits it.
4572  ent = ent && isalnum( (unsigned char)(*itmp)) != 0;
4573  dec = dec && isdigit( (unsigned char)(*itmp)) != 0;
4574  hex = hex && isxdigit((unsigned char)(*itmp)) != 0;
4575  }
4576  if (end_of_entity != i && (ent || dec || hex)) {
4577  uch = 0;
4578  if (ent) {
// Named entity: linear, case-sensitive lookup in s_HtmlEntities.
4579  string entity(start_of_entity, end_of_entity);
4580  const struct tag_HtmlEntities* p = s_HtmlEntities;
4581  for ( ; p->u != 0; ++p) {
4582  if (entity.compare(p->s) == 0) {
4583  uch = p->u;
4584  parsed = true;
4586  break;
4587  }
4588  }
4589  } else {
// Numeric reference: accumulate the decimal or hexadecimal value.
4590  parsed = true;
4592  for (itmp = start_of_entity; itmp != end_of_entity; ++itmp) {
4593  TUnicodeSymbol ud = *itmp;
4594  if (dec) {
4595  uch = 10 * uch + (ud - '0');
4596  } else if (hex) {
4597  if (ud >='0' && ud <= '9') {
4598  ud -= '0';
4599  } else if (ud >='a' && ud <= 'f') {
4600  ud -= 'a';
4601  ud += 10;
4602  } else if (ud >='A' && ud <= 'F') {
4603  ud -= 'A';
4604  ud += 10;
4605  }
4606  uch = 16 * uch + ud;
4607  }
4608  }
4609  }
4610  if (parsed) {
// Emit the decoded symbol and resume right after the ';'.
4611  ustr += CUtf8::AsUTF8(&uch,1);
4612  i = ++end_of_entity;
4613  continue;
4614  }
4615  }
4616  }
4617  }
4618  // no entity - append as is
4619  if (encoding == eEncoding_UTF8 || encoding == eEncoding_Ascii) {
4620  ustr.append( 1, ch );
4621  } else {
4623  ustr += CUtf8::AsUTF8(CTempString(&ch,1), encoding);
4624  }
4625  }
4626  if (result_flags) {
4627  *result_flags = result;
4628  }
4629  return ustr;
4631 
4632 
4633 // http://www.json.org/
4634 
4635 string NStr::JsonEncode(const CTempString str, EJsonEncode encoding)
4636 {
4637  string result;
4638  // wild guess...
4639  result.reserve(str.size()+2);
4640 
4641  auto encode_char = [&](char c)
4642  {
4643  static const char* charmap = "0123456789abcdef";
4644  result.append("\\u00");
4645  Uint1 ch = c;
4646  unsigned hi = ch >> 4;
4647  unsigned lo = ch & 0xF;
4648  result.append(1, charmap[hi]);
4649  result.append(1, charmap[lo]);
4650  };
4651 
4652  for (auto c : str) {
4653  switch ( c ) {
4654  case '"':
4655  result.append("\\\"");
4656  break;
4657  case '\\':
4658  result.append("\\\\");
4659  break;
4660  default:
4661  if ((unsigned int)c < 0x20) {
4662  // Control characters U+0000 through U+001F
4663  encode_char(c);
4664  } else {
4665  if (encoding == eJsonEnc_UTF8 && (unsigned int)c >= 0x80) {
4666  encode_char(c);
4667  } else {
4668  result.append(1, c);
4669  }
4670  }
4671  break;
4672  }
4673  }
4674  if (encoding == eJsonEnc_Quoted) {
4675  return '"' + result + '"';
4676  }
4677  return result;
4678 }
4679 
4680 
// NStr::ShellEncode -- quote `str` so that a shell reads it back as a single
// literal word.
//
// Strategy, in order:
//   1. any non-printable character: return $'...' around PrintableString(str);
//   2. non-empty and free of the listed shell metacharacters: return `str`
//      unchanged;
//   3. contains a single quote but none of " \ $ `: wrap in double quotes;
//   4. otherwise: wrap in single quotes, replacing each embedded single quote
//      with '\'' or '"'"', then delete unescaped '' pairs from the result.
// NOTE(review): listing line 4789 is missing from this view -- check the
// repository copy for its content.
4681 string NStr::ShellEncode(const string& str)
4682 {
4683  // 1. Special-case of non-printable characters. We have no choice and
4684  // must use BASH extensions if we want printable output.
4685  //
4686  // Aesthetic issue: Most people are not familiar with the BASH-only
4687  // quoting style. Avoid it as much as possible.
4688
4689  ITERATE ( string, it, str ) {
4690  if ( !isprint(Uchar(*it)) ) {
4691  return "$'" + NStr::PrintableString(str) + "'";
4692  }
4693  }
4694
4695  /////////////////////////////////////////////////////////////////////////
4696  // Bourne Shell quoting as IEEE-standard without special extensions.
4697  //
4698  // There are 3 basic ways to quote/escape in Bourne Shell:
4699  //
4700  // - Single-quotes. All characters (including non-printable
4701  // characters newlines, backslashes), are literal. There is no escape.
4702  // - Double-quotes. Need to escape some metacharacters, such as literal
4703  // escape (\), variable expansion ($) and command substitution (`).
4704  // - Escape without quotes. Use backslash.
4705  /////////////////////////////////////////////////////////////////////////
4706
4707  // 2. Non-empty printable string without meta-characters.
4708  //
4709  // Shell special characters, according to IEEE Std 1003.1,
4710  // plus ! (Bourne shell exit status negation and Bash history expansion),
4711  // braces (Bourne enhanced expansion), space, tab, and newline.
4712  //
4713  // See http://www.opengroup.org/onlinepubs/009695399/toc.htm
4714  // See Bourne and Bash man pages.
4715
4716  if (!str.empty() &&
4717  str.find_first_of("!{} \t\r\n[|&;<>()$`\"'*?#~=%\\") == NPOS) {
4718  return str;
4719  }
4720
4721  // 3. Printable string, but either empty or some shell meta-characters.
4722  //
4723  // Aesthetics preference:
4724  // i) If the string includes literal single-quotes, then prefer
4725  // double-quoting provided there is no need to escape embedded
4726  // literal double-quotes, escapes (\), variable substitution ($),
4727  // or command substitution (`).
4728
4729  if (str.find('\'') != NPOS &&
4730  str.find_first_of("\"\\$`") == NPOS) {
4731  return "\"" + str + "\"";
4732  }
4733
4734  // Use single-quoting. The only special case for Bourne shell
4735  // single-quoting is a literal single-quote, which needs to
4736  // be pulled out of the quoted region.
4737  //
4738  // Single-quoting does not have any escape character, so close
4739  // the quoted string ('), then emit an escaped or quoted literal
4740  // single-quote (\' or "'"), and resume the quoted string (').
4741  //
4742  // Aesthetics preferences:
4743  // ii) Prefer single-quoting over escape characters, especially
4744  // escaped whitespace. However, this is in compromise to optimal
4745  // quoting: if there are many literal single-quotes and the
4746  // use of double-quotes would involve the need to escape embedded
4747  // characters, then it may be more pleasing to escape the
4748  // shell meta-characters, and avoid the need for single-quoting
4749  // in the presence of literal single-quotes.
4750  // iii) If there are no literal double-quotes, then all else being equal,
4751  // avoid double-quotes and prefer escaping. Double-quotes are
4752  // more commonly used by enclosing formats such as ASN.1 Text
4753  // and CSV, and would thus need to be escaped. If there are
4754  // literal double-quotes, then having them in the output is
4755  // unavoidable, and this aesthetics rule becomes secondary to
4756  // the preference for avoiding escape characters. If there are
4757  // literal escape characters, then having them is unavoidable
4758  // and avoidance of double-quotes is once again recommended.
4759
4760  // TODO: Should simplify runs of multiple quotes, for example:
4761  // '\'''\'''\'' -> '"'''"'
4762
// Embedded single quotes become '\'' when double quotes are to be avoided,
// and '"'"' otherwise.
4763  bool avoid_double_quotes = (str.find('"') == NPOS ||
4764  str.find('\\') != NPOS);
4765  string s = "'" + NStr::Replace(str, "'",
4766  avoid_double_quotes ? "'\\''" : "'\"'\"'") + "'";
4767
4768  // Aesthetic improvement: Remove paired single-quotes ('')
4769  // that aren't escaped, as these evaluate to an empty string.
4770  // Don't apply this simplification for the degenerate case when
4771  // the string is the empty string ''. (Non degenerate strings
4772  // must be length greater than 2). Implement the equivalent
4773  // of the Perl regexp:
4774  //
4775  // s/(?<!\\)''//g
4776  //
4777  if (s.size() > 2) {
4778  size_t pos = 0;
4779  while ( true ) {
4780  pos = s.find("''", pos);
4781  if (pos == NPOS) break;
4782  if (pos == 0 || s[pos-1] != '\\') {
// After erasing, search again from the same position.
4783  s.erase(pos, 2);
4784  } else {
// Escaped quote followed by a quote: step over the escaped one.
4785  ++pos;
4786  }
4787  }
4788  }
4790  return s;
4791 }
4792 
4793 
4794 string NStr::ParseEscapes(const CTempString str, EEscSeqRange mode, char user_char)
4795 {
4796  string out;
4797  out.reserve(str.size()); // result string can only be smaller
4798  SIZE_TYPE pos = 0;
4799  bool is_error = false;
4800 
4801  while (pos < str.size() || !is_error) {
4802  SIZE_TYPE pos2 = str.find('\\', pos);
4803  if (pos2 == NPOS) {
4804  //~ out += str.substr(pos);
4805  CTempString sub(str, pos);
4806  out += sub;
4807  break;
4808  }
4809  //~ out += str.substr(pos, pos2 - pos);
4810  CTempString sub(str, pos, pos2-pos);
4811  out += sub;
4812  if (++pos2 == str.size()) {
4813  NCBI_THROW2(CStringException, eFormat,
4814  "Unterminated escape sequence", pos2);
4815  }
4816  switch (str[pos2]) {
4817  case 'a': out += '\a'; break;
4818  case 'b': out += '\b'; break;
4819  case 'f': out += '\f'; break;
4820  case 'n': out += '\n'; break;
4821  case 'r': out += '\r'; break;
4822  case 't': out += '\t'; break;
4823  case 'v': out += '\v'; break;
4824  case 'x':
4825  {{
4826  pos = ++pos2;
4827  while (pos < str.size()
4828  && isxdigit((unsigned char) str[pos])) {
4829  pos++;
4830  }
4831  if (pos > pos2) {
4832  SIZE_TYPE len = pos-pos2;
4833  if ((mode == eEscSeqRange_FirstByte) && (len > 2)) {
4834  // Take only 2 first hex-digits
4835  len = 2;
4836  pos = pos2 + 2;
4837  }
4838  unsigned int value =
4839  StringToUInt(CTempString(str, pos2, len), 0, 16);
4840  if ((mode != eEscSeqRange_Standard) && (value > 255)) {
4841  // eEscSeqRange_Standard -- by default
4842  switch (mode) {
4844  // Already have right value
4845  break;
4846  case eEscSeqRange_Throw:
4847  NCBI_THROW2(CStringException, eFormat,
4848  "Escape sequence '" + NStr::PrintableString(CTempString(str, pos2, len)) +
4849  "' is out of range [0-255]", pos2);
4850  break;
4851  case eEscSeqRange_Errno:
4852  CNcbiError::SetErrno(errno = ERANGE, str);
4853  is_error = true;
4854  continue;
4855  case eEscSeqRange_User:
4856  value = (unsigned)user_char;
4857  break;
4858  default:
4859  NCBI_THROW2(CStringException, eFormat, "Wrong set of flags", pos2);
4860  }
4861  }
4862  out += static_cast<char>(value);
4863  } else {
4864  NCBI_THROW2(CStringException, eFormat,
4865  "\\x followed by no hexadecimal digits", pos);
4866  }
4867  }}
4868  continue;
4869  case '0': case '1': case '2': case '3':
4870  case '4': case '5': case '6': case '7':
4871  {{
4872  pos = pos2;
4873  unsigned char c = (unsigned char)(str[pos++] - '0');
4874  while (pos < pos2 + 3 && pos < str.size()
4875  && str[pos] >= '0' && str[pos] <= '7') {
4876  c = (unsigned char)((c << 3) | (str[pos++] - '0'));
4877  }
4878  out += c;
4879  }}
4880  continue;
4881  case '\n':
4882  // quoted EOL means no EOL
4883  break;
4884  default:
4885  out += str[pos2];
4886  break;
4887  }
4888  pos = pos2 + 1;
4889  }
4890  if (mode == eEscSeqRange_Errno) {
4891  if (is_error) {
4892  return kEmptyStr;
4893  }
4894  errno = 0;
4895  }
4896  return out;
4897 }
4898 
4899 
4900 CTempString s_Unquote(const CTempString str, size_t* n_read)
4901 {
4902  const char* str_pos = str.data();
4903  char quote_char;
4904 
4905  if (str.empty() || ((quote_char = *str_pos) != '"' && quote_char != '\'')) {
4906  NCBI_THROW2(CStringException, eFormat,
4907  "The source string must start with a quote", 0);
4908  }
4909 
4910  const char* str_end = str_pos + str.length();
4911  bool escaped = false;
4912 
4913  while (++str_pos < str_end) {
4914  if (*str_pos == quote_char && !escaped) {
4915  size_t pos = str_pos - str.data();
4916  if (n_read != NULL)
4917  *n_read = pos + 1;
4918  return CTempString(str.data() + 1, pos - 1);
4919  } else {
4920  escaped = *str_pos == '\\' ? !escaped : false;
4921  }
4922  }
4924  "Unterminated quoted string", str.length());
4925 }
4926 
4927 
4928 string NStr::ParseQuoted(const CTempString str, size_t* n_read /*= NULL*/)
4929 {
4930  return ParseEscapes(s_Unquote(std::move(str), n_read));
4931 }
4932 
4933 
4934 // An adjusted copy-paste of NStr::ParseEscapes
4936 {
4937  string out;
4938  out.reserve(str.size()); // result string can only be smaller
4939  SIZE_TYPE pos = 0;
4940 
4941  while (pos < str.size()) {
4942  SIZE_TYPE pos2 = str.find('\\', pos);
4943  if (pos2 == NPOS) {
4944  //~ out += str.substr(pos);
4945  CTempString sub(str, pos);
4946  out += sub;
4947  break;
4948  }
4949  //~ out += str.substr(pos, pos2 - pos);
4950  CTempString sub(str, pos, pos2-pos);
4951  out += sub;
4952  if (++pos2 == str.size()) {
4953  NCBI_THROW2(CStringException, eFormat,
4954  "Unterminated escape sequence", pos2);
4955  }
4956  switch (str[pos2]) {
4957  case '"':
4958  case '\\':
4959  case '/': out += str[pos2]; break;
4960  case 'b': out += '\b'; break;
4961  case 'f': out += '\f'; break;
4962  case 'n': out += '\n'; break;
4963  case 'r': out += '\r'; break;
4964  case 't': out += '\t'; break;
4965  case 'u':
4966  pos = ++pos2;
4967  while (pos < str.size() && isxdigit((unsigned char) str[pos])) {
4968  pos++;
4969  }
4970  if (auto len = pos - pos2) {
4971  if (len < 4) {
4972  NCBI_THROW2(CStringException, eFormat, "Invalid JSON escape sequence", pos2);
4973  } else if (len > 4) {
4974  len = 4;
4975  pos = pos2 + 4;
4976  }
4977  unsigned int value = NStr::StringToUInt(CTempString(str, pos2, len), 0, 16);
4978  if (value > 0xff) {
4979  NCBI_THROW2(CStringException, eConvert,
4980  "Escaped UTF-8 characters after '\\u00ff' are not supported", pos2);
4981  }
4982  out += static_cast<char>(value);
4983  continue;
4984  } else {
4985  NCBI_THROW2(CStringException, eFormat, "\\u followed by no hexadecimal digits", pos);
4986  }
4987  default:
4988  NCBI_THROW2(CStringException, eFormat, "Invalid JSON escape sequence", pos2);
4989  }
4990  pos = pos2 + 1;
4991  }
4992  return out;
4993 }
4994 
4995 
4996 string NStr::JsonDecode(const CTempString str, size_t* n_read /*= NULL*/)
4997 {
4998  return s_ParseJsonEncodeEscapes(s_Unquote(std::move(str), n_read));
5000 
5001 
5002 // Determines the end of an HTML <...> tag, accounting for attributes
5003 // and comments (the latter allowed only within <!...>).
5004 static SIZE_TYPE s_EndOfTag(const string& str, SIZE_TYPE start)
5005 {
5006  _ASSERT(start < str.size() && str[start] == '<');
5007  bool comments_ok = (start + 1 < str.size() && str[start + 1] == '!');
5008  for (SIZE_TYPE pos = start + 1; pos < str.size(); ++pos) {
5009  switch (str[pos]) {
5010  case '>': // found the end
5011  return pos;
5012 
5013  case '\"': // start of "string"; advance to end
5014  pos = str.find('\"', pos + 1);
5015  if (pos == NPOS) {
5016  NCBI_THROW2(CStringException, eFormat,
5017  "Unclosed string in HTML tag", start);
5018  // return pos;
5019  }
5020  break;
5021 
5022  case '-': // possible start of -- comment --; advance to end
5023  if (comments_ok && pos + 1 < str.size()
5024  && str[pos + 1] == '-') {
5025  pos = str.find("--", pos + 2);
5026  if (pos == NPOS) {
5027  NCBI_THROW2(CStringException, eFormat,
5028  "Unclosed comment in HTML tag", start);
5029  // return pos;
5030  } else {
5031  ++pos;
5032  }
5033  }
5034  }
5035  }
5036  NCBI_THROW2(CStringException, eFormat, "Unclosed HTML tag", start);
5037  // return NPOS;
5039 
5040 
5041 // Determines the end of an HTML &foo; character/entity reference
5042 // (which might not actually end with a semicolon :-/ , but we ignore that case)
5043 static SIZE_TYPE s_EndOfReference(const string& str, SIZE_TYPE start)
5044 {
5045  _ASSERT(start < str.size() && str[start] == '&');
5046 
5047  SIZE_TYPE pos = str.find_first_not_of
5048  ("#0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
5049  start + 1);
5050  if (pos != NPOS && str[pos] == ';') {
5051  // found terminating semicolon, so it's valid, and we return that
5052  return pos;
5053  } else {
5054  // We consider it just a '&' by itself since it's invalid
5055  return start;
5056  }
5057 }
5058 
5059 
5060 static SIZE_TYPE s_VisibleHtmlWidth(const string& str)
5061 {
5062  SIZE_TYPE width = 0, pos = 0;
5063  for (;;) {
5064  SIZE_TYPE pos2 = str.find_first_of("<&", pos);
5065  if (pos2 == NPOS) {
5066  width += str.size() - pos;
5067  break;
5068  } else {
5069  width += pos2 - pos;
5070  if (str[pos2] == '&') {
5071  ++width;
5072  pos = s_EndOfReference(str, pos);
5073  } else {
5074  pos = s_EndOfTag(str, pos);
5075  }
5076  if (pos == NPOS) {
5077  break;
5078  } else {
5079  ++pos;
5080  }
5081  }
5082  }
5083  return width;
5084 }
5085 
// Whitespace test on a raw character code: true for 0x09..0x0D
// (TAB, LF, VT, FF, CR) and for 0x20 (space), false for anything else.
static
inline bool _isspace(unsigned char c)
{
    return ((c>=0x09 && c<=0x0D) || (c==0x20));
}
5091 
5092 template<typename _D>
5093 void NStr::WrapIt(const string& str, SIZE_TYPE width,
5094  _D& dest, TWrapFlags flags,
5095  const string* prefix,
5096  const string* prefix1)
5097 {
5098  if (prefix == 0) {
5099  prefix = &kEmptyStr;
5100  }
5101 
5102  if (prefix1 == 0)
5103  prefix1 = prefix;
5104 
5105  SIZE_TYPE pos = 0, len = str.size(), nl_pos = 0;
5106 
5107  const bool is_html = flags & fWrap_HTMLPre ? true : false;
5108  const bool do_flat = (flags & fWrap_FlatFile) != 0;
5109  string temp_back; temp_back.reserve(width);
5110 
5111  enum EScore { // worst to best
5112  eForced,
5113  ePunct,
5114  eComma,
5115  eSpace,
5116  eNewline
5117  };
5118 
5119  // To avoid copying parts of str when we need to store a
5120  // substr of str, we store the substr as a pair
5121  // representing start (inclusive) and end (exclusive).
5122  typedef pair<SIZE_TYPE, SIZE_TYPE> TWrapSubstr;
5123 
5124  // This variable is used for HTML links that cross line boundaries.
5125  // Since it's aesthetically displeasing for a link to cross a boundary, we
5126  // close it at the end of each line and re-open it after the next line's
5127  // prefix
5128  // (This is needed in, e.g. AE017351)
5129  TWrapSubstr best_link(0, 0); // last link found before current best_pos
5130  TWrapSubstr latest_link(0, 0); // last link found at all
5131 
5132  while (pos < len) {
5133  bool hyphen = false; // "-" or empty
5134  SIZE_TYPE column = is_html ? s_VisibleHtmlWidth(*prefix1) : prefix1->size();
5135  SIZE_TYPE column0 = column;
5136  // the next line will start at best_pos
5137  SIZE_TYPE best_pos = NPOS;
5138  EScore best_score = eForced;
5139 
5140  // certain logic can be skipped if this part has no backspace,
5141  // which is, by far, the most common case
5142  bool thisPartHasBackspace = false;
5143 
5144  temp_back = *prefix1;
5145 
5146  // append any still-open links from previous lines
5147  if (is_html && best_link.second != 0) {
5148  temp_back.append(
5149  str.begin() + best_link.first,
5150  str.begin() + best_link.second);
5151  }
5152 
5153  SIZE_TYPE pos0 = pos;
5154 
5155  // we can't do this in HTML mode because we might have to deal with
5156  // link tags that go across lines.
5157  if (!is_html) {
5158  if (nl_pos <= pos) {
5159  nl_pos = str.find('\n', pos);
5160  if (nl_pos == NPOS) {
5161  nl_pos = len;
5162  }
5163  }
5164  if (column + (nl_pos - pos) <= width) {
5165  pos0 = nl_pos;
5166  }
5167  }
5168 
5169  for (SIZE_TYPE pos2 = pos0; pos2 < len && column <= width;
5170  ++pos2, ++column) {
5171  EScore score = eForced;
5172  SIZE_TYPE score_pos = pos2;
5173  const char c = str[pos2];
5174 
5175  if (c == '\n') {
5176  best_pos = pos2;
5177  best_score = eNewline;
5178  best_link = latest_link;
5179  break;
5180  }
5181  else if (_isspace((unsigned char)c)) {
5182  if (!do_flat && pos2 > 0 &&
5183  _isspace((unsigned char)str[pos2 - 1])) {
5184  if (pos2 < len - 1 && str[pos2 + 1] == '\b') {
5185  thisPartHasBackspace = true;
5186  }
5187  continue; // take the first space of a group
5188  }
5189  score = eSpace;
5190  }
5191  else if (is_html && c == '<') {
5192  // treat tags as zero-width...
5193  SIZE_TYPE start_of_tag = pos2;
5194  pos2 = s_EndOfTag(str, pos2);
5195  --column;
5196  if (pos2 == NPOS) {
5197  break;
5198  }
5199 
5200  if ((pos2 - start_of_tag) >= 6 &&
5201  str[start_of_tag + 1] == 'a' &&
5202  str[start_of_tag + 2] == ' ' &&
5203  str[start_of_tag + 3] == 'h' &&
5204  str[start_of_tag + 4] == 'r' &&
5205  str[start_of_tag + 5] == 'e' &&
5206  str[start_of_tag + 6] == 'f')
5207  {
5208  // remember current link in case of line wrap
5209  latest_link.first = start_of_tag;
5210  latest_link.second = pos2 + 1;
5211  }
5212  if ((pos2 - start_of_tag) >= 3 &&
5213  str[start_of_tag + 1] == '/' &&
5214  str[start_of_tag + 2] == 'a' &&
5215  str[start_of_tag + 3] == '>')
5216  {
5217  // link is closed
5218  latest_link.first = 0;
5219  latest_link.second = 0;
5220  }
5221  }
5222  else if (is_html && c == '&') {
5223  // ...and references as single characters
5224  pos2 = s_EndOfReference(str, pos2);
5225  if (pos2 == NPOS) {
5226  break;
5227  }
5228  }
5229  else if (c == ',' && column < width && score_pos < len - 1) {
5230  score = eComma;
5231  ++score_pos;
5232  }
5233  else if (do_flat ? c == '-' : ispunct((unsigned char)c)) {
5234  // For flat files, only whitespace, hyphens and commas
5235  // are special.
5236  switch (c) {
5237  case '(': case '[': case '{': case '<': case '`':
5238  score = ePunct;
5239  break;
5240  default:
5241  if (score_pos < len - 1 && column < width) {
5242  score = ePunct;
5243  ++score_pos;
5244  }
5245  break;
5246  }
5247  }
5248 
5249  if (score >= best_score && score_pos > pos0) {
5250  best_pos = score_pos;
5251  best_score = score;
5252  best_link = latest_link;
5253  }
5254 
5255  while (pos2 < len - 1 && str[pos2 + 1] == '\b') {
5256  // Account for backspaces
5257  ++pos2;
5258  if (column > column0) {
5259  --column;
5260  }
5261  thisPartHasBackspace = true;
5262  }
5263  }
5264 
5265  if (best_score != eNewline && column <= width) {
5266  if (best_pos != len) {
5267  // If the whole remaining text can fit, don't split it...
5268  best_pos = len;
5269  best_link = latest_link;
5270  // Force backspace checking, to play it safe
5271  thisPartHasBackspace = true;
5272  }
5273  }
5274  else if (best_score == eForced && (flags & fWrap_Hyphenate)) {
5275  hyphen = true;
5276  --best_pos;
5277  }
5278 
5279  {{
5280  string::const_iterator begin = str.begin() + pos;
5281  string::const_iterator end = str.begin() + best_pos;
5282  if (thisPartHasBackspace) {
5283  // eat backspaces and the characters (if any) that precede them
5284 
5285  string::const_iterator bs; // position of next backspace
5286  while ((bs = find(begin, end, '\b')) != end) {
5287  if (bs != begin) {
5288  // add all except the last one
5289  temp_back.append(begin, bs - 1);
5290  }
5291  else {
5292  // The backspace is at the beginning of next substring,
5293  // so we should remove previously added symbol if any.
5294  SIZE_TYPE size = temp_back.size();
5295  if (size > prefix1->size()) { /