NCBI C++ ToolKit
ncbistr.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: ncbistr.cpp 102274 2024-04-15 14:13:11Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko, Denis Vakatov
27  *
28  * File Description:
29  * Some helper functions
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/tempstr.hpp>
37 #include <corelib/ncbistr_util.hpp>
38 #include <corelib/error_codes.hpp>
39 #include <corelib/ncbierror.hpp>
40 #include <corelib/ncbifloat.h>
41 #include <corelib/ncbi_base64.h>
42 #include <memory>
43 #include <functional>
44 #include <algorithm>
45 #include <iterator>
46 #include <stdio.h>
47 #include <locale.h>
48 #include <math.h>
49 
50 
51 #define NCBI_USE_ERRCODE_X Corelib_Util
52 
53 
55 
56 
57 // Digits (up to base 36)
// Digit alphabets: the character at index i is the symbol for digit value i.
// Each table holds 36 symbols ('0'-'9' followed by the 26 letters), one in
// upper case and one in lower case.
58 static const char kDigitUpper[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
59 static const char kDigitLower[] = "0123456789abcdefghijklmnopqrstuvwxyz";
60 
61 
62 static inline
63 SIZE_TYPE s_DiffPtr(const char* end, const char* start)
64 {
65  return end ? (SIZE_TYPE)(end - start) : (SIZE_TYPE) 0;
66 }
67 
68 const char *const kEmptyCStr = "";
69 
70 #if defined(HAVE_WSTRING)
71 const wchar_t *const kEmptyWCStr = L"";
72 #endif
73 
74 
75 extern const char* const kNcbiDevelopmentVersionString;
77  = "NCBI_DEVELOPMENT_VER_" NCBI_AS_STRING(NCBI_DEVELOPMENT_VER);
78 
79 #ifdef NCBI_PRODUCTION_VER
80 extern const char* const kNcbiProductionVersionString;
81 const char* const kNcbiProductionVersionString
82  = "NCBI_PRODUCTION_VER_" NCBI_AS_STRING(NCBI_PRODUCTION_VER);
83 #endif
84 
85 
86 #if !defined(NCBI_OS_MSWIN) && \
87  !(defined(NCBI_OS_LINUX) && \
88  (defined(NCBI_COMPILER_GCC) || defined(NCBI_COMPILER_ANY_CLANG)))
89 const string* CNcbiEmptyString::m_Str = 0;
90 const string& CNcbiEmptyString::FirstGet(void) {
91  static const string s_Str = "";
92  m_Str = &s_Str;
93  return s_Str;
94 }
95 # ifdef HAVE_WSTRING
96 const wstring* CNcbiEmptyWString::m_Str = 0;
97 const wstring& CNcbiEmptyWString::FirstGet(void) {
98  static const wstring s_Str = L"";
99  m_Str = &s_Str;
100  return s_Str;
101 }
102 # endif
103 #endif
104 
105 
107 {
108  SIZE_TYPE len = str.length();
109  for (SIZE_TYPE idx = pos; idx < len; ++idx) {
110  if (!isspace((unsigned char) str[idx])) {
111  return false;
112  }
113  }
114  return true;
115 }
116 
117 
119 {
120  SIZE_TYPE n1 = s1.length();
121  SIZE_TYPE n2 = s2.length();
122  if ( !n1 ) {
123  return n2 ? -1 : 0;
124  }
125  if ( !n2 ) {
126  return 1;
127  }
128  if (int res = memcmp(s1.data(), s2.data(), min(n1, n2))) {
129  return res;
130  }
131  return (n1 == n2) ? 0 : (n1 > n2 ? 1 : -1);
132 }
133 
134 
136  const char* s2)
137 {
138  if (pos == NPOS || !n || s1.length() <= pos) {
139  return *s2 ? -1 : 0;
140  }
141  if ( !*s2 ) {
142  return 1;
143  }
144  if (n == NPOS || n > s1.length() - pos) {
145  n = s1.length() - pos;
146  }
147  const char* s = s1.data() + pos;
148  while (n && *s2 && *s == *s2) {
149  s++; s2++; n--;
150  }
151  if (n == 0) {
152  return *s2 ? -1 : 0;
153  }
154  return *s - *s2;
155 }
156 
157 
159  const CTempString s2)
160 {
161  if (pos == NPOS || !n || s1.length() <= pos) {
162  return s2.empty() ? 0 : -1;
163  }
164  if (s2.empty()) {
165  return 1;
166  }
167  if (n == NPOS || n > s1.length() - pos) {
168  n = s1.length() - pos;
169  }
170  SIZE_TYPE n_cmp = n;
171  if (n_cmp > s2.length()) {
172  n_cmp = s2.length();
173  }
174  const char* s = s1.data() + pos;
175  const char* p = s2.data();
176  while (n_cmp && *s == *p) {
177  s++; p++; n_cmp--;
178  }
179 
180  if (n_cmp == 0) {
181  if (n == s2.length())
182  return 0;
183  return n > s2.length() ? 1 : -1;
184  }
185 
186  return *s - *p;
187 }
188 
189 
191 {
192  SIZE_TYPE n1 = s1.length();
193  SIZE_TYPE n2 = s2.length();
194 
195  if ( !n1 ) {
196  return n2 ? -1 : 0;
197  }
198  if ( !n2 ) {
199  return 1;
200  }
201  SIZE_TYPE n = min(n1, n2);
202  const char* p1 = s1.data();
203  const char* p2 = s2.data();
204 
205  while (n && (*p1 == *p2 ||
206  tolower((unsigned char)(*p1)) == tolower((unsigned char)(*p2))) ) {
207  p1++; p2++; n--;
208  }
209  if ( !n ) {
210  return (n1 == n2) ? 0 : (n1 > n2 ? 1 : -1);
211  }
212  if (*p1 == *p2) {
213  return 0;
214  }
215  return tolower((unsigned char)(*p1)) - tolower((unsigned char)(*p2));
216 }
217 
218 
220  const char* s2)
221 {
222  if (pos == NPOS || !n || s1.length() <= pos) {
223  return *s2 ? -1 : 0;
224  }
225  if ( !*s2 ) {
226  return 1;
227  }
228 
229  if (n == NPOS || n > s1.length() - pos) {
230  n = s1.length() - pos;
231  }
232 
233  const char* s = s1.data() + pos;
234  while (n && *s2 && (*s == *s2 ||
235  tolower((unsigned char)(*s)) == tolower((unsigned char)(*s2))) ) {
236  s++; s2++; n--;
237  }
238  if (n == 0) {
239  return *s2 ? -1 : 0;
240  }
241  if (*s == *s2) {
242  return 0;
243  }
244  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*s2));
245 }
246 
247 
249  const CTempString s2)
250 {
251  if (pos == NPOS || !n || s1.length() <= pos) {
252  return s2.empty() ? 0 : -1;
253  }
254  if (s2.empty()) {
255  return 1;
256  }
257  if (n == NPOS || n > s1.length() - pos) {
258  n = s1.length() - pos;
259  }
260 
261  SIZE_TYPE n_cmp = n;
262  if (n_cmp > s2.length()) {
263  n_cmp = s2.length();
264  }
265  const char* s = s1.data() + pos;
266  const char* p = s2.data();
267  while (n_cmp && (*s == *p ||
268  tolower((unsigned char)(*s)) == tolower((unsigned char)(*p))) ) {
269  s++; p++; n_cmp--;
270  }
271  if (n_cmp == 0) {
272  return (n == s2.length()) ? 0 : (n > s2.length() ? 1 : -1);
273  }
274  if (*s == *p) {
275  return 0;
276  }
277  return tolower((unsigned char)(*s)) - tolower((unsigned char)(*p));
278 }
279 
280 
281 // MatchesMask() tri-state result
283  eMatch = 1, // match
284  eNoMatch = 0, // no match
285  eMismatch = -1 // mismatch, stop search
286 };
287 
288 // Implements the same logic as UTIL_MatchesMask() from 'include/connect/ncbi_util.h',
289 // but for CTempString instead of char*.
290 
292 {
293  char s, m;
294  size_t str_pos = 0, mask_pos = 0;
295 
296  for ( ; (m = mask[mask_pos]); ++str_pos, ++mask_pos) {
297 
298  s = str[str_pos];
299 
300  if (!s && m != '*') {
301  return eMismatch;
302  }
303  // Analyze mask symbol
304  switch ( m ) {
305  case '?':
306  _ASSERT(s);
307  continue;
308  case '*':
309  // Collapse multiple stars
310  while ( (m = mask[mask_pos]) == '*' ) mask_pos++;
311  if ( !m ) {
312  // only stars left in the mask
313  return eMatch;
314  }
315  // General case, use recursion
316  while ( s ) {
317  EMatchesMaskResult res = s_MatchesMask(str.substr(str_pos), mask.substr(mask_pos), ignore_case);
318  if ( res != eNoMatch ) {
319  // match or mismatch
320  return res;
321  }
322  // continue search
323  s = str[str_pos++];
324  }
325  return eMismatch;
326 
327  case '[':
328  if (!(m = mask[++mask_pos]))
329  return eMismatch; // mismatch, pattern error
330  if (m == '!') {
331  m = 1 /*complement*/;
332  ++mask_pos;
333  } else
334  m = 0;
335  if (ignore_case)
336  s = (char) tolower((unsigned char) s);
337  _ASSERT(s);
338  char a, b; // range for [a-b]
339  do {
340  if (!(a = mask[mask_pos++]))
341  return eMismatch; // mismatch, pattern error
342  if (mask[mask_pos] == '-' && mask[mask_pos+1] != ']') {
343  ++mask_pos;
344  if (!(b = mask[mask_pos++]))
345  return eMismatch; // mismatch, pattern error
346  } else
347  b = a;
348  if (s) {
349  if (ignore_case) {
350  a = (char) tolower((unsigned char) a);
351  b = (char) tolower((unsigned char) b);
352  }
353  if (a <= s && s <= b)
354  s = 0 /*mark as found*/;
355  }
356  } while (mask[mask_pos] != ']');
357  if (m == !s)
358  return eNoMatch; // mismatch
359  continue;
360 
361  case '\\':
362  if (!(m = mask[++mask_pos]))
363  return eMismatch; // mismatch, pattern error
364  /*FALLTHRU*/
365 
366  default:
367  // Compare non pattern character in mask and name
368  _ASSERT(s && m);
369  if (ignore_case) {
370  if (s != m && tolower((unsigned char)s) != tolower((unsigned char)m))
371  return eNoMatch;
372  } else {
373  if (s != m)
374  return eNoMatch;
375  }
376  continue;
377  }
378  }
379  // Matches if we reach the end of the string and mask at the same time only
380  if ( str[str_pos] ) {
381  return eNoMatch;
382  }
383  return eMatch;
384 }
385 
386 
387 // NOTE: This code is also used in CDirEntry::MatchesMask().
388 //
390 {
391  return s_MatchesMask(str, mask, use_case == NStr::eNocase) == eMatch;
392 }
393 
394 
395 char* NStr::ToLower(char* str)
396 {
397  char* s;
398  for (s = str; *str; str++) {
399  *str = (char)tolower((unsigned char)(*str));
400  }
401  return s;
402 }
403 
404 
405 string& NStr::ToLower(string& str)
406 {
407  NON_CONST_ITERATE (string, it, str) {
408  *it = (char)tolower((unsigned char)(*it));
409  }
410  return str;
411 }
412 
413 
414 char* NStr::ToUpper(char* str)
415 {
416  char* s;
417  for (s = str; *str; str++) {
418  *str = (char)toupper((unsigned char)(*str));
419  }
420  return s;
421 }
422 
423 
424 string& NStr::ToUpper(string& str)
425 {
426  NON_CONST_ITERATE (string, it, str) {
427  *it = (char)toupper((unsigned char)(*it));
428  }
429  return str;
430 }
431 
432 
434 {
435  SIZE_TYPE len = str.length();
436  for (SIZE_TYPE i = 0; i < len; ++i) {
437  if (isalpha((unsigned char)str[i]) && !islower((unsigned char)str[i])) {
438  return false;
439  }
440  }
441  return true;
442 }
443 
444 
446 {
447  SIZE_TYPE len = str.length();
448  for (SIZE_TYPE i = 0; i < len; ++i) {
449  if (isalpha((unsigned char)str[i]) && !isupper((unsigned char)str[i])) {
450  return false;
451  }
452  }
453  return true;
454 }
455 
456 
458 {
459  int error = 0, ret = -1;
460  size_t len = str.size();
461 
462  if (!len) {
463  error = EINVAL;
464  } else {
465  size_t i = 0;
466  // skip leading '+' if any
467  if (str.data()[0] == '+' && len > 1) {
468  ++i;
469  }
470  unsigned v = 0;
471  for (; i < len; ++i) {
472  unsigned d = str.data()[i] - '0';
473  if (d > 9) {
474  error = EINVAL;
475  break;
476  }
477  unsigned nv = v * 10 + d;
478  const unsigned kOverflowLimit = (INT_MAX - 9) / 10 + 1;
479  if (v >= kOverflowLimit) {
480  // possible overflow
481  if (v > kOverflowLimit || nv > INT_MAX) {
482  error = ERANGE;
483  break;
484  }
485  }
486  v = nv;
487  }
488  if (!error) {
489  ret = static_cast<int>(v);
490  }
491  }
492 /*
493  if (flags & fConvErr_NoErrno) {
494  return ret;
495  }
496 */
497  errno = error;
498  if (error) {
501  } else {
503  }
504  }
505  return ret;
506 }
507 
508 
509 /// @internal
510 // Access to errno is slow on some platforms, because it uses TLS to store a value
511 // for each thread. This guard class lets the string-to-numeric conversion functions
512 // set errno only once, right before exit, and only when necessary.
514 {
515 public:
516  CS2N_Guard(NStr::TStringToNumFlags, bool skip_if_zero) :
517  m_NoErrno(false), // m_NoErrno((flags & NStr::fConvErr_NoErrno) > 0),
518  m_SkipIfZero(skip_if_zero),
519  m_Errno(0)
520  { }
521  ~CS2N_Guard(void) {
522  if (!m_NoErrno) {
523  // Is the guard used against the code that already set an errno?
524  // If the error code is not defined here, do not even try to check/set it.
525  if (!m_SkipIfZero || m_Errno) {
526  errno = m_Errno;
527  }
528  }
529  }
530  void Set(int errcode) { m_Errno = errcode; }
531  int Errno(void) const { return m_Errno; }
532  // Says that we want to throw an exception, do not set errno in this case
533  void Throw(void) { m_SkipIfZero = true; m_Errno = 0; }
534  // Auxiliary function to create a message about conversion error
535  // to specified type. It doesn't have any relation to the guard itself,
536  // but can help to save on the amount of code in calling macro.
537  string Message(const CTempString str, const char* to_type, const CTempString msg);
538 
539 private:
540  bool m_NoErrno; // do not set errno at all
541  bool m_SkipIfZero; // do not set errno if TRUE and m_Errno == 0
542  int m_Errno; // errno value to set
543 };
544 
// Build the text of a conversion-error message.
//
// The visible statements produce: "Cannot convert string '" + "' to " + to_type,
// followed by ", " + msg when 'msg' is not empty.
// NOTE(review): listing line 550 is absent from this view; presumably it
// appends 'str' between the two quotes -- confirm against the repository.
//
// str      source string that failed to convert (used here for sizing)
// to_type  name of the target type, as a C string
// msg      optional extra detail; skipped when empty
// Returns the assembled message by value.
545 string CS2N_Guard::Message(const CTempString str, const char* to_type, const CTempString msg)
546 {
547  string s;
    // Reserve room for both variable parts plus 50 characters of fixed text
    // so the appends below do not have to grow the buffer repeatedly.
548  s.reserve(str.length() + msg.length() + 50);
549  s += "Cannot convert string '";
551  s += "' to ";
552  s += to_type;
553  if ( !msg.empty() ) {
554  s += ", ";
555  s += msg;
556  }
557  return s;
558 }
559 
// Both macros declare a local CS2N_Guard named 'err_guard' in the enclosing
// scope; they differ only in the second constructor argument (skip_if_zero).
560 /// Regular guard
561 #define S2N_CONVERT_GUARD(flags) \
562  CS2N_Guard err_guard(flags, false)
563 
564 // This guard can be used against the code that already set an errno.
565 // If the error code is not defined, the guard does not even try to check/set it (even to zero).
566 #define S2N_CONVERT_GUARD_EX(flags) \
567  CS2N_Guard err_guard(flags, true)
568 
// Report a string-to-number conversion failure from inside a conversion function.
//
// Names the expansion takes from the enclosing scope: 'flags', 'str' and
// 'err_guard' (a CS2N_Guard).
//
//   to_type  target type; stringized with # for the error message
//   msg      extra message text passed to err_guard.Message()
//   errcode  errno-style code stored in the guard
//   pos      position passed to NCBI_THROW2 with the exception
//
// Behavior:
//   - fConvErr_NoThrow not set: calls err_guard.Throw() and throws
//     CStringException (eConvert).
//   - fConvErr_NoThrow set: records the error with CNcbiError::SetErrno(),
//     without the message text when fConvErr_NoErrMessage is set, and then
//     executes 'return 0;' in the ENCLOSING function.
// The block between /* and */ in the body is disabled code.
569 #define S2N_CONVERT_ERROR(to_type, msg, errcode, pos) \
570  do { \
571  err_guard.Set(errcode); \
572  if ( !(flags & NStr::fConvErr_NoThrow) ) { \
573  err_guard.Throw(); \
574  NCBI_THROW2(CStringException, eConvert, \
575  err_guard.Message(str, #to_type, msg), pos); \
576  } else { \
577 /* \
578  if (flags & NStr::fConvErr_NoErrno) { \
579  / Error, but forced to return 0 / \
580  return 0; \
581  } \
582 */ \
583  if (flags & NStr::fConvErr_NoErrMessage) { \
584  CNcbiError::SetErrno(err_guard.Errno()); \
585  } else { \
586  CNcbiError::SetErrno(err_guard.Errno(), \
587  err_guard.Message(str, #to_type, msg)); \
588  } \
589  return 0; \
590  } \
591  } while (false)
592 
593 
// Shorthands for S2N_CONVERT_ERROR. All three pass the identifier 'pos', so a
// variable with that name must exist where they are expanded.

// Invalid input: EINVAL with no extra message.
594 #define S2N_CONVERT_ERROR_INVAL(to_type) \
595  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos)
596 
// Bad radix: EINVAL with a caller-supplied message.
597 #define S2N_CONVERT_ERROR_RADIX(to_type, msg) \
598  S2N_CONVERT_ERROR(to_type, msg, EINVAL, pos)
599 
// Value out of range: ERANGE with the fixed message "overflow".
600 #define S2N_CONVERT_ERROR_OVERFLOW(to_type) \
601  S2N_CONVERT_ERROR(to_type, "overflow", ERANGE, pos)
602 
// Trailing-garbage checks: raise an EINVAL conversion error at 'pos' when the
// whole input has not been consumed. Both expand S2N_CONVERT_ERROR and use
// 'pos' from the enclosing scope.

// Variant that tests the character str[pos] for being non-zero.
603 #define CHECK_ENDPTR(to_type) \
604  if ( str[pos] ) { \
605  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos); \
606  }
607 
// Variant that compares 'pos' with a 'size' variable from the enclosing scope.
608 #define CHECK_ENDPTR_SIZE(to_type) \
609  if ( pos < size ) { \
610  S2N_CONVERT_ERROR(to_type, kEmptyStr, EINVAL, pos); \
611  }
612 
// Handling of ',' group separators for the digit-parsing loops.
//
// Must be expanded inside a loop: the body uses 'break' and 'continue'.
// Names taken from the enclosing scope: flags, ch (current character),
// pos (current index), numpos (index where the digits start) and
// comma (symbols counted since the last comma; negative while no comma
// has been seen).
//
// Active only when fAllowCommas is set:
//   - on ',': leaves the loop if the comma is the first symbol of the number,
//     or if a previous comma was not followed by exactly 3 symbols; otherwise
//     resets the counter, steps over the comma and continues the loop;
//   - on any other character: increments the counter once a comma has been seen.
613 #define CHECK_COMMAS \
614  /* Check on possible commas */ \
615  if (flags & NStr::fAllowCommas) { \
616  if (ch == ',') { \
617  if ((numpos == pos) || \
618  ((comma >= 0) && (comma != 3)) ) { \
619  /* Not first comma, sitting on incorrect place */ \
620  break; \
621  } \
622  /* Skip it */ \
623  comma = 0; \
624  pos++; \
625  continue; \
626  } else { \
627  if (comma >= 0) { \
628  /* Count symbols between commas */ \
629  comma++; \
630  } \
631  } \
632  }
633 
634 
636 {
638  Int8 value = StringToInt8(str, flags, base);
639  if ( value < kMin_Int || value > kMax_Int ) {
640  S2N_CONVERT_ERROR(int, "overflow", ERANGE, 0);
641  }
642  return (int) value;
643 }
644 
645 
// Convert 'str' to unsigned int.
//
// Parsing is delegated to StringToUInt8(); the 64-bit result is then checked
// against kMax_UInt. A value above that limit is reported through
// S2N_CONVERT_ERROR as ERANGE/"overflow" at position 0, which throws or
// makes this function return 0 depending on 'flags'.
// NOTE(review): S2N_CONVERT_ERROR needs an 'err_guard' object; its
// declaration (listing line 649) is not visible in this view.
646 unsigned int
647 NStr::StringToUInt(const CTempString str, TStringToNumFlags flags, int base)
648 {
650  Uint8 value = StringToUInt8(str, flags, base);
651  if ( value > kMax_UInt ) {
652  S2N_CONVERT_ERROR(unsigned int, "overflow", ERANGE, 0);
653  }
654  return (unsigned int) value;
655 }
656 
657 
// Convert 'str' to long.
//
// Parsing is delegated to StringToInt8(); the 64-bit result is then checked
// against [kMin_Long, kMax_Long]. A value outside that range is reported
// through S2N_CONVERT_ERROR as ERANGE/"overflow" at position 0, which throws
// or makes this function return 0 depending on 'flags'.
// NOTE(review): S2N_CONVERT_ERROR needs an 'err_guard' object; its
// declaration (listing line 660) is not visible in this view.
658 long NStr::StringToLong(const CTempString str, TStringToNumFlags flags, int base)
659 {
661  Int8 value = StringToInt8(str, flags, base);
662  if ( value < kMin_Long || value > kMax_Long ) {
663  S2N_CONVERT_ERROR(long, "overflow", ERANGE, 0);
664  }
665  return (long) value;
666 }
667 
668 
// Convert 'str' to unsigned long.
//
// Parsing is delegated to StringToUInt8(); the 64-bit result is then checked
// against kMax_ULong. A value above that limit is reported through
// S2N_CONVERT_ERROR as ERANGE/"overflow" at position 0, which throws or
// makes this function return 0 depending on 'flags'.
// NOTE(review): S2N_CONVERT_ERROR needs an 'err_guard' object; its
// declaration (listing line 672) is not visible in this view.
669 unsigned long
670 NStr::StringToULong(const CTempString str, TStringToNumFlags flags, int base)
671 {
673  Uint8 value = StringToUInt8(str, flags, base);
674  if ( value > kMax_ULong ) {
675  S2N_CONVERT_ERROR(unsigned long, "overflow", ERANGE, 0);
676  }
677  return (unsigned long) value;
678 }
679 
680 
/// @internal
/// Check whether 'ch' is a valid digit for a number written in radix 'base'.
///
/// Classification uses explicit ASCII ranges instead of the locale-dependent
/// isalnum()/isdigit(): in a non-"C" 8-bit locale those can accept a
/// high-bit letter, which (with a signed 'char') produced a negative digit
/// value that still satisfied 'value < base'.
///
/// @param ch     Character to classify.
/// @param base   Radix of the number; presumably already validated by the
///               caller (2..36) -- this function does not check it.
/// @param value  Optional output for the numeric value of 'ch'.
///               For base <= 10 it is written only on success.
///               For base > 10 it is written for every ASCII letter or digit,
///               even when that value is too large for 'base'.
/// @return true if 'ch' is a digit of the given radix.
static inline
bool s_IsGoodCharForRadix(char ch, int base, int* value = 0)
{
    if ( base <= 10 ) {
        // Shortcut for the most frequent case. The unsigned comparison
        // rejects both characters below '0' and digits >= base.
        const int digit = ch - '0';
        if ( unsigned(digit) >= unsigned(base) ) {
            return false;
        }
        if ( value ) {
            *value = digit;
        }
        return true;
    }

    // Radix 11..36: decimal digits plus letters, case-insensitive.
    int digit;
    if (ch >= '0'  &&  ch <= '9') {
        digit = ch - '0';
    } else if (ch >= 'a'  &&  ch <= 'z') {
        digit = ch - 'a' + 10;
    } else if (ch >= 'A'  &&  ch <= 'Z') {
        digit = ch - 'A' + 10;
    } else {
        return false;
    }
    if ( value ) {
        *value = digit;
    }
    return digit < base;
}
713 
714 
715 // Skip all allowed chars (all except those used for digit composition).
716 // Update 'pos' to the current position in the string.
// Modes that select which characters may be skipped around a number.
enum ESkipMode {
    eSkipAll        = 0,  // skip every symbol
    eSkipAllAllowed = 1,  // skip every symbol except digit/+/-/.
    eSkipSpacesOnly = 2   // skip whitespace only
};
722 
723 static inline
724 bool s_IsDecimalPoint(unsigned char ch, NStr::TStringToNumFlags flags)
725 {
726  if ( ch != '.' && ch != ',') {
727  return false;
728  }
729  if (flags & NStr::fDecimalPosix) {
730  return ch == '.';
731  }
732  else if (flags & NStr::fDecimalPosixOrLocal) {
733  return ch == '.' || ch == ',';
734  }
735  struct lconv* conv = localeconv();
736  return ch == *(conv->decimal_point);
737 }
738 
739 static inline
741  SIZE_TYPE& pos,
742  ESkipMode skip_mode,
744 {
745  if (skip_mode == eSkipAll) {
746  pos = str.length();
747  return;
748  }
749 
750  for ( SIZE_TYPE len = str.length(); pos < len; ++pos ) {
751  unsigned char ch = str[pos];
752  if ( isdigit(ch) || ch == '+' || ch == '-' || s_IsDecimalPoint(ch,flags) ) {
753  break;
754  }
755  if ( (skip_mode == eSkipSpacesOnly) && !isspace(ch) ) {
756  break;
757  }
758  }
759 }
760 
761 
762 // Check radix base. If it is zero, determine base using first chars
763 // of the string. Update 'base' value.
764 // Update 'ptr' to current position in the string.
765 static inline
766 bool s_CheckRadix(const CTempString str, SIZE_TYPE& pos, int& base)
767 {
768  if ( base == 10 || base == 8 ) {
769  // shortcut for most frequent case
770  return true;
771  }
772  // Check base
773  if ( base < 0 || base == 1 || base > 36 ) {
774  return false;
775  }
776  // Try to determine base using first chars of the string
777  unsigned char ch = str[pos];
778  unsigned char next = str[pos+1];
779  if ( base == 0 ) {
780  if ( ch != '0' ) {
781  base = 10;
782  } else if (next == 'x' || next == 'X') {
783  base = 16;
784  } else {
785  base = 8;
786  }
787  }
788  // Remove leading '0x' for hex numbers
789  if ( base == 16 ) {
790  if (ch == '0' && (next == 'x' || next == 'X')) {
791  pos += 2;
792  }
793  }
794  return true;
795 }
796 
797 
// Convert 'str' to a signed 64-bit integer in the given radix.
//
// Visible steps: optional skipping of leading symbols, sign detection
// ('-' or '+'), radix validation/deduction via s_CheckRadix(), digit
// accumulation with an overflow pre-check and optional ',' grouping
// (CHECK_COMMAS), then application of the sign.
// NOTE(review): several statements of this function (the guard declaration,
// the bodies of some error branches, and the leading/trailing skip calls) are
// absent from this view; the comments below describe only what is visible.
798 Int8 NStr::StringToInt8(const CTempString str, TStringToNumFlags flags, int base)
799 {
801 
802  // Current position in the string
803  SIZE_TYPE pos = 0;
804 
805  // Skip allowed leading symbols
806  if (flags & fAllowLeadingSymbols) {
807  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
810  }
811  // Determine sign
812  bool sign = false;
813  switch (str[pos]) {
814  case '-':
815  sign = true;
816  /*FALLTHRU*/
817  case '+':
818  pos++;
819  break;
820  default:
821  if (flags & fMandatorySign) {
823  }
824  break;
825  }
    // Remember where the digits are expected to start, so that "no digits
    // consumed" can be detected after the loop.
826  SIZE_TYPE pos0 = pos;
827  // Check radix base
828  if ( !s_CheckRadix(str, pos, base) ) {
829  S2N_CONVERT_ERROR_RADIX(Int8, "bad numeric base '" +
830  NStr::IntToString(base)+ "'");
831  }
832 
833  // Begin conversion
834  Int8 n = 0;
    // Overflow limits: n may not exceed kMax_I8 / base, and at that boundary
    // the next digit may not exceed kMax_I8 % base (one more for negative
    // numbers, whose magnitude may be larger by one).
    // NOTE(review): for the most negative value the magnitude kMax_I8 + 1 is
    // accumulated in the signed 'n' before negation -- looks like it relies
    // on signed wrap-around; confirm.
835  Int8 limdiv = base==10? kMax_I8 / 10: kMax_I8 / base;
836  Int8 limoff = (base==10? kMax_I8 % 10: kMax_I8 % base) + (sign ? 1 : 0);
837 
838  // Number of symbols between two commas. '-1' means -- no comma yet.
839  int comma = -1;
840  SIZE_TYPE numpos = pos;
841 
    // The loop ends at the first zero character returned by str[pos].
842  while (char ch = str[pos]) {
843  int delta; // corresponding numeric value of 'ch'
844 
845  // Check on possible commas
846  CHECK_COMMAS;
847  // Sanity check
848  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
849  break;
850  }
851  // Overflow check
852  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
854  }
855  n *= base;
856  n += delta;
857  pos++;
858  }
859 
860  // Last checks
    // Condition is true when nothing was consumed after the sign, or when the
    // last comma group does not hold exactly 3 symbols.
861  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
863  }
864  // Skip allowed trailing symbols
866  bool spaces = ((flags & fAllowTrailingSymbols) ==
869  }
870  // Assign sign before the end pointer check
871  n = sign ? -n : n;
873 
874  return n;
875 }
876 
877 
879  TStringToNumFlags flags, int base)
880 {
882 
883  const TStringToNumFlags slow_flags =
885 
886  if ( base == 10 && (flags & slow_flags) == 0 ) {
887  // fast conversion
888 
889  // Current position in the string
890  CTempString::const_iterator ptr = str.begin(), end = str.end();
891 
892  // Determine sign
893  if ( ptr != end && *ptr == '+' ) {
894  ++ptr;
895  }
896  if ( ptr == end ) {
897  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
898  }
899 
900  // Begin conversion
901  Uint8 n = 0;
902 
903  const Uint8 limdiv = kMax_UI8/10;
904  const int limoff = int(kMax_UI8 % 10);
905 
906  do {
907  char ch = *ptr;
908  int delta = ch - '0';
909  if ( unsigned(delta) >= 10 ) {
910  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, ptr-str.begin());
911  }
912  // Overflow check
913  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
914  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, ptr-str.begin());
915  }
916  n = n*10+delta;
917  } while ( ++ptr != end );
918 
919  return n;
920  }
921 
922  // Current position in the string
923  SIZE_TYPE pos = 0, size = str.size();
924 
925  // Skip allowed leading symbols
926  if (flags & fAllowLeadingSymbols) {
927  bool spaces = ((flags & fAllowLeadingSymbols) == fAllowLeadingSpaces);
930  }
931  // Determine sign
932  if (str[pos] == '+') {
933  pos++;
934  } else {
935  if (flags & fMandatorySign) {
937  }
938  }
939  SIZE_TYPE pos0 = pos;
940 
941  // Begin conversion
942  Uint8 n = 0;
943  // Check radix base
944  if ( !s_CheckRadix(str, pos, base) ) {
945  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
946  NStr::IntToString(base) + "'");
947  }
948 
949  Uint8 limdiv = kMax_UI8 / base;
950  int limoff = int(kMax_UI8 % base);
951 
952  // Number of symbols between two commas. '-1' means -- no comma yet.
953  int comma = -1;
954  SIZE_TYPE numpos = pos;
955 
956  while (char ch = str[pos]) {
957  int delta; // corresponding numeric value of 'ch'
958 
959  // Check on possible commas
960  CHECK_COMMAS;
961  // Sanity check
962  if ( !s_IsGoodCharForRadix(ch, base, &delta) ) {
963  break;
964  }
965  // Overflow check
966  if ( n >= limdiv && (n > limdiv || delta > limoff) ) {
968  }
969  n *= base;
970  n += delta;
971  pos++;
972  }
973 
974  // Last checks
975  if ( pos == pos0 || ((comma >= 0) && (comma != 3)) ) {
977  }
978  // Skip allowed trailing symbols
980  bool spaces = ((flags & fAllowTrailingSymbols) ==
983  }
985  return n;
986 }
987 
988 
// Convert a NUL-terminated string to double, always using '.' as the decimal
// point (the current locale is not consulted here).
//
// Accepted input, as implemented below: leading whitespace, an optional
// '+'/'-' sign, decimal digits with at most one '.', and an optional exponent
// introduced by 'e'/'E' with its own optional sign. "nan", "inf" and
// "infinity" are recognized case-insensitively when no digits precede them.
//
// ptr     text to parse
// endptr  if not null, receives the address of the first unconsumed
//         character, or the original 'ptr' when nothing could be converted
// flags   only fDecimalPosixFinite is examined here: with it, out-of-range
//         results are clamped to DBL_MAX / DBL_MIN instead of HUGE_VAL / 0
//
// Errors are recorded in 'err_guard': EINVAL when there are no digits,
// ERANGE on overflow or underflow.
// NOTE(review): the declaration of 'err_guard' (listing line 991) is not
// visible in this view.
989 double NStr::StringToDoublePosix(const char* ptr, char** endptr, TStringToNumFlags flags)
990 {
992 
993  const char* start = ptr;
994  char c = *ptr++;
995 
996  // skip leading blanks
997  while ( isspace((unsigned char)c) ) {
998  c = *ptr++;
999  }
1000 
1001  int sign = 0;
1002  if ( c == '-' ) {
1003  sign = -1;
1004  c = *ptr++;
1005  }
1006  else if ( c == '+' ) {
1007  sign = +1;
1008  c = *ptr++;
1009  }
1010 
1011  if (c == 0) {
1012  if (endptr) {
1013  *endptr = (char*)start;
1014  }
1015  err_guard.Set(EINVAL);
1016  return 0.;
1017  }
1018 
1019  // short-cut - single digit
1020  if ( !*ptr && c >= '0' && c <= '9' ) {
1021  if (endptr) {
1022  *endptr = (char*)ptr;
1023  }
1024  double result = c-'0';
1025  // some compilers fail to negate zero
1026  return sign < 0 ? (c == '0' ? -0. : -result) : result;
1027  }
1028 
    // The mantissa is gathered in three pieces so that the first 18
    // significant digits are handled with integer arithmetic:
    //   first  - significant digits 1..9
    //   second - significant digits 10..18 (first_mul tracks 10^count)
    //   third  - all further digits, as long double (second_mul tracks 10^count)
    // 'digits' counts significant digits; 'dot_position' is that count at the
    // moment the '.' was seen (decremented for zeros that follow the dot
    // before the first non-zero digit).
1029  bool dot = false, expn = false, anydigits = false;
1030  int digits = 0, dot_position = 0;
1031  unsigned int first=0, second=0, first_mul=1;
1032  long double second_mul = NCBI_CONST_LONGDOUBLE(1.),
1033  third = NCBI_CONST_LONGDOUBLE(0.);
1034 
1035  // up to exponent
1036  for ( ; ; c = *ptr++ ) {
1037  if (c >= '0' && c <= '9') {
1038  // digits: accumulate
1039  c = (char)(c - '0');
1040  anydigits = true;
1041  ++digits;
1042  if (first == 0) {
1043  first = c;
1044  if ( first == 0 ) {
1045  // omit leading zeros
1046  --digits;
1047  if (dot) {
1048  --dot_position;
1049  }
1050  }
1051  } else if (digits <= 9) {
1052  // first 9 digits come to 'first'
1053  first = first*10 + c;
1054  } else if (digits <= 18) {
1055  // next 9 digits come to 'second'
1056  first_mul *= 10;
1057  second = second*10 + c;
1058  } else {
1059  // other digits come to 'third'
1060  second_mul *= NCBI_CONST_LONGDOUBLE(10.);
1061  third = third * NCBI_CONST_LONGDOUBLE(10.) + c;
1062  }
1063  }
1064  else if (c == '.') {
1065  // dot
1066  // if second dot, stop
1067  if (dot) {
1068  --ptr;
1069  break;
1070  }
1071  dot_position = digits;
1072  dot = true;
1073  }
1074  else if (c == 'e' || c == 'E') {
1075  // if exponent, stop
1076  if (!anydigits) {
1077  --ptr;
1078  break;
1079  }
1080  expn = true;
1081  break;
1082  }
1083  else {
        // Any other character ends the mantissa; step back so that 'ptr'
        // addresses that character.
1084  --ptr;
1085  if (!anydigits) {
1086  if ( !dot && (c == 'n' || c == 'N') &&
1087  NStr::strncasecmp(ptr,"nan",3)==0) {
1088  if (endptr) {
1089  *endptr = (char*)(ptr+3);
1090  }
            // HUGE_VAL/HUGE_VAL is used to produce a NaN; 'sign' is not
            // applied on this path.
1091  return HUGE_VAL/HUGE_VAL; /* NCBI_FAKE_WARNING */
1092  }
1093  if ( (c == 'i' || c == 'I') ) {
1094  if ( NStr::strncasecmp(ptr,"inf",3)==0) {
1095  ptr += 3;
1096  if ( NStr::strncasecmp(ptr,"inity",5)==0) {
1097  ptr += 5;
1098  }
1099  if (endptr) {
1100  *endptr = (char*)ptr;
1101  }
1102  return sign < 0 ? -HUGE_VAL : HUGE_VAL;
1103  }
1104  }
1105  }
1106  break;
1107  }
1108  }
1109  // if no digits, stop now - error
1110  if (!anydigits) {
1111  if (endptr) {
1112  *endptr = (char*)start;
1113  }
1114  err_guard.Set(EINVAL);
1115  return 0.;
1116  }
    // Decimal exponent implied by the position of the dot: minus the number
    // of counted digits that follow it.
1117  int exponent = dot ? dot_position - digits : 0;
1118 
1119  // read exponent
1120  if (expn && *ptr) {
1121  int expvalue = 0;
1122  bool expsign = false, expnegate= false;
1123  int expdigits= 0;
1124  for( ; ; ++ptr) {
1125  c = *ptr;
1126  // sign: should be no digits at this point
1127  if (c == '-' || c == '+') {
1128  // if there was sign or digits, stop
1129  if (expsign || expdigits) {
1130  break;
1131  }
1132  expsign = true;
1133  expnegate = c == '-';
1134  }
1135  // digits: accumulate
1136  else if (c >= '0' && c <= '9') {
1137  ++expdigits;
        // NOTE(review): the '>' test below keeps the old value when the
        // new one is not larger; for very long exponents 'expvalue*10'
        // can overflow 'int', which is undefined behavior -- looks like
        // it relies on wrap-around; confirm.
1138  int newexpvalue = expvalue*10 + (c-'0');
1139  if (newexpvalue > expvalue) {
1140  expvalue = newexpvalue;
1141  }
1142  }
1143  else {
1144  break;
1145  }
1146  }
1147  // if no digits, rollback
1148  if (!expdigits) {
1149  // rollback sign
1150  if (expsign) {
1151  --ptr;
1152  }
1153  // rollback exponent
1154  if (expn) {
1155  --ptr;
1156  }
1157  }
1158  else {
1159  exponent = expnegate ? exponent - expvalue : exponent + expvalue;
1160  }
1161  }
    // Reassemble the mantissa from its three pieces.
1162  long double ret;
1163  if ( first_mul > 1 ) {
1164  _ASSERT(first);
1165  ret = ((long double)first * first_mul + second)* second_mul + third;
1166  }
1167  else {
1168  _ASSERT(first_mul == 1);
1169  _ASSERT(second == 0);
1170  _ASSERT(second_mul == 1);
1171  _ASSERT(third == 0);
1172  ret = first;
1173  }
1174  // calculate exponent
1175  if ( first && exponent ) {
1176  // multiply by power of 10 only non-zero mantissa
1177  if (exponent > 2*DBL_MAX_10_EXP) {
1178  ret = (flags & fDecimalPosixFinite) ? DBL_MAX : HUGE_VAL;
1179  err_guard.Set(ERANGE);
1180  } else if (exponent < 2*DBL_MIN_10_EXP) {
1181  ret = (flags & fDecimalPosixFinite) ? DBL_MIN : 0.;
1182  err_guard.Set(ERANGE);
1183  } else {
        // Scale by 10^exponent using table lookups: the low 4 bits of the
        // exponent select from mul1 (10^0..10^15), the next 4 bits from
        // mul2 (10^0..10^240 in steps of 16), and whatever remains is
        // applied as repeated factors of 10^256.
1184  if ( exponent > 0 ) {
1185  static const double mul1[16] = {
1186  1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
1187  1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15
1188  };
1189  ret *= mul1[exponent&15];
1190  if ( exponent >>= 4 ) {
1191  static const long double mul2[16] = {
1192  NCBI_CONST_LONGDOUBLE(1e0),
1193  NCBI_CONST_LONGDOUBLE(1e16),
1194  NCBI_CONST_LONGDOUBLE(1e32),
1195  NCBI_CONST_LONGDOUBLE(1e48),
1196  NCBI_CONST_LONGDOUBLE(1e64),
1197  NCBI_CONST_LONGDOUBLE(1e80),
1198  NCBI_CONST_LONGDOUBLE(1e96),
1199  NCBI_CONST_LONGDOUBLE(1e112),
1200  NCBI_CONST_LONGDOUBLE(1e128),
1201  NCBI_CONST_LONGDOUBLE(1e144),
1202  NCBI_CONST_LONGDOUBLE(1e160),
1203  NCBI_CONST_LONGDOUBLE(1e176),
1204  NCBI_CONST_LONGDOUBLE(1e192),
1205  NCBI_CONST_LONGDOUBLE(1e208),
1206  NCBI_CONST_LONGDOUBLE(1e224),
1207  NCBI_CONST_LONGDOUBLE(1e240)
1208  };
1209  ret *= mul2[exponent&15];
1210  for ( exponent >>= 4; exponent; --exponent ) {
1211  ret *= NCBI_CONST_LONGDOUBLE(1e256);
1212  }
1213  }
1214  if (!finite(double(ret))) {
1215  if (flags & fDecimalPosixFinite) {
1216  ret = DBL_MAX;
1217  }
1218  err_guard.Set(ERANGE);
1219  }
1220  }
1221  else {
        // Negative exponent: same scheme with reciprocal powers of ten.
1222  exponent = -exponent;
1223  static const long double mul1[16] = {
1224  NCBI_CONST_LONGDOUBLE(1e-0),
1225  NCBI_CONST_LONGDOUBLE(1e-1),
1226  NCBI_CONST_LONGDOUBLE(1e-2),
1227  NCBI_CONST_LONGDOUBLE(1e-3),
1228  NCBI_CONST_LONGDOUBLE(1e-4),
1229  NCBI_CONST_LONGDOUBLE(1e-5),
1230  NCBI_CONST_LONGDOUBLE(1e-6),
1231  NCBI_CONST_LONGDOUBLE(1e-7),
1232  NCBI_CONST_LONGDOUBLE(1e-8),
1233  NCBI_CONST_LONGDOUBLE(1e-9),
1234  NCBI_CONST_LONGDOUBLE(1e-10),
1235  NCBI_CONST_LONGDOUBLE(1e-11),
1236  NCBI_CONST_LONGDOUBLE(1e-12),
1237  NCBI_CONST_LONGDOUBLE(1e-13),
1238  NCBI_CONST_LONGDOUBLE(1e-14),
1239  NCBI_CONST_LONGDOUBLE(1e-15)
1240  };
1241  ret *= mul1[exponent&15];
1242  if ( exponent >>= 4 ) {
1243  static const long double mul2[16] = {
1244  NCBI_CONST_LONGDOUBLE(1e-0),
1245  NCBI_CONST_LONGDOUBLE(1e-16),
1246  NCBI_CONST_LONGDOUBLE(1e-32),
1247  NCBI_CONST_LONGDOUBLE(1e-48),
1248  NCBI_CONST_LONGDOUBLE(1e-64),
1249  NCBI_CONST_LONGDOUBLE(1e-80),
1250  NCBI_CONST_LONGDOUBLE(1e-96),
1251  NCBI_CONST_LONGDOUBLE(1e-112),
1252  NCBI_CONST_LONGDOUBLE(1e-128),
1253  NCBI_CONST_LONGDOUBLE(1e-144),
1254  NCBI_CONST_LONGDOUBLE(1e-160),
1255  NCBI_CONST_LONGDOUBLE(1e-176),
1256  NCBI_CONST_LONGDOUBLE(1e-192),
1257  NCBI_CONST_LONGDOUBLE(1e-208),
1258  NCBI_CONST_LONGDOUBLE(1e-224),
1259  NCBI_CONST_LONGDOUBLE(1e-240)
1260  };
1261  ret *= mul2[exponent&15];
1262  for ( exponent >>= 4; exponent; --exponent ) {
1263  ret *= NCBI_CONST_LONGDOUBLE(1e-256);
1264  }
1265  }
1266  if ( ret < DBL_MIN ) {
1267  if (flags & fDecimalPosixFinite) {
1268  ret = DBL_MIN;
1269  }
1270  err_guard.Set(ERANGE);
1271  }
1272  }
1273  }
1274  }
1275  if ( sign < 0 ) {
1276  ret = -ret;
1277  }
1278  // done
1279  if (endptr) {
1280  *endptr = (char*)ptr;
1281  }
1282  return (double)ret;
1283 }
1284 
1285 
1286 /// @internal
/// Parse a zero-terminated character buffer into a double, honoring the
/// NStr string-to-number 'flags'.
///
/// NOTE(review): this listing has lost several source lines (the embedded
/// numbering skips 1288, 1291, 1295, 1301, 1303-1304, 1339, 1375 and
/// 1377-1378), so the declaration of 'flags' and a few statements are not
/// visible below -- restore them from the repository before compiling.
1287 static double s_StringToDouble(const char* str, size_t size,
1289 {
// The buffer must be terminated exactly at str[size].
1290  _ASSERT(str[size] == '\0');
1292  NCBI_THROW2(CStringException, eBadArgs,
1293  "NStr::StringToDouble(): mutually exclusive flags specified", 0);
1294  }
1296 
1297  // Current position in the string
1298  SIZE_TYPE pos = 0;
1299 
1300  // Skip allowed leading symbols
1302  bool spaces = ((flags & NStr::fAllowLeadingSymbols) ==
1305  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1306  }
1307  // Check mandatory sign
1308  if (flags & NStr::fMandatorySign) {
1309  switch (str[pos]) {
1310  case '-':
1311  case '+':
1312  break;
1313  default:
1314  S2N_CONVERT_ERROR_INVAL(double);
1315  }
1316  }
1317  // For consistency make additional check on incorrect leading symbols.
1318  // Because strtod() may just skip such symbols.
1319  if (!(flags & NStr::fAllowLeadingSymbols)) {
1320  char c = str[pos];
1321  if ( !isdigit((unsigned char)c) && !s_IsDecimalPoint(c,flags) && c != '-' && c != '+') {
1322  S2N_CONVERT_ERROR_INVAL(double);
1323  }
1324  }
1325 
1326  // Conversion
// errno is cleared first so that a value set by the conversion itself can be
// recognized afterwards.
1327  int& errno_ref = errno;
1328  errno_ref = 0;
1329 
1330  char* endptr = 0;
1331  const char* begptr = str + pos;
1332 
// fDecimalPosix selects the toolkit's own POSIX parser, otherwise strtod()
// is used.
1333  double n;
1334  if (flags & NStr::fDecimalPosix) {
1335  n = NStr::StringToDoublePosix(begptr, &endptr, flags);
1336  } else {
1337  n = strtod(begptr, &endptr);
1338  }
// NOTE(review): the condition that opens the following block is among the
// missing lines.  The visible body parses the same text again with
// StringToDoublePosix() and keeps that result when the first parse produced
// no end pointer or the second parse consumed more characters.
1340  char* endptr2 = 0;
1341  double n2 = NStr::StringToDoublePosix(begptr, &endptr2, flags);
1342  if (!endptr || (endptr2 && endptr2 > endptr)) {
1343  n = n2;
1344  endptr = endptr2;
1345  }
1346  }
// Nothing consumed at all -> invalid input.
1347  if ( !endptr || endptr == begptr ) {
1348  S2N_CONVERT_ERROR(double, kEmptyStr, EINVAL, s_DiffPtr(endptr, begptr) + pos);
1349  }
1350  // some libs set ERANGE, others do not
1351  // here, we do not consider ERANGE as error
1352  if ( errno_ref && errno_ref != ERANGE ) {
1353  S2N_CONVERT_ERROR(double, kEmptyStr, errno_ref, s_DiffPtr(endptr, begptr) + pos);
1354  }
1355  // special cases
// With fDecimalPosixFinite a non-zero, non-NaN result is clamped in magnitude
// to the range [DBL_MIN, DBL_MAX]; the sign is preserved.
1356  if ((flags & NStr::fDecimalPosixFinite) && n != 0. && !isnan(n))
1357  {
1358  bool is_negative = n < 0.;
1359  if (is_negative) {
1360  n = -n;
1361  }
1362  if ( n < DBL_MIN) {
1363  n = DBL_MIN;
1364  } else if (!finite(n)) {
1365  n = DBL_MAX;
1366  }
1367  if (is_negative) {
1368  n = -n;
1369  }
1370  }
1371 
// Advance the position past the characters consumed by the conversion.
1372  pos += s_DiffPtr(endptr, begptr);
1373 
1374  // Skip allowed trailing symbols
1376  bool spaces = ((flags & NStr::fAllowTrailingSymbols) ==
1379  }
1380  CHECK_ENDPTR(double);
1381  return n;
1382 }
1383 
1384 
1385 double NStr::StringToDoubleEx(const char* str, size_t size,
1386  TStringToNumFlags flags)
1388  return s_StringToDouble(str, size, flags);
1389 }
1390 
1391 
1392 double NStr::StringToDouble(const CTempStringEx str, TStringToNumFlags flags)
1393 {
1394  size_t size = str.size();
1395  if ( str.HasZeroAtEnd() ) {
1396  // string has zero at the end already
1397  return s_StringToDouble(str.data(), size, flags);
1398  }
1399  char buf[256]; // small temporary buffer on stack for appending zero char
1400  if ( size < sizeof(buf) ) {
1401  memcpy(buf, str.data(), size);
1402  buf[size] = '\0';
1403  return s_StringToDouble(buf, size, flags);
1404  }
1405  else {
1406  // use std::string() to allocate memory for appending zero char
1407  return s_StringToDouble(string(str).c_str(), size, flags);
1408  }
1409 }
1410 
1411 /// @internal
/// Apply an optional data-size qualifier found at str[pos] to 'value'.
///
/// As far as visible here: 'K', 'M' or 'G' (case-insensitive) multiplies the
/// value by 1024, 1024^2 or 1024^3 after checking that the product still fits
/// into Uint8; one directly following 'B'/'b' is consumed too.  'pos' is
/// advanced past every consumed character.  When str[pos] is the terminating
/// zero the value is returned unchanged.
///
/// NOTE(review): the listing has lost lines 1412, 1415, 1417 and 1452 of the
/// original file (presumably the function name with the 'str' parameter, the
/// 'flags' parameter, the first statement and the body of the 'default:'
/// branch) -- restore them from the repository before compiling.
1413  SIZE_TYPE& pos,
1414  Uint8 value,
1416 {
1418 
1419  unsigned char ch = str[pos];
// Nothing follows the number: no qualifier to apply.
1420  if ( !ch ) {
1421  return value;
1422  }
1423 
1424  ch = (unsigned char)toupper(ch);
1425  Uint8 v = value;
1426  bool err = false;
1427 
// Each case checks for overflow *before* multiplying: the product fits only
// if v does not exceed kMax_UI8 divided by the multiplier.
1428  switch(ch) {
1429  case 'K':
1430  pos++;
1431  if ((kMax_UI8 / 1024) < v) {
1432  err = true;
1433  }
1434  v *= 1024;
1435  break;
1436  case 'M':
1437  pos++;
1438  if ((kMax_UI8 / 1024 / 1024) < v) {
1439  err = true;
1440  }
1441  v *= 1024 * 1024;
1442  break;
1443  case 'G':
1444  pos++;
1445  if ((kMax_UI8 / 1024 / 1024 / 1024) < v) {
1446  err = true;
1447  }
1448  v *= 1024 * 1024 * 1024;
1449  break;
1450  default:
1451  // error -- the "qual" points to the last unprocessed symbol
1453  }
1454  if ( err ) {
1455  S2N_CONVERT_ERROR_OVERFLOW(DataSize);
1456  }
1457 
// Accept an optional byte marker after the multiplier ("KB", "mb", ...).
1458  ch = str[pos];
1459  if ( ch && toupper(ch) == 'B' ) {
1460  pos++;
1461  }
1462  return v;
1463 }
1464 
1465 
1467  TStringToNumFlags flags,
1468  int base)
1469 {
// Convert text such as "<number>[K|M|G][B]" to a Uint8 in the given radix:
// the numeric part is converted with StringToUInt8() and a following size
// qualifier is applied by s_DataSizeConvertQual().
//
// NOTE(review): the first line of the signature (original line 1466) and
// lines 1475, 1485-1486, 1496, 1536-1537 and 1539 are missing from this
// listing -- restore them from the repository before compiling.
1470  // We have a limited base range here
1471  if ( base < 2 || base > 16 ) {
1472  NCBI_THROW2(CStringException, eConvert,
1473  "Bad numeric base '" + NStr::IntToString(base)+ "'", 0);
1474  }
1476 
1477  // Current position in the string
1478  SIZE_TYPE pos = 0;
1479 
1480  // Find end of number representation
1481  {{
1482  // Skip allowed leading symbols
1483  if (flags & fAllowLeadingSymbols) {
1484  bool spaces = ((flags & fAllowLeadingSymbols) ==
1487  spaces ? eSkipSpacesOnly : eSkipAllAllowed, flags);
1488  }
1489  // Determine sign
1490  if (str[pos] == '+') {
1491  pos++;
1492  // The sign has been consumed here, so strip fMandatorySign before the
1493  // flags are passed on to StringToUInt8() below.
1493  flags &= ~fMandatorySign;
1494  } else {
1495  if (flags & fMandatorySign) {
1497  }
1498  }
1499  // Check radix base
1500  if ( !s_CheckRadix(str, pos, base) ) {
1501  S2N_CONVERT_ERROR_RADIX(Uint8, "bad numeric base '" +
1502  NStr::IntToString(base) + "'");
1503  }
1504  }}
1505 
// Scan forward over characters valid for the radix (and over commas when
// fAllowCommas is set); 'ch' ends up holding the first character after the
// numeric part, or zero at the end of the string.
1506  SIZE_TYPE numpos = pos;
1507  char ch = str[pos];
1508  while (ch) {
1509  if ( !s_IsGoodCharForRadix(ch, base) &&
1510  ((ch != ',') || !(flags & fAllowCommas)) ) {
1511  break;
1512  }
1513  ch = str[++pos];
1514  }
1515  // If no numeric characters were found, pass the whole remaining string
1516  // to the conversion (for correct error reporting)
1517  if (pos-numpos == 0) {
1518  pos = str.length();
1519  }
1520 
1521  // Convert to number
1522  Uint8 n = StringToUInt8(CTempString(str.data()+numpos, pos-numpos),
1523  flags, base);
1524  if ( !n && errno ) {
1525  // If exceptions are enabled, one has already been thrown.
1526  // The errno is also set, so just return a zero.
1527  return 0;
1528  }
1529  // Check trailer (KB, MB, ...)
1530  if ( ch ) {
1531  n = s_DataSizeConvertQual(str, pos, n, flags);
1532  }
1533  // Skip allowed trailing symbols
1534  if (flags & fAllowTrailingSymbols) {
1535  bool spaces = ((flags & fAllowTrailingSymbols) ==
1538  }
1540  return n;
1541 }
1542 
1543 
1545  TStringToNumFlags flags /* = 0 */)
1546 {
// Convert a decimal data-size string ("10", "1.5 KB", "3MiB", "1,024 K", ...)
// to a number of bytes.  The work is done on arrays of decimal digits so that
// fractions and large multipliers do not lose precision; only the final digit
// string is folded into a Uint8, with an overflow check.
//
// NOTE(review): the first line of the signature (original line 1544) and
// lines 1550-1551, 1553-1554, 1559 and 1634 are missing from this listing --
// restore them from the repository before compiling.
1547  TStringToNumFlags allowed_flags = fConvErr_NoThrow +
1548  fMandatorySign +
1549  fAllowCommas +
1552  fDS_ForceBinary +
1555 
// Any flag outside the allowed set is a programming error.
1556  if ((flags & allowed_flags) != flags) {
1557  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
1558  }
1560 
1561  const char* str_ptr = str.data();
1562  const char* str_end = str_ptr + str.size();
// Skip leading symbols.  Scanning stops at the first digit, or at a sign that
// is directly followed by a digit when fMandatorySign is set.  Whitespace is
// always skipped; any other character is skipped only when more than leading
// spaces is allowed.
1563  if (flags & fAllowLeadingSymbols) {
1564  bool allow_all = (flags & fAllowLeadingSymbols) != fAllowLeadingSpaces;
1565  for (; str_ptr < str_end; ++str_ptr) {
1566  char c = *str_ptr;
1567  if (isdigit(c))
1568  break;
1569  if (isspace(c))
1570  continue;
1571  if ((c == '+' || c == '-') && (flags & fMandatorySign)
1572  && str_ptr + 1 < str_end && isdigit(*(str_ptr + 1)))
1573  {
1574  break;
1575  }
1576  if (!allow_all)
1577  break;
1578  }
1579  }
1580 
// A '+' is consumed; a '-' (the result is unsigned) or a missing mandatory
// sign is a conversion error.
1581  if (str_ptr < str_end && *str_ptr == '+') {
1582  ++str_ptr;
1583  }
1584  else if ((str_ptr < str_end && *str_ptr == '-')
1585  || (flags & fMandatorySign))
1586  {
1587  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1588  }
1589 
1590  const char* num_start = str_ptr;
1591  bool have_dot = false;
1592  bool allow_commas = (flags & fAllowCommas) != 0;
1593  bool allow_dot = (flags & fDS_ProhibitFractions) == 0;
1594  Uint4 digs_pre_dot = 0, digs_post_dot = 0;
1595 
// Scan the numeric part, counting digits before and after the decimal point.
// A separator ('.' or ',') ends the number when it comes first, when it
// follows the decimal point, or when it directly follows a comma (in which
// case that comma is excluded from the number as well).
1596  for (; str_ptr < str_end; ++str_ptr) {
1597  char c = *str_ptr;
1598  if (isdigit(c)) {
1599  if (have_dot)
1600  ++digs_post_dot;
1601  else
1602  ++digs_pre_dot;
1603  }
1604  else if (c == '.' && allow_dot) {
1605  if (have_dot || str_ptr == num_start)
1606  break;
1607  if (*(str_ptr - 1) == ',') {
1608  --str_ptr;
1609  break;
1610  }
1611  have_dot = true;
1612  }
1613  else if (c == ',' && allow_commas) {
1614  if (have_dot || str_ptr == num_start)
1615  break;
1616  if (*(str_ptr - 1) == ',') {
1617  --str_ptr;
1618  break;
1619  }
1620  }
1621  else
1622  break;
1623  }
// A trailing '.' without fraction digits, or a trailing ',', is not part of
// the number: step back over it.
1624  if (have_dot && digs_post_dot == 0)
1625  --str_ptr;
1626  else if (str_ptr > num_start && *(str_ptr - 1) == ',')
1627  --str_ptr;
1628 
1629  const char* num_end = str_ptr;
1630  if (num_start == num_end) {
1631  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1632  }
// Tentatively accept one space between the number and the suffix; it is given
// back further below if no suffix follows.
// NOTE(review): the second half of this condition (line 1634) is missing.
1633  if (str_ptr < str_end && *str_ptr == ' '
1635  {
1636  ++str_ptr;
1637  }
1638  char suff_c = 0;
1639  if (str_ptr < str_end)
1640  suff_c = (char)toupper(*str_ptr);
1641 
// s_BinCoefs[i] is 1024^(i+1) written in decimal, matching s_Suffixes[i].
1642  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
1643  static const char* const s_BinCoefs[] = {"1024", "1048576", "1073741824",
1644  "1099511627776",
1645  "1125899906842624",
1646  "1152921504606846976"};
1647  static const Uint4 s_NumSuffixes = (Uint4)(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
1648 
1649  bool binary_suff = (flags & fDS_ForceBinary) != 0;
1650  Uint4 suff_idx = 0;
1651  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
1652  if (suff_c == s_Suffixes[suff_idx])
1653  break;
1654  }
// After a multiplier letter, "iB" selects binary multipliers and a plain "B"
// is simply consumed.  Without a multiplier, a lone "B" is accepted; failing
// that, the tentatively consumed space is returned.
1655  if (suff_idx < s_NumSuffixes) {
1656  ++str_ptr;
1657  if (str_ptr + 1 < str_end && toupper(*str_ptr) == 'I'
1658  && toupper(*(str_ptr + 1)) == 'B')
1659  {
1660  str_ptr += 2;
1661  binary_suff = true;
1662  }
1663  else if (str_ptr < str_end && toupper(*str_ptr) == 'B')
1664  ++str_ptr;
1665  }
1666  else if (suff_c == 'B') {
1667  ++str_ptr;
1668  }
1669  else if (*(str_ptr - 1) == ' ')
1670  --str_ptr;
1671 
// Skip trailing symbols: whitespace always, anything else only when more
// than trailing spaces is allowed.
1672  if (flags & fAllowTrailingSymbols) {
1673  bool allow_all = (flags & fAllowTrailingSymbols) != fAllowTrailingSpaces;
1674  for (; str_ptr < str_end; ++str_ptr) {
1675  char c = *str_ptr;
1676  if (isspace(c))
1677  continue;
1678  if (!allow_all)
1679  break;
1680  }
1681  }
// Anything left over at this point makes the whole string invalid.
1682  if (str_ptr != str_end) {
1683  S2N_CONVERT_ERROR(Uint8, kEmptyStr, EINVAL, str_ptr - str.data());
1684  }
1685 
// Collect the digits of the number (separators dropped) as values 0..9.
1686  Uint4 orig_digs = digs_pre_dot + digs_post_dot;
1687  AutoArray<Uint1> orig_num(orig_digs);
1688  str_ptr = num_start;
1689  for (Uint4 i = 0; str_ptr < num_end; ++str_ptr) {
1690  if (*str_ptr == ',' || *str_ptr == '.')
1691  continue;
1692  orig_num[i++] = Uint1(*str_ptr - '0');
1693  }
1694 
1695  Uint1* num_to_conv = orig_num.get();
1696  Uint4 digs_to_conv = digs_pre_dot;
1697  AutoArray<Uint1> mul_num;
// Binary multiplier: schoolbook long multiplication of the digit array by the
// decimal digits of the coefficient, accumulated into the zero-filled mul_num.
1698  if (binary_suff && suff_idx < s_NumSuffixes) {
1699  const char* coef = s_BinCoefs[suff_idx];
1700  Uint4 coef_size = Uint4(strlen(coef));
1701  mul_num = new Uint1[orig_digs + coef_size];
1702  memset(mul_num.get(), 0, orig_digs + coef_size);
1703  for (Uint4 coef_i = 0; coef_i < coef_size; ++coef_i) {
1704  Uint1 coef_d = Uint1(coef[coef_i] - '0');
1705  Uint1 carry = 0;
1706  Uint4 res_idx = orig_digs + coef_i;
1707  for (int orig_i = orig_digs - 1; orig_i >= 0; --orig_i, --res_idx) {
1708  Uint1 orig_d = orig_num[orig_i];
1709  Uint1 res_d = Uint1(coef_d * orig_d + carry + mul_num[res_idx]);
1710  carry = 0;
1711  while (res_d >= 10) {
1712  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1713  ++carry;
1714  }
1715  mul_num[res_idx] = res_d;
1716  }
1717  _ASSERT(carry <= 9);
// Propagate the remaining carry into the more significant digits.
1718  for (; carry != 0; --res_idx) {
1719  Uint1 res_d = Uint1(mul_num[res_idx] + carry);
1720  carry = 0;
1721  while (res_d >= 10) {
1722  res_d = (Uint1)(res_d - 10); // res_d -= 10;
1723  ++carry;
1724  }
1725  mul_num[res_idx] = res_d;
1726  }
1727  }
// The last digs_post_dot digits of the product are its fractional part;
// leading zeros of the integer part are skipped.
1728  digs_to_conv = orig_digs + coef_size - digs_post_dot;
1729  num_to_conv = mul_num.get();
1730  while (digs_to_conv > 1 && *num_to_conv == 0) {
1731  --digs_to_conv;
1732  ++num_to_conv;
1733  }
1734  }
// Decimal multiplier 10^(3*(suff_idx+1)): move the decimal point to the right
// through the fraction digits, padding with zeros when they run out.
1735  else if (suff_idx < s_NumSuffixes) {
1736  Uint4 coef_size = (suff_idx + 1) * 3;
1737  if (coef_size <= digs_post_dot) {
1738  digs_to_conv += coef_size;
1739  digs_post_dot -= coef_size;
1740  }
1741  else {
1742  digs_to_conv += digs_post_dot;
1743  coef_size -= digs_post_dot;
1744  digs_post_dot = 0;
1745  mul_num = new Uint1[digs_to_conv + coef_size];
1746  memmove(mul_num.get(), num_to_conv, digs_to_conv);
1747  memset(mul_num.get() + digs_to_conv, 0, coef_size);
1748  num_to_conv = mul_num.get();
1749  digs_to_conv += coef_size;
1750  }
1751  }
1752 
// Fold the integer digits into a Uint8, reporting ERANGE before the next
// "n * 10 + d" step would exceed kMax_UI8.
1753  const Uint8 limdiv = kMax_UI8/10;
1754  const int limoff = int(kMax_UI8 % 10);
1755  Uint8 n = 0;
1756  for (Uint4 i = 0; i < digs_to_conv; ++i) {
1757  Uint1 d = num_to_conv[i];
1758  if (n >= limdiv && (n > limdiv || d > limoff)) {
1759  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, i);
1760  }
1761  n *= 10;
1762  n += d;
1763  }
// Round half up on the first remaining fraction digit.
1764  if (digs_post_dot != 0 && num_to_conv[digs_to_conv] >= 5) {
1765  if (n == kMax_UI8) {
1766  S2N_CONVERT_ERROR(Uint8, kEmptyStr, ERANGE, digs_to_conv);
1767  }
1768  ++n;
1769  }
1770  return n;
1771 }
1772 
1773 
1774 size_t NStr::StringToSizet(const CTempString str,
1775  TStringToNumFlags flags, int base)
1776 {
1777 #if (SIZEOF_SIZE_T > 4)
1778  return StringToUInt8(str, flags, base);
1779 #else
1780  return StringToUInt(str, flags, base);
1781 #endif
1783 
1784 
1785 /// @internal
1786 template <typename T>
1787 static void s_UnsignedOtherBaseToString(string& out_str,
1788  T value,
1790  int base)
1791 {
1792  _ASSERT(base != 10);
1793 
1794  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1795  char buffer[kBufSize + 2]; // +2 for fWithRadix
1796  char* pos = buffer + kBufSize;
1797  const char* kDigit = (flags & NStr::fUseLowercase) ? kDigitLower : kDigitUpper;
1798 
1799  out_str.erase();
1800 
1801  if ( base == 16 ) {
1802  if ( flags & NStr::fWithRadix ) {
1803  out_str.append("0x");
1804  }
1805 
1806  do {
1807  *--pos = kDigit[value % 16];
1808  value /= 16;
1809  } while ( value );
1810  }
1811  else if ( base == 8 ) {
1812  if ( flags & NStr::fWithRadix ) {
1813  out_str.append("0");
1814  if ( value == 0 ) {
1815  // to prevent "00"
1816  return;
1817  }
1818  }
1819  do {
1820  *--pos = kDigit[value % 8];
1821  value /= 8;
1822  } while ( value );
1823  }
1824  else {
1825  do {
1826  *--pos = kDigit[value % base];
1827  value /= base;
1828  } while ( value );
1829  }
1830  out_str.append(pos, buffer + kBufSize - pos);
1831 }
1832 
1833 
1834 /// @internal
1835 static void s_SignedBase10ToString(string& out_str,
1836  unsigned long value,
1837  long svalue,
1839  int base)
1840 {
1841  _ASSERT(base == 10);
1842 
1843  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1844  char buffer[kBufSize+2];
1845  char* pos = buffer + kBufSize;
1846 
1847  if (svalue < 0) {
1848  value = static_cast<unsigned long>(-svalue);
1849  }
1850  if ((flags & NStr::fWithCommas)) {
1851  int cnt = -1;
1852  do {
1853  if (++cnt == 3) {
1854  *--pos = ',';
1855  cnt = 0;
1856  }
1857  *--pos = '0' + value % 10;
1858  value /= 10;
1859  } while (value);
1860  }
1861  else {
1862  do {
1863  *--pos = '0' + value % 10;
1864  value /= 10;
1865  } while (value);
1866  }
1867 
1868  if (svalue < 0)
1869  *--pos = '-';
1870  else if (flags & NStr::fWithSign)
1871  *--pos = '+';
1873  out_str.assign(pos, buffer + kBufSize - pos);
1874 }
1875 
1876 
1877 void NStr::IntToString(string& out_str, int svalue,
1878  TNumToStringFlags flags, int base)
1879 {
1880  if ( base < 2 || base > 36 ) {
1881  CNcbiError::SetErrno(errno = EINVAL);
1882  return;
1883  }
1884  unsigned int value = static_cast<unsigned int>(svalue);
1885  if ( base == 10 ) {
1886  s_SignedBase10ToString(out_str, value, svalue, flags, base);
1887  } else {
1888  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1889  }
1890  errno = 0;
1891 }
1892 
1893 
1894 void NStr::LongToString(string& out_str, long svalue,
1895  TNumToStringFlags flags, int base)
1896 {
1897  if ( base < 2 || base > 36 ) {
1898  CNcbiError::SetErrno(errno = EINVAL);
1899  return;
1900  }
1901  unsigned long value = static_cast<unsigned long>(svalue);
1902  if ( base == 10 ) {
1903  s_SignedBase10ToString(out_str, value, svalue, flags, base);
1904  } else {
1905  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1906  }
1907  errno = 0;
1908 }
1909 
1910 
1911 void NStr::ULongToString(string& out_str,
1912  unsigned long value,
1913  TNumToStringFlags flags,
1914  int base)
1915 {
1916  if ( base < 2 || base > 36 ) {
1917  CNcbiError::SetErrno(errno = EINVAL);
1918  return;
1919  }
1920  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
1921  char buffer[kBufSize];
1922  char* pos = buffer + kBufSize;
1923  out_str.erase();
1924 
1925  if ( base == 10 ) {
1926  if ( (flags & fWithCommas) ) {
1927  int cnt = -1;
1928  do {
1929  if (++cnt == 3) {
1930  *--pos = ',';
1931  cnt = 0;
1932  }
1933  *--pos = '0' + value % 10;
1934  value /= 10;
1935  } while ( value );
1936  }
1937  else {
1938  do {
1939  *--pos = '0' + value % 10;
1940  value /= 10;
1941  } while ( value );
1942  }
1943 
1944  if ( (flags & fWithSign) ) {
1945  *--pos = '+';
1946  }
1947  out_str.assign(pos, buffer + kBufSize - pos);
1948  }
1949  else {
1950  s_UnsignedOtherBaseToString(out_str, value, flags, base);
1951  }
1952  errno = 0;
1953 }
1954 
1955 
1957 // On some platforms division of Int8 is very slow,
1958 // so will try to optimize it working with chunks.
1959 // Works only for radix base == 10.
1961 #define PRINT_INT8_CHUNK 1000000000
1962 #define PRINT_INT8_CHUNK_SIZE 9
1963 
1964 /// @internal
1965 static char* s_PrintBase10Uint8(char* pos,
1966  Uint8 value,
1968 {
1969  if ( (flags & NStr::fWithCommas) ) {
1970  int cnt = -1;
1971 #ifdef PRINT_INT8_CHUNK
1972  // while n doesn't fit in Uint4 process the number
1973  // by 9-digit chunks within 32-bit Uint4
1974  while ( value & ~Uint8(Uint4(~0)) ) {
1975  Uint4 chunk = Uint4(value);
1977  chunk -= PRINT_INT8_CHUNK*Uint4(value);
1978  char* end = pos - PRINT_INT8_CHUNK_SIZE - 2; // 9-digit chunk should have 2 commas
1979  do {
1980  if (++cnt == 3) {
1981  *--pos = ',';
1982  cnt = 0;
1983  }
1984  *--pos = '0' + chunk % 10;
1985  chunk /= 10;
1986  } while ( pos != end );
1987  }
1988  // process all remaining digits in 32-bit number
1989  Uint4 chunk = Uint4(value);
1990  do {
1991  if (++cnt == 3) {
1992  *--pos = ',';
1993  cnt = 0;
1994  }
1995  *--pos = '0' + chunk % 10;
1996  chunk /= 10;
1997  } while ( chunk );
1998 #else
1999  do {
2000  if (++cnt == 3) {
2001  *--pos = ',';
2002  cnt = 0;
2003  }
2004  *--pos = '0' + value % 10;
2005  value /= 10;
2006  } while ( value );
2007 #endif
2008  }
2009  else {
2010 #ifdef PRINT_INT8_CHUNK
2011  // while n doesn't fit in Uint4 process the number
2012  // by 9-digit chunks within 32-bit Uint4
2013  while ( value & ~Uint8(Uint4(~0)) ) {
2014  Uint4 chunk = Uint4(value);
2016  chunk -= PRINT_INT8_CHUNK*Uint4(value);
2017  char* end = pos - PRINT_INT8_CHUNK_SIZE;
2018  do {
2019  *--pos = '0' + chunk % 10;
2020  chunk /= 10;
2021  } while ( pos != end );
2022  }
2023  // process all remaining digits in 32-bit number
2024  Uint4 chunk = Uint4(value);
2025  do {
2026  *--pos = '0' + chunk % 10;
2027  chunk /= 10;
2028  } while ( chunk );
2029 #else
2030  do {
2031  *--pos = '0' + value % 10;
2032  value /= 10;
2033  } while ( value );
2034 #endif
2035  }
2036  return pos;
2037 }
2038 
2039 
2040 void NStr::Int8ToString(string& out_str, Int8 svalue,
2041  TNumToStringFlags flags, int base)
2042 {
2043  if ( base < 2 || base > 36 ) {
2044  CNcbiError::SetErrno(errno = EINVAL);
2045  return;
2046  }
2047  Uint8 value;
2048  if (base == 10) {
2049  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
2050  char buffer[kBufSize];
2051 
2052  value = static_cast<Uint8>(svalue<0?-svalue:svalue);
2053  char* pos = s_PrintBase10Uint8(buffer + kBufSize, value, flags);
2054  if (svalue < 0)
2055  *--pos = '-';
2056  else if (flags & fWithSign)
2057  *--pos = '+';
2058  out_str.assign(pos, buffer + kBufSize - pos);
2059  } else {
2060  value = static_cast<Uint8>(svalue);
2061  s_UnsignedOtherBaseToString(out_str, value, flags, base);
2062  }
2063  errno = 0;
2064 }
2065 
2066 
2067 void NStr::UInt8ToString(string& out_str, Uint8 value,
2068  TNumToStringFlags flags, int base)
2069 {
2070  if ( base < 2 || base > 36 ) {
2071  CNcbiError::SetErrno(errno = EINVAL);
2072  return;
2073  }
2074  if (base == 10) {
2075  const SIZE_TYPE kBufSize = CHAR_BIT * sizeof(value);
2076  char buffer[kBufSize];
2077 
2078  char* pos = s_PrintBase10Uint8(buffer + kBufSize, value, flags);
2079  if ( flags & fWithSign ) {
2080  *--pos = '+';
2081  }
2082  out_str.assign(pos, buffer + kBufSize - pos);
2083  } else {
2084  s_UnsignedOtherBaseToString(out_str, value, flags, base);
2085  }
2086  errno = 0;
2087 }
2088 
2089 
// Format a byte count as a short human-readable size: a number with at most
// 'max_digits' significant digits (never fewer than 3), optionally followed
// by a K/M/G/T/P/E suffix.  Decimal multipliers (powers of 1000) are used
// unless fDS_Binary is set, in which case powers of 1024 are used.  The last
// printed digit is rounded half up.
//
// NOTE(review): lines 2098-2099, 2101, 2294, 2301 and 2311 of the original
// file (some allowed flags, two statements in the suffix output and the
// closing brace of the function) are missing from this listing -- restore
// them from the repository before compiling.
2090 void NStr::UInt8ToString_DataSize(string& out_str,
2091  Uint8 value,
2092  TNumToStringFlags flags /* = 0 */,
2093  unsigned int max_digits /* = 3 */)
2094 {
2095  TNumToStringFlags allowed_flags = fWithSign +
2096  fWithCommas +
2097  fDS_Binary +
2100  fDS_ShortSuffix +
2102 
// Any flag outside the allowed set is a programming error.
2103  if ((flags & allowed_flags) != flags) {
2104  NCBI_THROW2(CStringException, eConvert, "Wrong set of flags", 0);
2105  }
2106 
2107  if (max_digits < 3)
2108  max_digits = 3;
2109 
2110  static const char s_Suffixes[] = {'K', 'M', 'G', 'T', 'P', 'E'};
2111  static const Uint4 s_NumSuffixes = Uint4(sizeof(s_Suffixes) / sizeof(s_Suffixes[0]));
2112 
// Scratch buffer.  After either branch below: [num_start, dot_ptr) holds the
// integer digits, [dot_ptr, num_end) the fraction digits, digs_pre_dot is the
// number of integer digits and suff_idx is 0 for "no suffix" or 1-based into
// s_Suffixes.
2113  static const SIZE_TYPE kBufSize = 50;
2114  char buffer[kBufSize];
2115  char* num_start;
2116  char* dot_ptr;
2117  char* num_end;
2118  Uint4 digs_pre_dot, suff_idx;
2119 
2120  if (!(flags &fDS_Binary)) {
2121  static const Uint8 s_Coefs[] = {1000, 1000000, 1000000000,
2122  NCBI_CONST_UINT8(1000000000000),
2123  NCBI_CONST_UINT8(1000000000000000),
2124  NCBI_CONST_UINT8(1000000000000000000)};
// Pick the smallest multiplier that the value does not reach.
2125  suff_idx = 0;
2126  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2127  if (value < s_Coefs[suff_idx])
2128  break;
2129  }
// Print all digits at the end of the buffer; the extra '0' in front of them
// receives a possible carry from rounding.  With a decimal multiplier the
// decimal point simply sits 3 * suff_idx digits from the end.
2130  num_start = s_PrintBase10Uint8(buffer + kBufSize, value, 0);
2131  num_start[-1] = '0';
2132  dot_ptr = buffer + kBufSize - 3 * suff_idx;
2133  digs_pre_dot = Uint4(dot_ptr - num_start);
2134  if (!(flags & fDS_NoDecimalPoint)) {
2135  num_end = min(buffer + kBufSize, dot_ptr + (max_digits - digs_pre_dot));
2136  }
2137  else {
// No fraction allowed: use smaller suffixes while the extra three integer
// digits still fit into max_digits.
2138  while (suff_idx > 0 && max_digits - digs_pre_dot >= 3) {
2139  --suff_idx;
2140  digs_pre_dot += 3;
2141  dot_ptr += 3;
2142  }
2143  num_end = dot_ptr;
2144  }
// Round half up on the first dropped digit and propagate the carry leftwards
// (a digit that became '0' + 10 wraps to '0').
2145  char* round_dig = num_end - 1;
2146  if (num_end < buffer + kBufSize && *num_end >= '5')
2147  ++(*round_dig);
2148  while (*round_dig == '0' + 10) {
2149  *round_dig = '0';
2150  --round_dig;
2151  ++(*round_dig);
2152  }
// The carry reached the spare leading digit: the number became one digit
// longer, which may require the next larger suffix.
2153  if (round_dig < num_start) {
2154  _ASSERT(num_start - round_dig == 1);
2155  num_start = round_dig;
2156  ++digs_pre_dot;
2157  if (!(flags & fDS_NoDecimalPoint)) {
2158  if (digs_pre_dot > 3) {
2159  ++suff_idx;
2160  digs_pre_dot -= 3;
2161  dot_ptr -= 3;
2162  }
2163  --num_end;
2164  }
2165  else {
2166  if (digs_pre_dot > max_digits) {
2167  ++suff_idx;
2168  digs_pre_dot -= 3;
2169  dot_ptr -= 3;
2170  num_end = dot_ptr;
2171  }
2172  }
2173  }
2174  }
2175  else {
// Binary multipliers: s_Coefs[i] is 1024^i.  In this branch suff_idx is kept
// one higher than in the decimal branch (s_Coefs[suff_idx - 1] is the current
// multiplier) and is decremented once at the very end.
2176  static const Uint8 s_Coefs[] = {1, 1024, 1048576, 1073741824,
2177  NCBI_CONST_UINT8(1099511627776),
2178  NCBI_CONST_UINT8(1125899906842624),
2179  NCBI_CONST_UINT8(1152921504606846976)};
2180 
2181  suff_idx = 1;
2182  for (; suff_idx < s_NumSuffixes; ++suff_idx) {
2183  if (value < s_Coefs[suff_idx])
2184  break;
2185  }
2186  bool can_try_another = true;
// Re-entry point: every "goto try_another_suffix" restarts the formatting
// with the current value of suff_idx.
2187 try_another_suffix:
2188  Uint8 mul_coef = s_Coefs[suff_idx - 1];
2189  Uint8 whole_num = value / mul_coef;
// With only three digits available a four-digit integer part (1000..1023)
// cannot be shown, so move on to the next larger suffix.
2190  if (max_digits == 3 && whole_num >= 1000) {
2191  ++suff_idx;
2192  goto try_another_suffix;
2193  }
2194  num_start = s_PrintBase10Uint8(buffer + kBufSize, whole_num, 0);
2195  num_start[-1] = '0';
2196  digs_pre_dot = Uint4(buffer + kBufSize - num_start);
// Without a decimal point, try smaller suffixes as long as the resulting
// integer still fits into max_digits.  The trial numbers are printed into the
// middle of the buffer so that the current number at its end is preserved.
2197  if (max_digits - digs_pre_dot >= 3 && (flags & fDS_NoDecimalPoint)
2198  && suff_idx != 1 && can_try_another)
2199  {
2200  Uint4 new_suff = suff_idx - 1;
2201 try_even_more_suffix:
2202  Uint8 new_num = value / s_Coefs[new_suff - 1];
2203  char* new_start = s_PrintBase10Uint8(buffer + kBufSize / 2, new_num, 0);
2204  Uint4 new_digs = Uint4(buffer + kBufSize / 2 - new_start);
2205  if (new_digs <= max_digits) {
2206  if (max_digits - digs_pre_dot >= 3 && new_suff != 1) {
2207  --new_suff;
2208  goto try_even_more_suffix;
2209  }
2210  suff_idx = new_suff;
2211  can_try_another = false;
2212  goto try_another_suffix;
2213  }
2214  if (new_suff != suff_idx - 1) {
2215  suff_idx = new_suff + 1;
2216  can_try_another = false;
2217  goto try_another_suffix;
2218  }
2219  }
// Move the integer digits (with the spare leading '0') to the start of the
// buffer so that fraction digits can be appended after them.
2220  memcpy(buffer, num_start - 1, digs_pre_dot + 1);
2221  num_start = buffer + 1;
2222  dot_ptr = num_start + digs_pre_dot;
// One digit more than will be printed is generated; it is used for rounding.
2223  Uint4 cnt_more_digs = 1;
2224  if (!(flags & fDS_NoDecimalPoint))
2225  cnt_more_digs += min(max_digits - digs_pre_dot, 3 * (suff_idx - 1));
2226  num_end = dot_ptr;
// Long division of the remainder by the multiplier, one decimal digit per
// iteration.
2227  Uint8 left_val = value - whole_num * mul_coef;
2228  do {
2229  left_val *= 10;
2230  Uint1 d = Uint1(left_val / mul_coef);
2231  *num_end = char(d + '0');
2232  ++num_end;
2233  left_val -= d * mul_coef;
2234  --cnt_more_digs;
2235  }
2236  while (cnt_more_digs != 0);
2237  --num_end;
2238 
// Round half up on the extra digit and propagate the carry leftwards.
2239  char* round_dig = num_end - 1;
2240  if (*num_end >= '5')
2241  ++(*round_dig);
2242  while (*round_dig == '0' + 10) {
2243  *round_dig = '0';
2244  --round_dig;
2245  ++(*round_dig);
2246  }
2247  if (round_dig < num_start) {
2248  _ASSERT(round_dig == buffer);
2249  num_start = round_dig;
2250  ++digs_pre_dot;
2251  if (digs_pre_dot > max_digits) {
2252  ++suff_idx;
2253  goto try_another_suffix;
2254  }
2255  if (num_end != dot_ptr)
2256  --num_end;
2257  }
// Rounding produced exactly "1024": restart with the next larger suffix.
2258  if (!(flags & fDS_NoDecimalPoint) && digs_pre_dot == 4
2259  && num_start[0] == '1' && num_start[1] == '0'
2260  && num_start[2] == '2' && num_start[3] == '4')
2261  {
2262  ++suff_idx;
2263  goto try_another_suffix;
2264  }
2265 
2266  --suff_idx;
2267  }
2268 
// Assemble the output: optional '+', integer digits (grouped by three when
// fWithCommas is set), then the fraction if there is one.
2269  out_str.erase();
2270  if (flags & fWithSign)
2271  out_str.append(1, '+');
2272  if (!(flags & fWithCommas) || digs_pre_dot <= 3) {
2273  out_str.append(num_start, digs_pre_dot);
2274  }
2275  else {
2276  Uint4 digs_first = digs_pre_dot % 3;
2277  out_str.append(num_start, digs_first);
2278  char* left_ptr = num_start + digs_first;
2279  Uint4 digs_left = digs_pre_dot - digs_first;
2280  while (digs_left != 0) {
2281  out_str.append(1, ',');
2282  out_str.append(left_ptr, 3);
2283  left_ptr += 3;
2284  digs_left -= 3;
2285  }
2286  }
2287  if (num_end != dot_ptr) {
2288  out_str.append(1, '.');
2289  out_str.append(dot_ptr, num_end - dot_ptr);
2290  }
2291 
// Suffix output.
// NOTE(review): the statements on lines 2294 and 2301 are missing, so it
// cannot be told from this listing whether the space is appended
// unconditionally.
2292  if (suff_idx == 0) {
2293  if (flags & fDS_PutBSuffixToo) {
2295  out_str.append(1, ' ');
2296  out_str.append(1, 'B');
2297  }
2298  }
2299  else {
2300  --suff_idx;
2302  out_str.append(1, ' ');
2303  out_str.append(1, s_Suffixes[suff_idx]);
2304  if (!(flags & fDS_ShortSuffix)) {
2305  if (flags & fDS_Binary)
2306  out_str.append(1, 'i');
2307  out_str.append(1, 'B');
2308  }
2309  }
2310  errno = 0;
2312 
2313 
2314 // A maximal double precision used in the double to string conversion
2315 #if defined(NCBI_OS_MSWIN)
2316  const int kMaxDoublePrecision = 200;
2317 #else
2318  const int kMaxDoublePrecision = 308;
2319 #endif
2320 // A maximal size of a double value in a string form.
2321 // Exponent size + sign + dot + ending '\0' + max.precision
2322 const int kMaxDoubleStringSize = 308 + 3 + kMaxDoublePrecision;
2323 
2324 
// Convert a double to a string, storing the result in 'out_str'.
//
// NOTE(review): the second line of the signature (original line 2326) and
// the statement on line 2332 that fills 'buffer' and 'n' in the first branch
// are missing from this listing -- restore them from the repository before
// compiling.
2325 void NStr::DoubleToString(string& out_str, double value,
2327 {
2328  char buffer[kMaxDoubleStringSize]; // includes ending '\0'
2329  int n = 0;
// First branch: an explicit precision was given, or POSIX formatting was
// requested for a special value (zero, infinity, NaN).
2330  if (precision >= 0 ||
2331  ((flags & fDoublePosix) && (!finite(value) || value == 0.))) {
2333  buffer[n] = '\0';
2334  } else {
// No precision given: use the printf() default precision for the requested
// notation ("%g" unless fixed or scientific notation is selected).
2335  const char* format;
2336  switch (flags & fDoubleGeneral) {
2337  case fDoubleFixed:
2338  format = "%f";
2339  break;
2340  case fDoubleScientific:
2341  format = "%e";
2342  break;
2343  case fDoubleGeneral: // default
2344  default:
2345  format = "%g";
2346  break;
2347  }
2348  n = ::snprintf(buffer, kMaxDoubleStringSize, format, value);
// On a formatting error produce an empty string.
2349  if (n < 0) {
2350  buffer[0] = '\0';
2351  }
// snprintf() uses the decimal point of the current locale; POSIX output
// always uses '.'.
2352  if (flags & fDoublePosix) {
2353  struct lconv* conv = localeconv();
2354  if ('.' != *(conv->decimal_point)) {
2355  char* pos = strchr(buffer, *(conv->decimal_point));
2356  if (pos) {
2357  *pos = '.';
2358  }
2359  }
2360  }
2361  }
2362  out_str = buffer;
2363  errno = 0;
2364 }
2365 
2366 
2367 SIZE_TYPE NStr::DoubleToString(double value, unsigned int precision,
2368  char* buf, SIZE_TYPE buf_size,
2369  TNumToStringFlags flags)
2370 {
2371  char buffer[kMaxDoubleStringSize]; // inludes ending '\0'
2372  int n = 0;
2373  if ((flags & fDoublePosix) && (!finite(value) || value == 0.)) {
2374  if (value == 0.) {
2375  double zero = 0.;
2376  if (memcmp(&value, &zero, sizeof(double)) == 0) {
2377  strcpy(buffer, "0");
2378  n = 2;
2379  } else {
2380  strcpy(buffer, "-0");
2381  n = 3;
2382  }
2383  } else if (isnan(value)) {
2384  strcpy(buffer, "NaN");
2385  n = 4;
2386  } else if (value > 0.) {
2387  strcpy(buffer, "INF");
2388  n = 4;
2389  } else {
2390  strcpy(buffer, "-INF");
2391  n = 5;
2392  }
2393  } else {
2394  if (precision > (unsigned int)kMaxDoublePrecision) {
2395  precision = (unsigned int)kMaxDoublePrecision;
2396  }
2397  const char* format;
2398  switch (flags & fDoubleGeneral) {
2399  case fDoubleScientific:
2400  format = "%.*e";
2401  break;
2402  case fDoubleGeneral:
2403  format = "%.*g";
2404  break;
2405  case fDoubleFixed: // default
2406  default:
2407  format = "%.*f";
2408  break;
2409  }
2410  n = ::snprintf(buffer, kMaxDoubleStringSize, format, (int)precision, value);
2411  if (n < 0) {
2412  n = 0;
2413  }
2414  if (flags & fDoublePosix) {
2415  struct lconv* conv = localeconv();
2416  if ('.' != *(conv->decimal_point)) {
2417  char* pos = strchr(buffer, *(conv->decimal_point));
2418  if (pos) {
2419  *pos = '.';
2420  }
2421  }
2422  }
2423  }
2424  SIZE_TYPE n_copy = min((SIZE_TYPE) n, buf_size);
2425  memcpy(buf, buffer, n_copy);
2426  errno = 0;
2427  return n_copy;
2428 }
2429 
2430 
/// Write the decimal representation of 'value' into 'buffer'.
///
/// @param buffer  Destination; must have room for at least 'digits' chars.
/// @param value   Number to print.
/// @param digits  Field width, must be >= 1.  With zeros == true exactly this
///                many characters are written (more significant digits of a
///                longer value are dropped).  With zeros == false the value
///                must not have more than 'digits' decimal digits.
/// @param zeros   true  -- right-align and pad with leading '0';
///                false -- print without padding, left-aligned.
/// @return        Pointer just past the last character written (no
///                terminating zero is stored).
static char* s_ncbi_append_int2str(char* buffer, unsigned int value, size_t digits, bool zeros)
{
    char* buffer_start = buffer;
    // Digits are produced from the last position of the field backwards.
    char* buffer_end = (buffer += digits-1);
    if (zeros) {
        do {
            *buffer-- = (char)('0' + (value % 10));
            value /= 10;
        } while (--digits);
    } else {
        do {
            *buffer-- = (char)('0' + (value % 10));
        } while (value /= 10);

        // The number is shorter than the field: shift it to the field start.
        if (++buffer != buffer_start) {
            memmove(buffer_start, buffer, buffer_end-buffer+1);
            buffer_end -= buffer - buffer_start;
        }
    }
    return ++buffer_end;
}
2452 
2453 
2454 #define __NLG NCBI_CONST_LONGDOUBLE
2455 
2456 SIZE_TYPE NStr::DoubleToString_Ecvt(double val, unsigned int precision,
2457  char* buffer, SIZE_TYPE bufsize,
2458  int* dec, int* sign)
2459 {
2460  //errno = 0;
2461  *dec = *sign = 0;
2462  if (precision==0) {
2463  return 0;
2464  }
2465  if (precision > DBL_DIG) {
2466  precision = DBL_DIG;
2467  }
2468  if (val == 0.) {
2469  double zero = 0.;
2470  if (memcmp(&val, &zero, sizeof(double)) == 0) {
2471  *buffer='0';
2472  return 1;
2473  }
2474  *buffer++='-';
2475  *buffer='0';
2476  *sign = -1;
2477  return 2;
2478  }
2479  *sign = val < 0. ? -1 : 1;
2480  if (*sign < 0) {
2481  val = -val;
2482  }
2483  bool high_precision = precision > 9;
2484 
2485 // calculate exponent
2486  unsigned int exp=0;
2487  bool exp_positive = val >= 1.;
2488  unsigned int first, second=0;
2489  long double mult = __NLG(1.);
2490  long double value = val;
2491 
2492  if (exp_positive) {
2493  while (value>=__NLG(1.e256))
2494  {value*=__NLG(1.e-256); exp+=256;}
2495  if (value >= __NLG(1.e16)) {
2496  if (value>=__NLG(1.e240)) {value*=__NLG(1.e-240); exp+=240;}
2497  else if (value>=__NLG(1.e224)) {value*=__NLG(1.e-224); exp+=224;}
2498  else if (value>=__NLG(1.e208)) {value*=__NLG(1.e-208); exp+=208;}
2499  else if (value>=__NLG(1.e192)) {value*=__NLG(1.e-192); exp+=192;}
2500  else if (value>=__NLG(1.e176)) {value*=__NLG(1.e-176); exp+=176;}
2501  else if (value>=__NLG(1.e160)) {value*=__NLG(1.e-160); exp+=160;}
2502  else if (value>=__NLG(1.e144)) {value*=__NLG(1.e-144); exp+=144;}
2503  else if (value>=__NLG(1.e128)) {value*=__NLG(1.e-128); exp+=128;}
2504  else if (value>=__NLG(1.e112)) {value*=__NLG(1.e-112); exp+=112;}
2505  else if (value>=__NLG(1.e96)) {value*=__NLG(1.e-96); exp+=96;}
2506  else if (value>=__NLG(1.e80)) {value*=__NLG(1.e-80); exp+=80;}
2507  else if (value>=__NLG(1.e64)) {value*=__NLG(1.e-64); exp+=64;}
2508  else if (value>=__NLG(1.e48)) {value*=__NLG(1.e-48); exp+=48;}
2509  else if (value>=__NLG(1.e32)) {value*=__NLG(1.e-32); exp+=32;}
2510  else if (value>=__NLG(1.e16)) {value*=__NLG(1.e-16); exp+=16;}
2511  }
2512  if (value< __NLG(1.)) {mult=__NLG(1.e+9); exp-= 1;}
2513  else if (value< __NLG(10.)) {mult=__NLG(1.e+8); }
2514  else if (value< __NLG(1.e2)) {mult=__NLG(1.e+7); exp+= 1;}
2515  else if (value< __NLG(1.e3)) {mult=__NLG(1.e+6); exp+= 2;}
2516  else if (value< __NLG(1.e4)) {mult=__NLG(1.e+5); exp+= 3;}
2517  else if (value< __NLG(1.e5)) {mult=__NLG(1.e+4); exp+= 4;}
2518  else if (value< __NLG(1.e6)) {mult=__NLG(1.e+3); exp+= 5;}
2519  else if (value< __NLG(1.e7)) {mult=__NLG(1.e+2); exp+= 6;}
2520  else if (value< __NLG(1.e8)) {mult= __NLG(10.); exp+= 7;}
2521  else if (value< __NLG(1.e9)) {mult= __NLG(1.); exp+= 8;}
2522  else if (value<__NLG(1.e10)) {mult= __NLG(0.1); exp+= 9;}
2523  else if (value<__NLG(1.e11)) {mult=__NLG(1.e-2); exp+=10;}
2524  else if (value<__NLG(1.e12)) {mult=__NLG(1.e-3); exp+=11;}
2525  else if (value<__NLG(1.e13)) {mult=__NLG(1.e-4); exp+=12;}
2526  else if (value<__NLG(1.e14)) {mult=__NLG(1.e-5); exp+=13;}
2527  else if (value<__NLG(1.e15)) {mult=__NLG(1.e-6); exp+=14;}
2528  else if (value<__NLG(1.e16)) {mult=__NLG(1.e-7); exp+=15;}
2529  else {mult=__NLG(1.e-8); exp+=16;}
2530  } else {
2531  while (value<=__NLG(1.e-256))
2532  {value*=__NLG(1.e256); exp+=256;}
2533  if (value <= __NLG(1.e-16)) {
2534  if (value<=__NLG(1.e-240)) {value*=__NLG(1.e240); exp+=240;}
2535  else if (value<=__NLG(1.e-224)) {value*=__NLG(1.e224); exp+=224;}
2536  else if (value<=__NLG(1.e-208)) {value*=__NLG(1.e208); exp+=208;}
2537  else if (value<=__NLG(1.e-192)) {value*=__NLG(1.e192); exp+=192;}
2538  else if (value<=__NLG(1.e-176)) {value*=__NLG(1.e176); exp+=176;}
2539  else if (value<=__NLG(1.e-160)) {value*=__NLG(1.e160); exp+=160;}
2540  else if (value<=__NLG(1.e-144)) {value*=__NLG(1.e144); exp+=144;}
2541  else if (value<=__NLG(1.e-128)) {value*=__NLG(1.e128); exp+=128;}
2542  else if (value<=__NLG(1.e-112)) {value*=__NLG(1.e112); exp+=112;}
2543  else if (value<=__NLG(1.e-96)) {value*=__NLG(1.e96); exp+=96;}
2544  else if (value<=__NLG(1.e-80)) {value*=__NLG(1.e80); exp+=80;}
2545  else if (value<=__NLG(1.e-64)) {value*=__NLG(1.e64); exp+=64;}
2546  else if (value<=__NLG(1.e-48)) {value*=__NLG(1.e48); exp+=48;}
2547  else if (value<=__NLG(1.e-32)) {value*=__NLG(1.e32); exp+=32;}
2548  else if (value<=__NLG(1.e-16)) {value*=__NLG(1.e16); exp+=16;}
2549  }
2550  if (value<__NLG(1.e-15)) {mult=__NLG(1.e24); exp+=16;}
2551  else if (value<__NLG(1.e-14)) {mult=__NLG(1.e23); exp+=15;}
2552  else if (value<__NLG(1.e-13)) {mult=__NLG(1.e22); exp+=14;}
2553  else if (value<__NLG(1.e-12)) {mult=__NLG(1.e21); exp+=13;}
2554  else if (value<__NLG(1.e-11)) {mult=__NLG(1.e20); exp+=12;}
2555  else if (value<__NLG(1.e-10)) {mult=__NLG(1.e19); exp+=11;}
2556  else if (value<__NLG(1.e-9)) {mult=__NLG(1.e18); exp+=10;}
2557  else if (value<__NLG(1.e-8)) {mult=__NLG(1.e17); exp+=9;}
2558  else if (value<__NLG(1.e-7)) {mult=__NLG(1.e16); exp+=8;}
2559  else if (value<__NLG(1.e-6)) {mult=__NLG(1.e15); exp+=7;}
2560  else if (value<__NLG(1.e-5)) {mult=__NLG(1.e14); exp+=6;}
2561  else if (value<__NLG(1.e-4)) {mult=__NLG(1.e13); exp+=5;}
2562  else if (value<__NLG(1.e-3)) {mult=__NLG(1.e12); exp+=4;}
2563  else if (value<__NLG(1.e-2)) {mult=__NLG(1.e11); exp+=3;}
2564  else if (value<__NLG(1.e-1)) {mult=__NLG(1.e10); exp+=2;}
2565  else if (value<__NLG(1.)) {mult=__NLG(1.e9); exp+=1;}
2566  else {mult=__NLG(1.e8); }
2567  }
2568 
2569 // get all digits
2570  long double t1 = value * mult;
2571  if (t1 >= __NLG(1.e9)) {
2572  first = 999999999;
2573  } else if (t1 < __NLG(1.e8)) {
2574  first = 100000000;
2575  t1 = first;
2576  } else {
2577  first = (unsigned int)t1;
2578  }
2579  if (high_precision) {
2580  long double t2 = (t1-first) * __NLG(1.e8);
2581  if (t2 >= __NLG(1.e8)) {
2582  second = 99999999;
2583  } else {
2584  second = (unsigned int)t2;
2585  }
2586  }
2587 
2588 // convert them into string
2589  bool use_ext_buffer = bufsize > 20;
2590  char tmp[32];
2591  char *digits = use_ext_buffer ? buffer : tmp;
2592  char *digits_end = s_ncbi_append_int2str(digits,first,9,false);
2593  if (high_precision) {
2594  digits_end = s_ncbi_append_int2str(digits_end,second,8,true);
2595  }
2596  size_t digits_len = digits_end - digits;
2597  size_t digits_got = digits_len;
2598  size_t digits_expected = high_precision ? 17 : 9;
2599 
2600 // get significant digits according to requested precision
2601  size_t pos = precision;
2602  if (digits_len > precision) {
2603  digits_len = precision;
2604 
2605  // this is questionable, but in fact,
2606  // improves the result (on average)
2607 #if 1
2608  if (high_precision) {
2609  if (digits[pos] == '4') {
2610  size_t pt = pos-1;
2611  while (pt != 0 && digits[--pt] == '9')
2612  ;
2613  if (pt != 0 && (pos-pt) > precision/2)
2614  digits[pos]='5';
2615  } else if (digits[pos] == '5') {
2616  size_t pt = pos;
2617  while (pt != 0 && digits[--pt] == '0')
2618  ;
2619  if (pt != 0 && (pos-pt) > precision/2)
2620  digits[pos]='4';
2621  }
2622  }
2623 #endif
2624 
2625  if (digits[pos] >= '5') {
2626  do {
2627  if (digits[--pos] < '9') {
2628  ++digits[pos++];
2629  break;
2630  }
2631  digits[pos]='0';
2632  } while (pos > 0);
2633  if (pos == 0) {
2634  if (digits_expected <= digits_got) {
2635  if (exp_positive) {
2636  ++exp;
2637  } else {
2638 // exp cannot be 0, by design
2639  exp_positive = --exp == 0;
2640  }
2641  }
2642  *digits = '1';
2643  digits_len = 1;
2644  }
2645  }
2646  }
2647 
2648 // truncate trailing zeros
2649  for (pos = digits_len; pos-- > 0 && digits[pos] == '0';)
2650  --digits_len;
2651 
2652  *dec = exp_positive ? int(exp) : -int(exp);
2653 
2654  if (!use_ext_buffer) {
2655  if (digits_len <= bufsize) {
2656  memcpy(buffer, digits, digits_len);
2657  } else {
2658  NCBI_THROW2(CStringException, eConvert,
2659  "Destination buffer too small", 0);
2660  }
2661  }
2662  return digits_len;
2663 }
2664 #undef __NLG
2665 
2666 
2667 SIZE_TYPE NStr::DoubleToStringPosix(double val, unsigned int precision,
2668  char* buffer, SIZE_TYPE bufsize)
2669 {
2670  if (bufsize < precision+8) {
2671  NCBI_THROW2(CStringException, eConvert,
2672  "Destination buffer too small", 0);
2673  }
2674  int dec=0, sign=0;
2675  char digits[32];
2676  size_t digits_len = DoubleToString_Ecvt(
2677  val, precision, digits, sizeof(digits), &dec, &sign);
2678  if (digits_len == 0) {
2679  errno = 0;
2680  return 0;
2681  }
2682  if (val == 0.) {
2683  strncpy(buffer,digits, digits_len);
2684  return digits_len;
2685  }
2686  if (digits_len == 1 && dec == 0 && sign >=0) {
2687  *buffer = digits[0];
2688  errno = 0;
2689  return 1;
2690  }
2691  bool exp_positive = dec >= 0;
2692  unsigned int exp= (unsigned int)(exp_positive ? dec : (-dec));
2693 
2694  // assemble the result
2695  char *buffer_pos = buffer;
2696 // char *buffer_end = buffer + bufsize;
2697  char *digits_pos = digits;
2698 
2699  if (sign < 0) {
2700  *buffer_pos++ = '-';
2701  }
2702  // The 'e' format is used when the exponent of the value is less than -4
2703  // or greater than or equal to the precision argument
2704  if ((exp_positive && exp >= precision) || (!exp_positive && exp > 4)) {
2705  *buffer_pos++ = *digits_pos++;
2706  --digits_len;
2707  if (digits_len != 0) {
2708  *buffer_pos++ = '.';
2709  strncpy(buffer_pos,digits_pos,digits_len);
2710  buffer_pos += digits_len;
2711  }
2712  *buffer_pos++ = 'e';
2713  *buffer_pos++ = exp_positive ? '+' : '-';
2714 
2715 //#if defined(NCBI_OS_MSWIN)
2716 #if NCBI_COMPILER_MSVC && _MSC_VER < 1900
2717  bool need_zeros = true;
2718  size_t need_digits = 3;
2719 #else
2720  bool need_zeros = exp < 10 ? true : false;
2721  size_t need_digits = exp < 100 ? 2 : 3;
2722 #endif
2723  // assuming exp < 1000
2724  buffer_pos = s_ncbi_append_int2str(buffer_pos, exp, need_digits,need_zeros);
2725  } else if (exp_positive) {
2726  *buffer_pos++ = *digits_pos++;
2727  --digits_len;
2728  if (digits_len > exp) {
2729  strncpy(buffer_pos,digits_pos,exp);
2730  buffer_pos += exp;
2731  *buffer_pos++ = '.';
2732  strncpy(buffer_pos,digits_pos+exp,digits_len-exp);
2733  buffer_pos += digits_len-exp;
2734  } else {
2735  strncpy(buffer_pos,digits_pos,digits_len);
2736  buffer_pos += digits_len;
2737  exp -= (unsigned int)digits_len;
2738  while (exp--) {
2739  *buffer_pos++ = '0';
2740  }
2741  }
2742  } else {
2743  *buffer_pos++ = '0';
2744  *buffer_pos++ = '.';
2745  for (--exp; exp--;) {
2746  *buffer_pos++ = '0';
2747  }
2748  strncpy(buffer_pos,digits_pos, digits_len);
2749  buffer_pos += digits_len;
2750  }
2751  errno = 0;
2752  return buffer_pos - buffer;
2753 }
2754 
2755 
2756 string NStr::SizetToString(size_t value, TNumToStringFlags flags, int base)
2757 {
2758 #if (SIZEOF_SIZE_T > 4)
2759  return UInt8ToString(value, flags, base);
2760 #else
2761  return UIntToString(static_cast<unsigned int>(value), flags, base);
2762 #endif
2763 }
2764 
2765 
2766 string NStr::PtrToString(const void* value)
2767 {
2768  errno = 0;
2769  const int kBufSize = 64;
2770  char buffer[kBufSize];
2771  ::snprintf(buffer, kBufSize, "%p", value);
2772  return buffer;
2773 }
2774 
2775 
2776 void NStr::PtrToString(string& out_str, const void* value)
2777 {
2778  errno = 0;
2779  const int kBufSize = 64;
2780  char buffer[kBufSize];
2781  ::snprintf(buffer, kBufSize, "%p", value);
2782  out_str = buffer;
2783 }
2784 
2785 
2786 const void* NStr::StringToPtr(const CTempStringEx str, TStringToNumFlags flags)
2787 {
2788  errno = 0;
2789  void *ptr = NULL;
2790  int res;
2791  if ( str.HasZeroAtEnd() ) {
2792  res = ::sscanf(str.data(), "%p", &ptr);
2793  } else {
2794  res = ::sscanf(string(str).c_str(), "%p", &ptr);
2795  }
2796  if (res != 1) {
2797  if (flags & fConvErr_NoErrMessage) {
2798  CNcbiError::SetErrno(errno = EINVAL);
2799  } else {
2800  CNcbiError::SetErrno(errno = EINVAL, str);
2801  }
2802  return NULL;
2803  }
2804  return ptr;
// Textual spellings of boolean values.  The "true"/"false" pair is what
// BoolToString() returns; "on" is matched case-insensitively when parsing.
// The remaining spellings are presumably further accepted parser inputs --
// their uses are not visible here, confirm.
2808 static const char* s_kTrueString = "true";
2809 static const char* s_kFalseString = "false";
2810 static const char* s_kTString = "t";
2811 static const char* s_kFString = "f";
2812 static const char* s_kYesString = "yes";
2813 static const char* s_kNoString = "no";
2814 static const char* s_kYString = "y";
2815 static const char* s_kNString = "n";
2816 static const char* s_kOnString = "on";
2817 static const char* s_kOffString = "off";
2818 
2819 
2820 const string NStr::BoolToString(bool value)
2822  return value ? s_kTrueString : s_kFalseString;
2823 }
2824 
2825 
2827 {
// NOTE(review): the signature of this definition and several lines of both
// conditions below are absent from this listing; the comments describe only
// what is visible.
// Accepts "1" (exact match) and "on" (case-insensitive) as true.
2828  if ( str == "1" ||
2833  AStrEquiv(str, s_kOnString, PNocase()) ) {
2834  errno = 0;
2835  return true;
2836  }
// Accepts "0" (exact match) as false; the other alternatives are not visible.
2837  if ( str == "0" ||
2843  errno = 0;
2844  return false;
2845  }
// Remaining arguments of what is presumably an exception throw for
// unrecognized input -- the statement's first line is missing, confirm.
2847  "String cannot be converted to bool", 0);
2848 }
2849 
2850 
2851 string NStr::FormatVarargs(const char* format, va_list args)
2852 {
2853 #ifdef HAVE_VASPRINTF
2854  char* s;
2855  int n = vasprintf(&s, format, args);
2856  if (n >= 0) {
2857  string str(s, n);
2858  free(s);
2859  return str;
2860  } else {
2861  return kEmptyStr;
2862  }
2863 
2864 #elif defined(HAVE_VSNPRINTF)
2865  // deal with implementation quirks
2866  SIZE_TYPE size = 1024;
2868  buf.get()[size-1] = buf.get()[size-2] = 0;
2869  SIZE_TYPE n = vsnprintf(buf.get(), size, format, args);
2870  while (n >= size || buf.get()[size-2]) {
2871  if (buf.get()[size-1]) {
2872  ERR_POST_X(1, Warning << "Buffer overrun by buggy vsnprintf");
2873  }
2874  size = max(size << 1, n);
2875  buf.reset(new char[size]);
2876  buf.get()[size-1] = buf.get()[size-2] = 0;
2877  n = vsnprintf(buf.get(), size, format, args);
2878  }
2879  return (n > 0) ? string(buf.get(), n) : kEmptyStr;
2880 
2881 #elif defined(HAVE_VPRINTF)
2882  char buf[1024];
2883  buf[sizeof(buf) - 1] = 0;
2884  vsprintf(buf, format, args);
2885  if (buf[sizeof(buf) - 1]) {
2886  ERR_POST_X(2, Warning << "Buffer overrun by vsprintf");
2887  }
2888  return buf;
2889 
2890 #else
2891 # error Please port this code to your system.
2892 #endif
2893 }
2894 
2895 
2897  const CTempString pattern,
2898  ECase use_case,
2899  EDirection direction,
2900  SIZE_TYPE occurence)
2901 {
2902  const SIZE_TYPE slen = str.length();
2903  const SIZE_TYPE plen = pattern.length();
2904  SIZE_TYPE current_occurence = 0;
2905  SIZE_TYPE pos = 0;
2906  SIZE_TYPE current_pos = 0; // saved position of last search
2907  SIZE_TYPE search_pos = 0; // next search position
2908 
2909  if (plen > slen) {
2910  return NPOS;
2911  }
2912 
2913  if (use_case == eCase) {
2914 
2915  if (direction == eForwardSearch) {
2916  do {
2917  pos = str.find(pattern, search_pos);
2918  if (pos == NPOS) {
2919  return NPOS;
2920  }
2921  current_pos = pos;
2922  search_pos = pos + plen;
2923  ++current_occurence;
2924  }
2925  while (current_occurence <= occurence);
2926 
2927  } else {
2928  _ASSERT(direction == eReverseSearch);
2929  search_pos = slen - plen;
2930  do {
2931  pos = str.rfind(pattern, search_pos);
2932  if (pos == NPOS) {
2933  return NPOS;
2934  }
2935  current_pos = pos;
2936  search_pos = (pos < plen) ? 0 : pos - plen;
2937  ++current_occurence;
2938  }
2939  while (current_occurence <= occurence);
2940  }
2941 
2942  } else {
2943  _ASSERT(use_case == eNocase);
2944 
2945  // A set of lower/upper characters for pattern[0].
2946  string x_first(pattern, 0, 1);
2947  if (isupper((unsigned char)x_first[0])) {
2948  x_first += (char)tolower((unsigned char)x_first[0]);
2949  } else if (islower((unsigned char)x_first[0])) {
2950  x_first += (char)toupper((unsigned char)x_first[0]);
2951  }
2952 
2953  if (direction == eForwardSearch) {
2954  do {
2955  pos = str.find_first_of(x_first, search_pos);
2956  while (pos != NPOS) {
2957  if ( (pos + plen) > slen ) {
2958  return NPOS;
2959  }
2960  if ( CompareNocase(str, pos, plen, pattern) == 0 ) {
2961  break;
2962  }
2963  pos = str.find_first_of(x_first, pos + 1);
2964  }
2965  if (pos > slen) {
2966  return NPOS;
2967  }
2968  current_pos = pos;
2969  search_pos = pos + plen;
2970  ++current_occurence;
2971  }
2972  while (current_occurence <= occurence);
2973 
2974  } else {
2975  _ASSERT(direction == eReverseSearch);
2976  search_pos = slen - plen;
2977  do {
2978  pos = str.find_last_of(x_first, search_pos);
2979  while (pos != NPOS && pos
2980  && CompareNocase(str, pos, plen, pattern) != 0) {
2981  if (pos == 0) {
2982  return NPOS;
2983  }
2984  pos = str.find_last_of(x_first, pos - 1);
2985  }
2986  current_pos = pos;
2987  search_pos = (pos < plen) ? 0 : pos - plen;
2988  ++current_occurence;
2989  }
2990  while (current_occurence <= occurence);
2991  }
2992  }
2993  return current_pos;
2994 }
2995 
2996 
2997 // @deprecated
2999  SIZE_TYPE start, SIZE_TYPE end, EOccurrence where)
3000 {
// NOTE(review): the first line of this signature is absent from this listing.
// Case-insensitive search for 'pattern' in 'str', limited by 'start'/'end'.
// 'pat' holds the pattern's first character in both cases (when it is a
// letter) and is used to locate candidate positions.
3001  string pat(pattern, 0, 1);
3002  SIZE_TYPE l = pattern.size();
3003  if (isupper((unsigned char) pat[0])) {
3004  pat += (char) tolower((unsigned char) pat[0]);
3005  } else if (islower((unsigned char) pat[0])) {
3006  pat += (char) toupper((unsigned char) pat[0]);
3007  }
3008 
3009  if (where == eFirst) {
// Walk candidates forward until one compares equal or no longer fits
// before 'end'.
// NOTE(review): when the loop stops because pos + l > end, 'pos' may still
// be <= end and is returned without a successful comparison -- verify.
3010  SIZE_TYPE pos = str.find_first_of(pat, start);
3011  while (pos != NPOS && (pos + l) <= end
3012  && CompareNocase(str, pos, l, pattern) != 0) {
3013  pos = str.find_first_of(pat, pos + 1);
3014  }
3015  return pos > end ? NPOS : pos;
3016 
3017  } else { // eLast
// Walk candidates backward from 'end' until one compares equal or the
// position drops below 'start'.
3018  SIZE_TYPE pos = str.find_last_of(pat, end);
3019  while (pos != NPOS && pos >= start
3020  && CompareNocase(str, pos, l, pattern) != 0) {
3021  if (pos == 0) {
3022  return NPOS;
3023  }
3024  pos = str.find_last_of(pat, pos - 1);
3025  }
3026  return pos < start ? NPOS : pos;
3027  }
3028 }
3029 
3030 
3031 const string* NStr::Find(const list <string>& lst, const CTempString val,
3032  ECase use_case)
3033 {
3034  if (lst.empty()) return NULL;
3035  ITERATE (list<string>, st_itr, lst) {
3036  if (Equal(*st_itr, val, use_case)) {
3037  return &*st_itr;
3038  }
3039  }
3040  return NULL;
3041 }
3042 
3043 const string* NStr::Find(const vector <string>& vec, const CTempString val,
3044  ECase use_case)
3045 {
3046  if (vec.empty()) return NULL;
3047  ITERATE (vector<string>, st_itr, vec) {
3048  if (Equal(*st_itr, val, use_case)) {
3049  return &*st_itr;
3050  }
3051  }
3052  return NULL;
3053 }
3055 
/// @internal
/// Tell whether 'ch' is a word boundary character, i.e. one that does not
/// match [a-zA-Z0-9_].
static inline
bool s_IsWordBoundaryChar(char ch)
{
    // Go through unsigned char: isalnum() is undefined for negative values
    return !(ch == '_'  ||  isalnum((unsigned char)ch));
}
3063 
3064 
3066  const CTempString word,
3067  ECase use_case,
3068  EDirection direction)
3069 {
// NOTE(review): the first line of this signature is absent from this listing.
// Finds an occurrence of 'word' in 'str' that is delimited on both sides by
// the string edge or a word boundary character; returns its position in
// 'str', or NPOS.
3070  const SIZE_TYPE slen = str.length();
3071  const SIZE_TYPE plen = word.length();
3072 
// [start, end) is the window of 'str' still to be searched
3073  SIZE_TYPE start = 0;
3074  SIZE_TYPE end = slen;
3075 
3076  SIZE_TYPE pos = Find(str, word, use_case, direction);
3077 
3078  while (pos != NPOS) {
3079  // Check word boundaries
3080  if ( ((pos == 0) || s_IsWordBoundaryChar(str[pos-1])) &&
3081  ((pos + plen == slen) || s_IsWordBoundaryChar(str[pos+plen])) ) {
3082  return pos;
3083  }
3084  // Find next occurrence
// NOTE(review): the window shrinks by one character per iteration rather
// than jumping past the rejected match, so a match far from the window edge
// is found again on each pass -- looks quadratic, confirm whether intended.
3085  if (direction == eForwardSearch) {
3086  if (pos + plen == slen) {
3087  return NPOS;
3088  }
3089  ++start;
3090  } else {
3091  if (pos == 0) {
3092  return NPOS;
3093  }
3094  --end;
3095  }
3096  pos = Find(CTempString(str, start, end - start), word, use_case, direction);
3097  if (pos != NPOS) {
3098  // update position: from start of the string "str"
3099  pos += start;
3100  }
3101  }
3102  return pos;
3103 }
3104 
3105 
3107 {
// NOTE(review): the signature of this definition is absent from this listing.
// Returns the length of the longest suffix of 's1' that is also a prefix
// of 's2' (0 when either string is empty).
3108  const SIZE_TYPE len1 = s1.length();
3109  const SIZE_TYPE len2 = s2.length();
3110 
3111  // Eliminate the null case
3112  if (len1 == 0 || len2 == 0) {
3113  return 0;
3114  }
3115  SIZE_TYPE len = min(len1, len2);
3116 
3117  // Truncate the longer string
// After this, t1 is the last 'len' characters of s1 and t2 the first 'len'
// characters of s2.
3118  CTempString t1, t2;
3119  if (len1 > len2) {
3120  t1 = s1.substr(len1-len, len);
3121  t2 = s2;
3122  } else {
3123  t1 = s1;
3124  t2 = s2.substr(0, len);
3125  }
3126  // Quick check for the worst case
3127  if (memcmp(t1.data(), t2.data(), len) == 0) {
3128  return len;
3129  }
3130 
3131  // Start by looking for a single character match
3132  // and increase length until no match is found.
3133  // Performance analysis: http://neil.fraser.name/news/2010/11/04/
3134  SIZE_TYPE best = 0;
3135  SIZE_TYPE n = 1;
3136  for (;;) {
3137  // Right 'n' symbols of 't1'
3138  CTempString pattern(t1.data() + len - n, n);
3139  SIZE_TYPE pos = t2.find(pattern);
3140  if (pos == NPOS) {
3141  return best;
3142  }
// The tail was found 'pos' characters into t2, so an overlap has to be at
// least that much longer; record it when the extended tail of t1 equals the
// head of t2.
3143  n += pos;
3144  if (pos == 0 || memcmp(t1.data() + len - n, t2.data(), n) == 0) {
3145  best = n;
3146  n++;
3147  }
3148  }
3149  // Unreachable
3150  return best;
3151 }
3152 
3153 
3154 template <class TStr>
3155 TStr s_TruncateSpaces(const TStr& str, NStr::ETrunc where,
3156  const TStr& empty_str)
3157 {
3158  SIZE_TYPE length = str.length();
3159  if (length == 0) {
3160  return empty_str;
3161  }
3162  SIZE_TYPE beg = 0;
3163  if (where == NStr::eTrunc_Begin || where == NStr::eTrunc_Both) {
3164  _ASSERT(beg < length);
3165  while ( isspace((unsigned char) str[beg]) ) {
3166  if (++beg == length) {
3167  return empty_str;
3168  }
3169  }
3170  }
3171  SIZE_TYPE end = length;
3172  if ( where == NStr::eTrunc_End || where == NStr::eTrunc_Both ) {
3173  _ASSERT(beg < end);
3174  while (isspace((unsigned char) str[--end])) {
3175  if (beg == end) {
3176  return empty_str;
3177  }
3178  }
3179  _ASSERT(beg <= end && !isspace((unsigned char) str[end]));
3180  ++end;
3181  }
3182  _ASSERT(beg < end && end <= length);
3183  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3184  return str.substr(beg, end - beg);
3185  }
3186  else {
3187  return str;
3188  }
3189 }
3190 
3191 string NStr::TruncateSpaces(const string& str, ETrunc where)
3192 {
3193  return s_TruncateSpaces(str, where, kEmptyStr);
3194 }
3195 
3197 {
// NOTE(review): the signature of this definition is absent from this listing.
// Forwards to s_TruncateSpaces(), with a default-constructed CTempString
// as the result for empty or all-whitespace input.
3198  return s_TruncateSpaces(str, where, CTempString());
3199 }
3200 
3202 {
// NOTE(review): the signature of this definition is absent from this listing.
// Replaces 'str' with its trimmed form as computed by s_TruncateSpaces().
3203  str = s_TruncateSpaces(str, where, CTempString());
3204 }
3205 
3206 void NStr::TruncateSpacesInPlace(string& str, ETrunc where)
3207 {
3208  SIZE_TYPE length = str.length();
3209  if (length == 0) {
3210  return;
3211  }
3212  SIZE_TYPE beg = 0;
3213  if ( where == eTrunc_Begin || where == eTrunc_Both ) {
3214  // It's better to use str.data()[] to check string characters
3215  // to avoid implicit modification of the string by non-const operator[]
3216  _ASSERT(beg < length);
3217  while ( isspace((unsigned char) str.data()[beg]) ) {
3218  if (++beg == length) {
3219  str.erase();
3220  return;
3221  }
3222  }
3223  }
3224 
3225  SIZE_TYPE end = length;
3226  if ( where == eTrunc_End || where == eTrunc_Both ) {
3227  // It's better to use str.data()[] to check string characters
3228  // to avoid implicit modification of the string by non-const operator[]
3229  _ASSERT(beg < end);
3230  while (isspace((unsigned char) str.data()[--end])) {
3231  if (beg == end) {
3232  str.erase();
3233  return;
3234  }
3235  }
3236  _ASSERT(beg <= end && !isspace((unsigned char) str.data()[end]));
3237  ++end;
3238  }
3239  _ASSERT(beg < end && end <= length);
3240 
3241  if ( beg | (end - length) ) { // if either beg != 0 or end != length
3242  str.replace(0, length, str, beg, end - beg);
3243  }
3244 }
3245 
3246 
3247 void NStr::TrimPrefixInPlace(string& str, const CTempString prefix,
3248  ECase use_case)
3249 {
3250  if (!str.length() ||
3251  !prefix.length() ||
3252  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3253  return;
3254  }
3255  str.erase(0, prefix.length());
3256 }
3257 
3258 
3260  ECase use_case)
3261 {
// NOTE(review): the first line of this signature is absent from this listing.
// If 'str' starts with 'prefix', re-points 'str' past it; otherwise, or when
// either one is empty, 'str' is left untouched.
3262  if (!str.length() ||
3263  !prefix.length() ||
3264  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3265  return;
3266  }
3267  str.assign(str.data() + prefix.length(), str.length() - prefix.length());
3268 }
3269 
3270 
3272  ECase use_case)
3273 {
// NOTE(review): the first line of this signature is absent from this listing.
// Returns 'str' unchanged unless it starts with 'prefix'; in that case the
// result is a CTempString over the same character data, starting just past
// the prefix.
3274  if (!str.length() ||
3275  !prefix.length() ||
3276  !Equal(str, 0, prefix.length(), prefix, use_case)) {
3277  return str;
3278  }
3279  return CTempString(str.data() + prefix.length(), str.length() - prefix.length());
3280 }
3281 
3282 
3283 void NStr::TrimSuffixInPlace(string& str, const CTempString suffix,
3284  ECase use_case)
3285 {
3286  if (!str.length() ||
3287  !suffix.length() ||
3288  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3289  return;
3290  }
3291  str.erase(str.length() - suffix.length());
3292 }
3293 
3294 
3296  ECase use_case)
3297 {
// NOTE(review): the first line of this signature is absent from this listing.
// If 'str' ends with 'suffix', cuts it off; otherwise, or when either one is
// empty, 'str' is left untouched.
// NOTE(review): str.length() - suffix.length() wraps when the suffix is the
// longer one; this presumably relies on Equal() rejecting an out-of-range
// position -- confirm.
3298  if (!str.length() ||
3299  !suffix.length() ||
3300  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3301  return;
3302  }
3303  str.erase(str.length() - suffix.length());
3304 }
3305 
3306 
3308  ECase use_case)
3309 {
// NOTE(review): the first line of this signature is absent from this listing.
// Returns 'str' unchanged unless it ends with 'suffix'; in that case the
// result is a CTempString over the same character data, shortened by the
// suffix length.
// NOTE(review): str.length() - suffix.length() wraps when the suffix is the
// longer one; this presumably relies on Equal() rejecting an out-of-range
// position -- confirm.
3310  if (!str.length() ||
3311  !suffix.length() ||
3312  !Equal(str, str.length() - suffix.length(), suffix.length(), suffix, use_case)) {
3313  return str;
3314  }
3315  return CTempString(str.data(), str.length() - suffix.length());
3316 }
3317 
3318 
3319 string& NStr::Replace(const string& src,
3320  const string& search, const string& replace,
3321  string& dst, SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3322  SIZE_TYPE* num_replace)
3323 {
3324  // source and destination should not be the same
3325  if (&src == &dst) {
3326  NCBI_THROW2(CStringException, eBadArgs,
3327  "NStr::Replace(): source and destination are the same", 0);
3328  }
3329  if (num_replace)
3330  *num_replace = 0;
3331  if (start_pos + search.size() > src.size() || search == replace) {
3332  dst = src;
3333  return dst;
3334  }
3335 
3336  // Use different algorithms depending on size or 'search' and 'replace'
3337  // for better performance (and for big strings only! > 16KB).
3338 
3339  if (replace.size() > search.size() && src.size() > 16*1024) {
3340  // Replacing string is longer -- worst case.
3341  // Try to avoid memory reallocations inside std::string.
3342  // Count replacing strings first
3343  SIZE_TYPE n = 0;
3344  SIZE_TYPE start_orig = start_pos;
3345  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3346  start_pos = src.find(search, start_pos);
3347  if (start_pos == NPOS)
3348  break;
3349  n++;
3350  start_pos += search.size();
3351  }
3352  // Reallocate memory for destination string
3353  dst.resize(src.size() - n*search.size() + n*replace.size());
3354 
3355  // Use copy() to create destination string
3356  start_pos = start_orig;
3357  string::const_iterator src_start = src.begin();
3358  string::const_iterator src_end = src.begin();
3359  string::iterator dst_pos = dst.begin();
3360 
3361  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3362  start_pos = src.find(search, start_pos);
3363  if (start_pos == NPOS)
3364  break;
3365  // Copy from source string up to 'search'
3366  src_end = src.begin() + start_pos;
3367  copy(src_start, src_end, dst_pos);
3368  dst_pos += (src_end - src_start);
3369  // Append 'replace'
3370  copy(replace.begin(), replace.end(), dst_pos);
3371  dst_pos += replace.size();
3372  start_pos += search.size();
3373  src_start = src.begin() + start_pos;
3374  }
3375  // Copy source's string tail to the place
3376  copy(src_start, src.end(), dst_pos);
3377  if (num_replace)
3378  *num_replace = n;
3379 
3380  } else {
3381  // Replacing string is shorter or have the same length.
3382  // ReplaceInPlace() can be faster on some platform, but not much,
3383  // so we use regular algorithm even for equal lengths here.
3384  dst = src;
3385  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3386  start_pos = dst.find(search, start_pos);
3387  if (start_pos == NPOS)
3388  break;
3389  dst.replace(start_pos, search.size(), replace);
3390  start_pos += replace.size();
3391  if (num_replace)
3392  (*num_replace)++;
3393  }
3394  }
3395  return dst;
3396 }
3397 
3398 
3399 string NStr::Replace(const string& src,
3400  const string& search, const string& replace,
3401  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3402  SIZE_TYPE* num_replace)
3403 {
3404  string dst;
3405  Replace(src, search, replace, dst, start_pos, max_replace, num_replace);
3406  return dst;
3407 }
3408 
3409 
3410 string& NStr::ReplaceInPlace(string& src,
3411  const string& search, const string& replace,
3412  SIZE_TYPE start_pos, SIZE_TYPE max_replace,
3413  SIZE_TYPE* num_replace)
3414 {
3415  if ( num_replace )
3416  *num_replace = 0;
3417  if ( start_pos + search.size() > src.size() || search == replace )
3418  return src;
3419 
3420  bool equal_len = (search.size() == replace.size());
3421  for (SIZE_TYPE count = 0; !(max_replace && count >= max_replace); count++){
3422  start_pos = src.find(search, start_pos);
3423  if (start_pos == NPOS)
3424  break;
3425  // On some platforms string's replace() implementation
3426  // is not optimal if size of search and replace strings are equal
3427  if ( equal_len ) {
3428  copy(replace.begin(), replace.end(), src.begin() + start_pos);
3429  } else {
3430  src.replace(start_pos, search.size(), replace);
3431  }
3432  start_pos += replace.size();
3433  if (num_replace)
3434  (*num_replace)++;
3435  }
3436  return src;
3437 }
3438 
3439 
// Shared implementation behind the splitting wrappers: tokenizes 'str' by
// 'delim' into 'arr' and returns 'arr'.
//   token_pos - optional vector handed to the tokenizer through a
//               position adapter.
//   storage   - optional storage object passed to the tokenizer.
// NOTE(review): the typedef that defines TReserve is absent from this
// listing.
3440 template<typename TString, typename TContainer>
3441 TContainer& s_Split(const TString& str, const TString& delim,
3442  TContainer& arr, NStr::TSplitFlags flags,
3443  vector<SIZE_TYPE>* token_pos,
3444  CTempString_Storage* storage = NULL)
3445 {
3446  typedef CStrTokenPosAdapter<vector<SIZE_TYPE> > TPosArray;
3448  typedef CStrTokenize<TString, TContainer, TPosArray,
3449  CStrDummyTokenCount, TReserve> TSplitter;
3450 
3451  TPosArray token_pos_proxy(token_pos);
3452  TSplitter splitter(str, delim, flags, storage);
3453  splitter.Do(arr, token_pos_proxy, kEmptyStr);
3454  return arr;
3455 }
3456 
// Argument check for split functions: throws CStringException (eBadArgs)
// when 'flags' contains fSplit_CanEscape or fSplit_CanQuote while 'storage'
// is NULL.  Expects variables named 'flags' and 'storage' in the expanding
// scope; 'where' is the function name inserted into the error message.
3457 #define CHECK_SPLIT_TEMPSTRING_FLAGS(where) \
3458  { \
3459  if ((flags & (NStr::fSplit_CanEscape | NStr::fSplit_CanQuote)) && !storage) { \
3460  NCBI_THROW2(CStringException, eBadArgs, \
3461  "NStr::" #where "(): the selected flags require non-NULL storage", 0); \
3462  } \
3463 }
3464 
3465 
3466 list<string>& NStr::Split(const CTempString str, const CTempString delim,
3467  list<string>& arr, TSplitFlags flags,
3468  vector<SIZE_TYPE>* token_pos)
3469 {
3470  return s_Split(str, delim, arr, flags, token_pos);
3471 }
3472 
3473 vector<string>& NStr::Split(const CTempString str, const CTempString delim,
3474  vector<string>& arr, TSplitFlags flags,
3475  vector<SIZE_TYPE>* token_pos)
3476 {
3477  return s_Split(str, delim, arr, flags, token_pos);
3478 }
3479 
3480 list<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3481  list<CTempString>& arr, TSplitFlags flags,
3482  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3485  return s_Split(str, delim, arr, flags, token_pos, storage);
3486 }
3487 
3488 vector<CTempString>& NStr::Split(const CTempString str, const CTempString delim,
3489  vector<CTempString>& arr, TSplitFlags flags,
3490  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3493  return s_Split(str, delim, arr, flags, token_pos, storage);
3494 }
3495 
3496 list<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3497  list<CTempStringEx>& arr, TSplitFlags flags,
3498  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3501  return s_Split(str, delim, arr, flags, token_pos, storage);
3502 }
3503 
3504 vector<CTempStringEx>& NStr::Split(const CTempString str, const CTempString delim,
3505  vector<CTempStringEx>& arr, TSplitFlags flags,
3506  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3509  return s_Split(str, delim, arr, flags, token_pos, storage);
3510 }
3511 
3512 list<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3513  list<string>& arr, TSplitFlags flags,
3514  vector<SIZE_TYPE>* token_pos)
3515 {
3516  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3517 }
3518 
3519 vector<string>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3520  vector<string>& arr, TSplitFlags flags,
3521  vector<SIZE_TYPE>* token_pos)
3522 {
3523  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos);
3524 }
3525 
3526 list<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3527  list<CTempString>& arr, TSplitFlags flags,
3528  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3531  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3532 }
3533 
3534 vector<CTempString>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3535  vector<CTempString>& arr, TSplitFlags flags,
3536  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3539  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3540 }
3541 
3542 list<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3543  list<CTempStringEx>& arr, TSplitFlags flags,
3544  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3547  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3548 }
3549 
3550 vector<CTempStringEx>& NStr::SplitByPattern(const CTempString str, const CTempString delim,
3551  vector<CTempStringEx>& arr, TSplitFlags flags,
3552  vector<SIZE_TYPE>* token_pos, CTempString_Storage* storage)
3553 {
3555  return s_Split(str, delim, arr, fSplit_ByPattern | flags, token_pos, storage);
3556 }
3557 
3558 
3559 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3560  string& str1, string& str2, TSplitFlags flags)
3561 {
3562  CTempStringEx ts1, ts2;
3563  CTempString_Storage storage;
3564  bool result = SplitInTwo(str, delim, ts1, ts2, flags, &storage);
3565  str1 = ts1;
3566  str2 = ts2;
3567  return result;
3568 }
3569 
3570 
3571 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3572  CTempString& str1, CTempString& str2, TSplitFlags flags,
3573  CTempString_Storage* storage)
3574 {
3575  CTempStringEx ts1, ts2;
3576  bool result = SplitInTwo(str, delim, ts1, ts2, flags, storage);
3577  str1 = ts1;
3578  str2 = ts2;
3579  return result;
3580 }
3581 
3582 
// Core implementation of SplitInTwo(): cut <str> at the first delimiter.
// The first Advance() call collects everything up to the first delimiter into
// <str1> and records the delimiter position; the delimiter set is then cleared
// so that the second Advance() call collects the whole remainder into <str2>.
// Returns true when a delimiter position was recorded (delim_pos != NPOS).
// NOTE(review): source lines 3587-3590 (where TSplitter is presumably defined)
// and 3605 are not visible in this listing -- verify against the repository copy.
3583 bool NStr::SplitInTwo(const CTempString str, const CTempString delim,
3584  CTempStringEx& str1, CTempStringEx& str2,
3585  TSplitFlags flags, CTempString_Storage* storage)
3586 {
3591 
3592  CTempStringList part_collector(storage);
3593  TSplitter splitter(str, delim, flags, storage);
3594  SIZE_TYPE delim_pos = NPOS;
3595 
3596  // get first part
3597  splitter.Advance(&part_collector, NULL, &delim_pos);
3598  part_collector.Join(&str1);
3599  part_collector.Clear();
3600 
3601  // don't need further splitting, just quote and escape parsing
3602  splitter.SetDelim(kEmptyStr);
3603  splitter.Advance(&part_collector);
3604  part_collector.Join(&str2);
3606  return delim_pos != NPOS;
3607 }
3608 
// Helper for NStr::Sanitize(): append character <c> to the local output buffer
// `out` and remember it in the local variable `last`.
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement and stays correct inside an unbraced if/else; the argument is
// evaluated only once.
#define SS_ADD_CHAR(c)                          \
    do {                                        \
        const char ss_add_char_value_ = (c);    \
        out.push_back(ss_add_char_value_);      \
        last = ss_add_char_value_;              \
    } while (0)
3613 
// Build a sanitized copy of <str>.
//
// Each input character is classified as allowed or rejected:
//  - by the character classes selected with the fSS_alpha/digit/alnum/print/
//    cntrl/punct flags (fSS_print is assumed when neither a class flag nor a
//    custom character list is given); fSS_Reject inverts the class test;
//  - a character that is not allowed so far becomes allowed if it is listed
//    in <allow_chars>;
//  - an allowed character becomes rejected if it is listed in <reject_chars>.
//
// Allowed characters are copied to the result. Allowed spaces are skipped
// while no non-space output exists yet (unless fSS_NoTruncate_Begin) and runs
// of spaces are merged into one (unless fSS_NoMerge).
// Rejected characters are dropped (fSS_Remove) or replaced with
// <reject_replacement>; consecutive replacements are merged unless fSS_NoMerge.
// Finally, trailing spaces are removed unless fSS_NoTruncate_End is set.
3614 string NStr::Sanitize(CTempString str, CTempString allow_chars, CTempString reject_chars,
3615  char reject_replacement, TSS_Flags flags)
3616 {
3617  string out;
3618  out.reserve(str.size());
3619 
3620  // Use fSS_print by default if no any other filter, including custom
3621  bool have_class = (flags & (fSS_alpha | fSS_digit | fSS_alnum | fSS_print | fSS_cntrl | fSS_punct)) > 0;
3622  if ( allow_chars.empty() && reject_chars.empty() && !have_class ) {
3623  flags |= fSS_print;
3624  have_class = true;
3625  }
3626 
// have_allowed: set once something other than a skipped leading space has been written.
// last: the most recent character appended through SS_ADD_CHAR (used for merging).
3627  bool have_allowed = false;
3628  char last = '\0';
3629 
3630  for (char c : str) {
3631 
3632  // Check against filters: character classes via flags, allowed chars, rejected chars.
3633  bool allowed = false;
3634  if ( have_class ) {
// Start from the verdict for "no class matched": allowed only in reverse (fSS_Reject) mode.
3635  allowed = ((flags & fSS_Reject) != 0);
3636  if (((flags & fSS_print) && isprint((unsigned char)c)) ||
3637  ((flags & fSS_alnum) && isalnum((unsigned char)c)) ||
3638  ((flags & fSS_alpha) && isalpha((unsigned char)c)) ||
3639  ((flags & fSS_digit) && isdigit((unsigned char)c)) ||
3640  ((flags & fSS_cntrl) && iscntrl((unsigned char)c)) ||
3641  ((flags & fSS_punct) && ispunct((unsigned char)c)) ) {
3642 
3643  // If matched and reverse logic -- treat char as rejected
3644  allowed = ((flags & fSS_Reject) == 0);
3645  }
3646  }
3647  else {
3648  // Special case: no any character class specified in flags
3649 
3650  // If <allow_chars> and fSS_Reject flag, then no any character allowed except <allow_chars>
3651  // -- "allow" already FALSE, no need to check this;
3652  // -- <allow_chars> will be checked below.
3653 
3654  // If <reject_chars> and no fSS_Reject flag, then all characters allowed except <reject_chars>.
3655  if (!reject_chars.empty() && ((flags & fSS_Reject) == 0)) {
3656  allowed = true;
3657  }
3658  // -- <reject_chars> will be checked below.
3659  }
// Custom lists are applied after the class test: <allow_chars> can only add,
// <reject_chars> can only remove, and <reject_chars> is checked last.
3660  if (!allowed && !allow_chars.empty() && allow_chars.find(c) != NPOS ) {
3661  allowed = true;
3662  }
3663  if (allowed && !reject_chars.empty() && reject_chars.find(c) != NPOS ) {
3664  allowed = false;
3665  }
3666 
3667  // Good character?
3668  if ( allowed ) {
3669  // Special processing for allowed spaces.
3670  // Truncate leading spaces and merge if necessary
3671  if ( c == ' ' ) {
3672  if (!have_allowed && !(flags & fSS_NoTruncate_Begin)) {
3673  // Skip spaces at start of the string
3674  continue;
3675  }
3676  if (flags & fSS_NoMerge) {
3677  SS_ADD_CHAR(c);
3678  }
3679  else {
3680  // Merge spaces
3681  if (last != ' ') {
3682  SS_ADD_CHAR(c);
3683  }
3684  }
3685  }
3686  else {
3687  // Some other allowed character
3688  SS_ADD_CHAR(c);
3689  have_allowed = true;
3690  }
3691  continue;
3692  }
3693 
3694  // Rejected
3695  if ( flags & fSS_Remove ) {
3696  continue;
3697  }
3698  // Special check on leading spaces, if <reject_replacement> is a space
3699  if (reject_replacement == ' ') {
3700  if (!have_allowed && !(flags & fSS_NoTruncate_Begin)) {
3701  // Skip spaces at start of the string
3702  continue;
3703  }
3704  }
3705  // Replace rejected character
3706  if (flags & fSS_NoMerge) {
3707  SS_ADD_CHAR(reject_replacement);
3708  have_allowed = true;
3709  }
3710  else {
3711  // Merge rejected
3712  if (last != reject_replacement) {
3713  SS_ADD_CHAR(reject_replacement);
3714  have_allowed = true;
3715  }
3716  }
3717  }
3718 
3719  // Truncate trailing spaces if necessary
3720  if (last == ' ' && !(flags & fSS_NoTruncate_End)) {
// Here `last` is a space, so this finds the last non-space character of the output.
3721  SIZE_TYPE pos = out.find_last_not_of(last);
3722  if (pos == NPOS) {
3723  out.clear();
3724  }
3725  else {
3726  out.resize(pos+1);
3727  }
3728  }
3729 
3730  return out;
3732 
3734 
// Target language selector for s_PrintableString(), which tests its <lang>
// argument against eLanguage_C and eLanguage_Javascript.
// NOTE(review): the enumerator lines (source lines 3736-3737) are not visible
// in this listing -- verify against the repository copy.
3735 enum ELanguage {
3738 };
3739 
3740 
// Return a copy of <str> in which special characters are backslash-escaped.
//
//  - '\b', '\f', '\r', '\t', '\v' become their two-character escapes; '\a'
//    becomes "\a" only for eLanguage_C, otherwise an octal escape.
//  - '\n' becomes "\n"; when <mode> has fNewLine_Passthru it is written as
//    "\n", a backslash and a real newline.
//  - backslash, single quote and double quote get a backslash prefix.
//  - '&' is escaped only for eLanguage_Javascript.
//  - '?' is escaped only for eLanguage_C and only next to another '?'
//    (presumably to protect against trigraphs -- confirm).
//  - non-ASCII characters are written as octal escapes when <mode> has
//    fNonAscii_Quote; non-printable characters always are.
//  - every other character is copied unchanged.
//
// The output stream is created lazily on the first escaped character; if no
// character needs escaping, the original string is returned as is.
// NOTE(review): source line 3742 (a parameter line, presumably declaring
// <mode>) is not visible in this listing.
3741 static string s_PrintableString(const CTempString str,
3743  ELanguage lang)
3744 {
3745  unique_ptr<CNcbiOstrstream> out;
// i: current position; j: start of the not-yet-copied run of unescaped characters.
3746  SIZE_TYPE i, j = 0;
3747 
3748  for (i = 0; i < str.size(); ++i) {
3749  bool octal = false;
3750  char c = str[i];
// In this switch, "break" means "escape this character" and "continue"
// means "leave it in the pending unescaped run".
3751  switch (c) {
3752  case '\a':
3753  if (lang == eLanguage_C)
3754  c = 'a';
3755  else
3756  octal = true;
3757  break;
3758  case '\b':
3759  c = 'b';
3760  break;
3761  case '\f':
3762  c = 'f';
3763  break;
3764  case '\r':
3765  c = 'r';
3766  break;
3767  case '\t':
3768  c = 't';
3769  break;
3770  case '\v':
3771  c = 'v';
3772  break;
3773  case '\n':
3774  if (!(mode & NStr::fNewLine_Passthru))
3775  c = 'n';
3776  /*FALLTHRU*/
3777  case '\\':
3778  case '\'':
3779  case '"':
3780  break;
3781  case '&':
3782  if (lang == eLanguage_Javascript)
3783  break;
3784  continue;
3785  case '?':
3786  if (lang == eLanguage_C) {
3787  if (i && str[i - 1] == '?')
3788  break;
3789  if (i < str.size() - 1 && str[i + 1] == '?')
3790  break;
3791  }
3792  continue;
3793  default:
3794  if (!isascii((unsigned char) c)) {
3795  if (mode & NStr::fNonAscii_Quote) {
3796  octal = true;
3797  break;
3798  }
3799  }
3800  if (!isprint((unsigned char) c)) {
3801  octal = true;
3802  break;
3803  }
3804  continue;
3805  }
3806  if (!out.get()) {
3807  out.reset(new CNcbiOstrstream);
3808  }
// Flush the pending run of unescaped characters before writing the escape.
3809  if (i > j) {
3810  out->write(str.data() + j, i - j);
3811  }
3812  out->put('\\');
// c is still '\n' only when fNewLine_Passthru kept it: write "n", a backslash and a real newline.
3813  if (c == '\n') {
3814  out->write("n\\\n", 3);
3815  } else if (octal) {
// Leading zero digits may be dropped ("reduce") unless fPrintable_Full is set
// or the next input character is itself an octal digit.
3816  bool reduce;
3817  if (!(mode & NStr::fPrintable_Full)) {
3818  reduce = (i == str.size() - 1 ||
3819  str[i + 1] < '0' || '7' < str[i + 1] ? true : false);
3820  } else {
3821  reduce = false;
3822  }
3823  unsigned char v;
3824  char val[3];
3825  int k = 0;
3826  v = (unsigned char) c >> 6;
3827  if (v || !reduce) {
3828  val[k++] = char('0' + v);
3829  reduce = false;
3830  }
3831  v = ((unsigned char) c >> 3) & 7;
3832  if (v || !reduce) {
3833  val[k++] = char('0' + v);
3834  }
3835  v = (unsigned char) c & 7;
3836  val[k++] = char('0' + v);
3837  out->write(val, k);
3838  } else {
3839  out->put(c);
3840  }
3841  j = i + 1;
3842  }
// j != 0 means at least one character was escaped, so the stream exists;
// copy the unescaped tail that follows the last escape.
3843  if (j && i > j) {
3844  _ASSERT(out.get());
3845  out->write(str.data() + j, i - j);
3846  }
3847  if (out.get()) {
3848  // Return encoded string
3849  return CNcbiOstrstreamToString(*out);
3850  }
3851 
3852  // All characters are good - return (a copy of) the original string
3853  return str;
3854 }
3855 
3856 
3857 string NStr::Escape(const CTempString str, const CTempString metacharacters, char escape_char)
3858 {
3859  string out;
3860  if ( str.empty() ) {
3861  return out;
3862  }
3863  out.reserve(str.size() * 2); // maximum size for a new string (have all metacharacters)
3864 
3865  for (char c : str) {
3866  if (c == escape_char || metacharacters.find(c) != NPOS) {
3867  out += escape_char;
3868  }
3869  out += c;
3870  }
3871  return out;
3872 }
3873 
3874 
3875 string NStr::Unescape(const CTempString str, char escape_char)
3876 {
3877  string out;
3878  if ( str.empty() ) {
3879  return out;
3880  }
3881  out.reserve(str.size());
3882  bool escaped = false;
3883 
3884  for (char c : str) {
3885  if (escaped) {
3886  out += c;
3887  escaped = false;
3888  }
3889  else {
3890  if (c == escape_char) {
3891  escaped = true;
3892  }
3893  else {
3894  out += c;
3895  }
3896  }
3897  }
3898  return out;
3899 }
3900 
3901 
3902 string NStr::Quote(const CTempString str, char quote_char, char escape_char)
3903 {
3904  string out;
3905  if (str.empty()) {
3906  return out;
3907  }
3908  out.reserve(str.size() * 2); // maximum size for a new string
3909 
3910  out.push_back(quote_char);
3911  for (char c : str) {
3912  if (c == quote_char || c == escape_char) {
3913  out += escape_char;
3914  }
3915  out += c;
3916  }
3917  out.push_back(quote_char);
3919  return out;
3920 }
3921 
3922 
3923 string NStr::Unquote(const CTempString str, char escape_char)
3924 {
3925  string out;
3926  if (str.empty()) {
3927  return out;
3928  }
3929  out.reserve(str.size());
3930  bool escaped = false;
3931  char quote_char = str[0];
3932 
3933  if (str.length() < 2 || str[str.length()-1] != quote_char) {
3934  NCBI_THROW2(CStringException, eFormat,
3935  "The source string must start and finish with the same character", 0);
3936  }
3937  // Remove first and last characters ("quotes")
3938  CTempString s(str, 1, str.length() - 2);
3939 
3940  for (char c : s) {
3941  if (escaped) {
3942  out += c;
3943  escaped = false;
3944  }
3945  else {
3946  if (c == escape_char) {
3947  escaped = true;
3948  }
3949  else {
3950  out += c;
3951  }
3952  }
3953  }
3954  return out;
3955 }
3956 
3957 
3961 }
3962 
3963 
3965 {
3966  return s_PrintableString(str,
3969 }
3970 
3971 
3972 string NStr::CEncode(const CTempString str, EQuoted quoted)
3973 {
3974  switch (quoted) {
3975  case eNotQuoted:
3976  return PrintableString(str);
3977  case eQuoted:
3978  return '"' + PrintableString(str) + '"';
3979  }
3980  _TROUBLE;
3981  // Unreachable
3982  return str;
3983 }
3984 
3985 
3986 string NStr::CParse(const CTempString str, EQuoted quoted)
3987 {
3988  if (quoted == eNotQuoted) {
3989  return ParseEscapes(str);
3990  }
3991  _ASSERT(quoted == eQuoted);
3992 
3993  SIZE_TYPE pos;
3994  SIZE_TYPE len = str.length();
3995  const char quote_char = '"';
3996 
3997  if (len < 2 || str[0] != quote_char || str[len-1] != quote_char) {
3998  NCBI_THROW2(CStringException, eFormat,
3999  "The source string must start and finish with a double quote", 0);
4000  }
4001 
4002  // Flag that next char is escaped, ignore it
4003  bool escaped = false;
4004  // We have a quote mark, start collect string chars
4005  bool collect = true;
4006  // Position of last quote
4007  SIZE_TYPE last_quote = 0;
4008 
4009  string out;
4010  out.reserve(str.size());
4011 
4012  for (pos = 1; pos < len; ++pos) {
4013  unsigned char ch = str[pos];
4014  if (ch == quote_char && !escaped) {
4015  // Have a substring
4016  CTempString sub(str.data() + last_quote + 1, pos - last_quote - 1);
4017  if (collect) {
4018  // Parse escape sequences and add it to result
4019  out += ParseEscapes(sub);
4020  } else {
4021  // Possible we have adjacent strings ("A""B").
4022  if (pos != last_quote + 1) {
4023  NCBI_THROW2(CStringException, eFormat,
4024  "Quoted string format error", pos);
4025  }
4026  }
4027  last_quote = pos;
4028  collect = !collect;
4029  } else {
4030  escaped = ch == '\\' ? !escaped : false;
4031  }
4032  }
4033  if (escaped || last_quote != len-1) {
4034  NCBI_THROW2(CStringException, eFormat,
4035  "Unterminated quoted string", str.length());
4036  }
4037  return out;
4038 }
4039 
4040 
// Encode <str> for use in XML.
//  - '&', '<', '>', '\'' and '"' become the predefined entities.
//  - With eXmlEnc_CommentSafe a double hyphen becomes "-&#x2d;" and a hyphen
//    at the very end of the string becomes "&#x2d;".
//  - Characters with a code below 0x20 are written as hexadecimal character
//    references ("&#x..;"); everything else is copied unchanged.
//  - Characters in the "unsafe" ranges listed below are skipped when
//    eXmlEnc_Unsafe_Skip is set, otherwise CStringException (eConvert) is thrown.
// NOTE(review): source line 4087 (presumably the flag test that opens the
// "unsafe characters" block) is not visible in this listing, which is why the
// braces look unbalanced here -- verify against the repository copy.
4041 string NStr::XmlEncode(const CTempString str, TXmlEncode flags)
4042 // http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent
4043 {
4044  string result;
4045  SIZE_TYPE i;
4046 
4047  // wild guess...
4048  result.reserve(str.size());
4049 
4050  for (i = 0; i < str.size(); i++) {
4051  char c = str[i];
4052  switch ( c ) {
4053  case '&':
4054  result.append("&amp;");
4055  break;
4056  case '<':
4057  result.append("&lt;");
4058  break;
4059  case '>':
4060  result.append("&gt;");
4061  break;
4062  case '\'':
4063  result.append("&apos;");
4064  break;
4065  case '"':
4066  result.append("&quot;");
4067  break;
4068  case '-':
4069  // translate double hyphen and ending hyphen
4070  // http://www.w3.org/TR/xml11/#sec-comments
4071  if (flags & eXmlEnc_CommentSafe) {
4072  if (i+1 == str.size()) {
4073  result.append("&#x2d;");
4074  break;
4075  } else if (str[i+1] == '-') {
// Consume both hyphens: the first is copied, the second is encoded.
4076  ++i;
4077  result.append(1, c).append("&#x2d;");
4078  break;
4079  }
4080  }
4081  result.append(1, c);
4082  break;
4083 
4084  default:
// NOTE(review): <c> is a plain char; where char is signed, bytes >= 0x80
// convert to very large unsigned values, so the 0x7F-0x84 and 0x86-0x9F range
// tests below would not match them -- confirm the intended behavior.
4085  unsigned int uc = (unsigned int)(c);
4086 
4088  // Optional check on non-safe characters:
4089  // [#x1-#x8], [#xB-#xC], [#xE-#x1F], [#x7F-#x84], [#x86-#x9F]
4090  // https://www.w3.org/TR/xml11/#NT-Char
4091 
4092  if ((uc < 0x8) || (uc == 0xB) || (uc == 0xC) ||
4093  (uc >= 0x0E && uc <=0x1F) ||
4094  (uc >= 0x7F && uc <=0x84) ||
4095  (uc >= 0x86 && uc <=0x9F) )
4096  {
4097  // Skip unsafe characters
4098  if (flags & eXmlEnc_Unsafe_Skip) {
4099  continue;
4100  }
4101  // else, throw
4102  NCBI_THROW2(CStringException, eConvert,
4103  "NStr::XmlEncode -- Unsafe character '0x" + NStr::NumericToString(c, 0, 16) + "'", i);
4104  }
4105  }
4106  // Default behavior
4107  if (uc < 0x20) {
// Write the code as one or two lowercase hex digits (no leading zero).
4108  const char* charmap = "0123456789abcdef";
4109  result.append("&#x");
4110  Uint1 ch = c;
4111  unsigned hi = ch >> 4;
4112  unsigned lo = ch & 0xF;
4113  if ( hi ) {
4114  result.append(1, charmap[hi]);
4115  }
4116  result.append(1, charmap[lo]).append(1, ';');
4117  } else {
4118  result.append(1, c);
4119  }
4120  break;
4121  }
4122  }
4123  return result;
4124 }
4125 
4126 
// Encode <str> for use in HTML.
//  - '<', '>', '\'' and '"' become "&lt;", "&gt;", "&apos;" and "&quot;".
//  - '&' becomes "&amp;", except that with fHtmlEnc_SkipEntities an ampersand
//    that already starts something shaped like an entity ("&name;" or
//    "&#digits;") is copied as a bare '&' so it is not encoded twice.
//  - Symbols below 0x20 and above 0x7F are written as hexadecimal character
//    references; everything else is copied unchanged.
// Each symbol is obtained with CUtf8::Decode(curr) (presumably advancing
// <curr> over multi-byte sequences -- confirm).
// NOTE(review): source lines 4158, 4169 and 4182 are not visible in this
// listing (presumably additional flag tests), which is why the braces look
// unbalanced here -- verify against the repository copy.
4127 string NStr::HtmlEncode(const CTempString str, THtmlEncode flags)
4128 {
4129  string result;
4130  SIZE_TYPE i;
// Position of the next ';' found so far; NPOS once no further ';' exists,
// which disables the entity check for the rest of the string.
4131  SIZE_TYPE semicolon = 0;
4132 
4133  // wild guess...
4134  result.reserve(str.size());
4135 
4136  const char* begin = str.data();
4137  const char* end = begin + str.size();
4138  for ( const char* curr = begin; curr < end; ++curr ) {
4139  TUnicodeSymbol c = CUtf8::Decode(curr);
4140  switch ( c ) {
4141  case '&':
4142  {{
4143  i = curr - begin;
// The '&' itself is always emitted; "amp;" is appended below unless an entity was recognized.
4144  result.append("&");
4145  // Check on HTML entity
4146  bool is_entity = false;
4147  if ((flags & fHtmlEnc_SkipEntities) &&
4148  (i+2 < str.size()) && (semicolon != NPOS)) {
4149 
4150  if ( i >= semicolon ) {
4151  semicolon = str.find(";", i+1);
4152  }
4153  if ( semicolon != NPOS ) {
4154  SIZE_TYPE len = semicolon - i;
4155  SIZE_TYPE p = i + 1;
4156  if (str[i+1] == '#') {
4157  // Check on numeric character reference encoding
4159  p++;
// NOTE(review): "len || len <= 4" is always true (a zero <len> still satisfies
// "len <= 4"), so no length limit is applied to numeric references; the
// literal-entity branch below uses "len && len <= 10". "&&" was probably
// intended here -- confirm.
4160  if (len || len <= 4) {
4161  for (; p < semicolon; ++p) {
4162  if (!isdigit((unsigned char)(str[p])))
4163  break;
4164  }
4165  }
4166  }
4167  } else {
4168  // Check on literal entity
4170  if (len && len <= 10) {
4171  for (; p < semicolon; ++p) {
4172  if (!isalpha((unsigned char)(str[p])))
4173  break;
4174  }
4175  }
4176  }
4177  }
// The scan reached the ';' only if every character in between passed the test.
4178  is_entity = (p == semicolon);
4179  }
4180  }
4181  if ( is_entity ) {
4183  ERR_POST_X_ONCE(5, Info << "string \"" << str <<
4184  "\" contains HTML encoded entities");
4185  }
4186  } else {
4187  result.append("amp;");
4188  }
4189  }}
4190  break;
4191  case '<':
4192  result.append("&lt;");
4193  break;
4194  case '>':
4195  result.append("&gt;");
4196  break;
4197  case '\'':
4198  result.append("&apos;");
4199  break;
4200  case '"':
4201  result.append("&quot;");
4202  break;
4203  default:
4204  if ((unsigned int)c < 0x20) {
// Write the code as one or two lowercase hex digits (no leading zero).
4205  const char* charmap = "0123456789abcdef";
4206  result.append("&#x");
4207  Uint1 ch = c;
4208  unsigned hi = ch >> 4;
4209  unsigned lo = ch & 0xF;
4210  if ( hi ) {
4211  result.append(1, charmap[hi]);
4212  }
4213  result.append(1, charmap[lo]).append(1, ';');
4214  } else if (c > 0x7F) {
4215  result.append("&#x").append( NStr::NumericToString(c, 0, 16)).append(1, ';');;
4216  } else {
4217  result.append(1, c);
4218  }
4219  break;
4220  }
4221  }
4222  return result;
4223 }
4224 
4225 
4226 // Character entity references
4227 // http://www.w3.org/TR/html4/sgml/entities.html
4228 // http://www.w3.org/TR/1998/REC-html40-19980424/charset.html#h-5.3
4229 // only some entities from here were added (those shifted to right):
4230 // http://dev.w3.org/html5/html-author/charref
4231 
// Table mapping Unicode code points to HTML entity names.
//   u -- the code point;
//   s -- the entity name, without the leading '&' and the trailing ';'.
// The table ends with a { 0, 0 } sentinel: the loops in this file that walk
// it stop at the first entry whose <u> is 0.
// The entries are not strictly ordered by code point (8472 "weierp" precedes
// 8465 "image"); the visible users scan the table linearly, so the order does
// not matter for them.
4232 static struct tag_HtmlEntities
4233 {
4234  TUnicodeSymbol u;
4235  const char* s;
4236 }
4237 const s_HtmlEntities[] = {
4238  { 9, "Tab" },
4239  { 10, "NewLine" },
4240  { 33, "excl" },
4241  { 34, "quot" },
4242  { 35, "num" },
4243  { 36, "dollar" },
4244  { 37, "percnt" },
4245  { 38, "amp" },
4246  { 39, "apos" },
4247  { 40, "lpar" },
4248  { 41, "rpar" },
4249  { 42, "ast" },
4250  { 43, "plus" },
4251  { 44, "comma" },
4252  { 46, "period" },
4253  { 47, "sol" },
4254  { 58, "colon" },
4255  { 59, "semi" },
4256  { 60, "lt" },
4257  { 61, "equals" },
4258  { 62, "gt" },
4259  { 63, "quest" },
4260  { 64, "commat" },
4261  { 91, "lsqb" },
4262  { 92, "bsol" },
4263  { 93, "rsqb" },
4264  { 94, "Hat" },
4265  { 95, "lowbar" },
4266  { 96, "grave" },
4267  { 123, "lcub" },
4268  { 124, "verbar" },
4269  { 125, "rcub" },
4270  { 160, "nbsp" },
4271  { 161, "iexcl" },
4272  { 162, "cent" },
4273  { 163, "pound" },
4274  { 164, "curren" },
4275  { 165, "yen" },
4276  { 166, "brvbar" },
4277  { 167, "sect" },
4278  { 168, "uml" },
4279  { 169, "copy" },
4280  { 170, "ordf" },
4281  { 171, "laquo" },
4282  { 172, "not" },
4283  { 173, "shy" },
4284  { 174, "reg" },
4285  { 175, "macr" },
4286  { 176, "deg" },
4287  { 177, "plusmn" },
4288  { 178, "sup2" },
4289  { 179, "sup3" },
4290  { 180, "acute" },
4291  { 181, "micro" },
4292  { 182, "para" },
4293  { 183, "middot" },
4294  { 184, "cedil" },
4295  { 185, "sup1" },
4296  { 186, "ordm" },
4297  { 187, "raquo" },
4298  { 188, "frac14" },
4299  { 189, "frac12" },
4300  { 190, "frac34" },
4301  { 191, "iquest" },
4302  { 192, "Agrave" },
4303  { 193, "Aacute" },
4304  { 194, "Acirc" },
4305  { 195, "Atilde" },
4306  { 196, "Auml" },
4307  { 197, "Aring" },
4308  { 198, "AElig" },
4309  { 199, "Ccedil" },
4310  { 200, "Egrave" },
4311  { 201, "Eacute" },
4312  { 202, "Ecirc" },
4313  { 203, "Euml" },
4314  { 204, "Igrave" },
4315  { 205, "Iacute" },
4316  { 206, "Icirc" },
4317  { 207, "Iuml" },
4318  { 208, "ETH" },
4319  { 209, "Ntilde" },
4320  { 210, "Ograve" },
4321  { 211, "Oacute" },
4322  { 212, "Ocirc" },
4323  { 213, "Otilde" },
4324  { 214, "Ouml" },
4325  { 215, "times" },
4326  { 216, "Oslash" },
4327  { 217, "Ugrave" },
4328  { 218, "Uacute" },
4329  { 219, "Ucirc" },
4330  { 220, "Uuml" },
4331  { 221, "Yacute" },
4332  { 222, "THORN" },
4333  { 223, "szlig" },
4334  { 224, "agrave" },
4335  { 225, "aacute" },
4336  { 226, "acirc" },
4337  { 227, "atilde" },
4338  { 228, "auml" },
4339  { 229, "aring" },
4340  { 230, "aelig" },
4341  { 231, "ccedil" },
4342  { 232, "egrave" },
4343  { 233, "eacute" },
4344  { 234, "ecirc" },
4345  { 235, "euml" },
4346  { 236, "igrave" },
4347  { 237, "iacute" },
4348  { 238, "icirc" },
4349  { 239, "iuml" },
4350  { 240, "eth" },
4351  { 241, "ntilde" },
4352  { 242, "ograve" },
4353  { 243, "oacute" },
4354  { 244, "ocirc" },
4355  { 245, "otilde" },
4356  { 246, "ouml" },
4357  { 247, "divide" },
4358  { 248, "oslash" },
4359  { 249, "ugrave" },
4360  { 250, "uacute" },
4361  { 251, "ucirc" },
4362  { 252, "uuml" },
4363  { 253, "yacute" },
4364  { 254, "thorn" },
4365  { 255, "yuml" },
4366  { 338, "OElig" },
4367  { 339, "oelig" },
4368  { 352, "Scaron" },
4369  { 353, "scaron" },
4370  { 376, "Yuml" },
4371  { 402, "fnof" },
4372  { 710, "circ" },
4373  { 732, "tilde" },
4374  { 913, "Alpha" },
4375  { 914, "Beta" },
4376  { 915, "Gamma" },
4377  { 916, "Delta" },
4378  { 917, "Epsilon" },
4379  { 918, "Zeta" },
4380  { 919, "Eta" },
4381  { 920, "Theta" },
4382  { 921, "Iota" },
4383  { 922, "Kappa" },
4384  { 923, "Lambda" },
4385  { 924, "Mu" },
4386  { 925, "Nu" },
4387  { 926, "Xi" },
4388  { 927, "Omicron" },
4389  { 928, "Pi" },
4390  { 929, "Rho" },
4391  { 931, "Sigma" },
4392  { 932, "Tau" },
4393  { 933, "Upsilon" },
4394  { 934, "Phi" },
4395  { 935, "Chi" },
4396  { 936, "Psi" },
4397  { 937, "Omega" },
4398  { 945, "alpha" },
4399  { 946, "beta" },
4400  { 947, "gamma" },
4401  { 948, "delta" },
4402  { 949, "epsilon" },
4403  { 950, "zeta" },
4404  { 951, "eta" },
4405  { 952, "theta" },
4406  { 953, "iota" },
4407  { 954, "kappa" },
4408  { 955, "lambda" },
4409  { 956, "mu" },
4410  { 957, "nu" },
4411  { 958, "xi" },
4412  { 959, "omicron" },
4413  { 960, "pi" },
4414  { 961, "rho" },
4415  { 962, "sigmaf" },
4416  { 963, "sigma" },
4417  { 964, "tau" },
4418  { 965, "upsilon" },
4419  { 966, "phi" },
4420  { 967, "chi" },
4421  { 968, "psi" },
4422  { 969, "omega" },
4423  { 977, "thetasym" },
4424  { 978, "upsih" },
4425  { 982, "piv" },
4426  { 8194, "ensp" },
4427  { 8195, "emsp" },
4428  { 8201, "thinsp" },
4429  { 8204, "zwnj" },
4430  { 8205, "zwj" },
4431  { 8206, "lrm" },
4432  { 8207, "rlm" },
4433  { 8211, "ndash" },
4434  { 8212, "mdash" },
4435  { 8216, "lsquo" },
4436  { 8217, "rsquo" },
4437  { 8218, "sbquo" },
4438  { 8220, "ldquo" },
4439  { 8221, "rdquo" },
4440  { 8222, "bdquo" },
4441  { 8224, "dagger" },
4442  { 8225, "Dagger" },
4443  { 8226, "bull" },
4444  { 8230, "hellip" },
4445  { 8240, "permil" },
4446  { 8242, "prime" },
4447  { 8243, "Prime" },
4448  { 8249, "lsaquo" },
4449  { 8250, "rsaquo" },
4450  { 8254, "oline" },
4451  { 8260, "frasl" },
4452  { 8364, "euro" },
4453  { 8472, "weierp" },
4454  { 8465, "image" },
4455  { 8476, "real" },
4456  { 8482, "trade" },
4457  { 8501, "alefsym" },
4458  { 8592, "larr" },
4459  { 8593, "uarr" },
4460  { 8594, "rarr" },
4461  { 8595, "darr" },
4462  { 8596, "harr" },
4463  { 8629, "crarr" },
4464  { 8656, "lArr" },
4465  { 8657, "uArr" },
4466  { 8658, "rArr" },
4467  { 8659, "dArr" },
4468  { 8660, "hArr" },
4469  { 8704, "forall" },
4470  { 8706, "part" },
4471  { 8707, "exist" },
4472  { 8709, "empty" },
4473  { 8711, "nabla" },
4474  { 8712, "isin" },
4475  { 8713, "notin" },
4476  { 8715, "ni" },
4477  { 8719, "prod" },
4478  { 8721, "sum" },
4479  { 8722, "minus" },
4480  { 8727, "lowast" },
4481  { 8730, "radic" },
4482  { 8733, "prop" },
4483  { 8734, "infin" },
4484  { 8736, "ang" },
4485  { 8743, "and" },
4486  { 8744, "or" },
4487  { 8745, "cap" },
4488  { 8746, "cup" },
4489  { 8747, "int" },
4490  { 8756, "there4" },
4491  { 8764, "sim" },
4492  { 8773, "cong" },
4493  { 8776, "asymp" },
4494  { 8800, "ne" },
4495  { 8801, "equiv" },
4496  { 8804, "le" },
4497  { 8805, "ge" },
4498  { 8834, "sub" },
4499  { 8835, "sup" },
4500  { 8836, "nsub" },
4501  { 8838, "sube" },
4502  { 8839, "supe" },
4503  { 8853, "oplus" },
4504  { 8855, "otimes" },
4505  { 8869, "perp" },
4506  { 8901, "sdot" },
4507  { 8968, "lceil" },
4508  { 8969, "rceil" },
4509  { 8970, "lfloor" },
4510  { 8971, "rfloor" },
4511  { 9001, "lang" },
4512  { 9002, "rang" },
4513  { 9674, "loz" },
4514  { 9824, "spades" },
4515  { 9827, "clubs" },
4516  { 9829, "hearts" },
4517  { 9830, "diams" },
4518  { 0, 0 }
4519 };
4520 
4522 {
4523  const struct tag_HtmlEntities* p = s_HtmlEntities;
4524  for ( ; p->u != 0; ++p) {
4525  if (uch == p->u) {
4526  return p->s;
4527  }
4528  }
4529  return kEmptyStr;
4530 }
4531 
// Decode HTML entities and character references in <str>, producing UTF-8.
//  - "&name;" is replaced when <name> is found in s_HtmlEntities;
//  - "&#digits;" and "&#xhex;" are replaced by the symbol with that code;
//  - anything else is copied: as is for UTF-8/ASCII input, otherwise converted
//    to UTF-8 from <encoding> one byte at a time.
// If <encoding> is eEncoding_Unknown it is guessed from the data;
// CStringException (eBadArgs) is thrown when the guess fails.
// When <result_flags> is not null it receives the local <result> value.
// NOTE(review): source lines 4549, 4589, 4595, 4626 and 4634 are not visible
// in this listing (presumably the declaration of <uch>, the updates of
// <result> and the closing brace) -- verify against the repository copy.
4532 string NStr::HtmlDecode(const CTempString str, EEncoding encoding, THtmlDecode* result_flags)
4533 {
4534  string ustr;
4535  THtmlDecode result = 0;
4536 
4537  if (encoding == eEncoding_Unknown) {
4538  encoding = CUtf8::GuessEncoding(str);
4539  if (encoding == eEncoding_Unknown) {
4540  NCBI_THROW2(CStringException, eBadArgs,
4541  "Unable to guess the source string encoding", 0);
4542  }
4543  }
4544  // wild guess...
4545  ustr.reserve(str.size());
4546 
4547  CTempString::const_iterator i, e = str.end();
4548  char ch;
4550 
4551  for (i = str.begin(); i != e;) {
4552  ch = *(i++);
4553  //check for HTML entities and character references
4554  if (i != e && ch == '&') {
4555  CTempString::const_iterator start_of_entity, end_of_entity, itmp;
4556  end_of_entity = itmp = i;
// Classify what follows the '&': a named entity (ent), a decimal reference
// "#digits" (dec) or a hexadecimal reference "#x..." (hex). <itmp> is advanced
// past the '#' and 'x' prefixes as a side effect of these expressions.
4557  bool ent, dec, hex, parsed=false;
4558  ent = isalpha((unsigned char)(*itmp)) != 0;
4559  dec = !ent && *itmp == '#' && ++itmp != e &&
4560  isdigit((unsigned char)(*itmp)) != 0;
4561  hex = !dec && itmp != e &&
4562  (*itmp == 'x' || *itmp == 'X') && ++itmp != e &&
4563  isxdigit((unsigned char)(*itmp)) != 0;
4564  start_of_entity = itmp;
4565 
4566  if (itmp != e && (ent || dec || hex)) {
4567  // do not look too far
4568  for (int len=0; len<16 && itmp != e; ++len, ++itmp) {
4569  if (*itmp == '&' || *itmp == '#') {
4570  break;
4571  }
4572  if (*itmp == ';') {
4573  end_of_entity = itmp;
4574  break;
4575  }
4576  ent = ent && isalnum( (unsigned char)(*itmp)) != 0;
4577  dec = dec && isdigit( (unsigned char)(*itmp)) != 0;
4578  hex = hex && isxdigit((unsigned char)(*itmp)) != 0;
4579  }
// end_of_entity still equals <i> when no ';' was found within the scan limit.
4580  if (end_of_entity != i && (ent || dec || hex)) {
4581  uch = 0;
4582  if (ent) {
4583  string entity(start_of_entity, end_of_entity);
4584  const struct tag_HtmlEntities* p = s_HtmlEntities;
4585  for ( ; p->u != 0; ++p) {
4586  if (entity.compare(p->s) == 0) {
4587  uch = p->u;
4588  parsed = true;
4590  break;
4591  }
4592  }
4593  } else {
4594  parsed = true;
// Accumulate the numeric value digit by digit in base 10 or 16.
4596  for (itmp = start_of_entity; itmp != end_of_entity; ++itmp) {
4597  TUnicodeSymbol ud = *itmp;
4598  if (dec) {
4599  uch = 10 * uch + (ud - '0');
4600  } else if (hex) {
4601  if (ud >='0' && ud <= '9') {
4602  ud -= '0';
4603  } else if (ud >='a' && ud <= 'f') {
4604  ud -= 'a';
4605  ud += 10;
4606  } else if (ud >='A' && ud <= 'F') {
4607  ud -= 'A';
4608  ud += 10;
4609  }
4610  uch = 16 * uch + ud;
4611  }
4612  }
4613  }
4614  if (parsed) {
// Emit the decoded symbol and resume right after the terminating ';'.
4615  ustr += CUtf8::AsUTF8(&uch,1);
4616  i = ++end_of_entity;
4617  continue;
4618  }
4619  }
4620  }
4621  }
4622  // no entity - append as is
4623  if (encoding == eEncoding_UTF8 || encoding == eEncoding_Ascii) {
4624  ustr.append( 1, ch );
4625  } else {
4627  ustr += CUtf8::AsUTF8(CTempString(&ch,1), encoding);
4628  }
4629  }
4630  if (result_flags) {
4631  *result_flags = result;
4632  }
4633  return ustr;
4635 
4636 
4637 // http://www.json.org/
4638 
4639 string NStr::JsonEncode(const CTempString str, EJsonEncode encoding)
4640 {
4641  string result;
4642  // wild guess...
4643  result.reserve(str.size()+2);
4644 
4645  auto encode_char = [&](char c)
4646  {
4647  static const char* charmap = "0123456789abcdef";
4648  result.append("\\u00");
4649  Uint1 ch = c;
4650  unsigned hi = ch >> 4;
4651  unsigned lo = ch & 0xF;
4652  result.append(1, charmap[hi]);
4653  result.append(1, charmap[lo]);
4654  };
4655 
4656  for (auto c : str) {
4657  switch ( c ) {
4658  case '"':
4659  result.append("\\\"");
4660  break;
4661  case '\\':
4662  result.append("\\\\");
4663  break;
4664  default:
4665  if ((unsigned int)c < 0x20) {
4666  // Control characters U+0000 through U+001F
4667  encode_char(c);
4668  } else {
4669  if (encoding == eJsonEnc_UTF8 && (unsigned int)c >= 0x80) {
4670  encode_char(c);
4671  } else {
4672  result.append(1, c);
4673  }
4674  }
4675  break;
4676  }
4677  }
4678  if (encoding == eJsonEnc_Quoted) {
4679  return '"' + result + '"';
4680  }
4681  return result;
4682 }
4683 
4684 
4685 string NStr::ShellEncode(const string& str)
4686 {
4687  // 1. Special-case of non-printable characters. We have no choice and
4688  // must use BASH extensions if we want printable output.
4689  //
4690  // Aesthetic issue: Most people are not familiar with the BASH-only
4691  // quoting style. Avoid it as much as possible.
4692 
4693  ITERATE ( string, it, str ) {
4694  if ( !isprint(Uchar(*it)) ) {
4695  return "$'" + NStr::PrintableString(str) + "'";
4696  }
4697  }
4698 
4699  /////////////////////////////////////////////////////////////////////////
4700  // Bourne Shell quoting as IEEE-standard without special extensions.
4701  //
4702  // There are 3 basic ways to quote/escape in Bourne Shell:
4703  //
4704  // - Single-quotes. All characters (including non-printable
4705  // characters newlines, backslashes), are literal. There is no escape.
4706  // - Double-quotes. Need to escape some metacharacters, such as literal
4707  // escape (\‍), variable expansion ($) and command substitution (`).
4708  // - Escape without quotes. Use backslash.
4709  /////////////////////////////////////////////////////////////////////////
4710 
4711  // 2. Non-empty printable string without meta-characters.
4712  //
4713  // Shell special characters, according to IEEE Std 1003.1,
4714  // plus ! (Bourne shell exit status negation and Bash history expansion),
4715  // braces (Bourne enhanced expansion), space, tab, and newline.
4716  //
4717  // See http://www.opengroup.org/onlinepubs/009695399/toc.htm
4718  // See Bourne and Bash man pages.
4719 
4720  if (!str.empty() &&
4721  str.find_first_of("!{} \t\r\n[|&;<>()$`\"'*?#~=%\\") == NPOS) {
4722  return str;
4723  }
4724 
4725  // 3. Printable string, but either empty or some shell meta-characters.
4726  //
4727  // Aesthetics preference:
4728  // i) If the string includes literal single-quotes, then prefer
4729  // double-quoting provided there is no need to escape embedded
4730  // literal double-quotes, escapes (\‍), variable substitution ($),
4731  // or command substitution (`).
4732 
4733  if (str.find('\'') != NPOS &&
4734  str.find_first_of("\"\\$`") == NPOS) {
4735  return "\"" + str + "\"";
4736  }
4737 
4738  // Use single-quoting. The only special case for Bourne shell
4739  // single-quoting is a literal single-quote, which needs to
4740  // be pulled out of the quoted region.
4741  //
4742  // Single-quoting does not have any escape character, so close
4743  // the quoted string ('), then emit an escaped or quoted literal
4744  // single-quote (\' or "'"), and resume the quoted string (').
4745  //
4746  // Aesthetics preferences:
4747  // ii) Prefer single-quoting over escape characters, especially
4748  // escaped whitespace. However, this is in compromise to optimal
4749  // quoting: if there are many literal single-quotes and the
4750  // use of double-quotes would involve the need to escape embedded
4751  // characters, then it may be more pleasing to escape the
4752  // shell meta-characters, and avoid the need for single-quoting
4753  // in the presence of literal single-quotes.
4754  // iii) If there are no literal double-quotes, then all else being equal,
4755  // avoid double-quotes and prefer escaping. Double-quotes are
4756  // more commonly used by enclosing formats such as ASN.1 Text
4757  // and CVS, and would thus need to be escaped. If there are
4758  // literal double-quotes, then having them is in the output is
4759  // unavoidable, and this aesthetics rule becomes secondary to
4760  // the preference for avoiding escape characters. If there are
4761  // literal escape characters, then having them is unavoidable
4762  // and avoidance of double-quotes is once again recommended.
4763 
4764  // TODO: Should simplify runs of multiple quotes, for example:
4765  // '\'''\'''\'' -> '"'''"'
4766 
4767  bool avoid_double_quotes = (str.find('"') == NPOS ||
4768  str.find('\\') != NPOS);
4769  string s = "'" + NStr::Replace(str, "'",
4770  avoid_double_quotes ? "'\\''" : "'\"'\"'") + "'";
4771 
4772  // Aesthetic improvement: Remove paired single-quotes ('')
4773  // that aren't escaped, as these evaluate to an empty string.
4774  // Don't apply this simplification for the degenerate case when
4775  // the string is the empty string ''. (Non degenerate strings
4776  // must be length greater than 2). Implement the equivalent
4777  // of the Perl regexp:
4778  //
4779  // s/(?<!\\‍)''//g
4780  //
4781  if (s.size() > 2) {
4782  size_t pos = 0;
4783  while ( true ) {
4784  pos = s.find("''", pos);
4785  if (pos == NPOS) break;
4786  if (pos == 0 || s[pos-1] != '\\') {
4787  s.erase(pos, 2);
4788  } else {
4789  ++pos;
4790  }
4791  }
4792  }
4794  return s;
4795 }
4796 
4797 
4798 string NStr::ParseEscapes(const CTempString str, EEscSeqRange mode, char user_char)
4799 {
4800  string out;
4801  out.reserve(str.size()); // result string can only be smaller
4802  SIZE_TYPE pos = 0;
4803  bool is_error = false;
4804 
4805  while (pos < str.size() || !is_error) {
4806  SIZE_TYPE pos2 = str.find('\\', pos);
4807  if (pos2 == NPOS) {
4808  //~ out += str.substr(pos);
4809  CTempString sub(str, pos);
4810  out += sub;
4811  break;
4812  }
4813  //~ out += str.substr(pos, pos2 - pos);
4814  CTempString sub(str, pos, pos2-pos);
4815  out += sub;
4816  if (++pos2 == str.size()) {
4817  NCBI_THROW2(CStringException, eFormat,
4818  "Unterminated escape sequence", pos2);
4819  }
4820  switch (str[pos2]) {
4821  case 'a': out += '\a'; break;
4822  case 'b': out += '\b'; break;
4823  case 'f': out += '\f'; break;
4824  case 'n': out += '\n'; break;
4825  case 'r': out += '\r'; break;
4826  case 't': out += '\t'; break;
4827  case 'v': out += '\v'; break;
4828  case 'x':
4829  {{
4830  pos = ++pos2;
4831  while (pos < str.size()
4832  && isxdigit((unsigned char) str[pos])) {
4833  pos++;
4834  }
4835  if (pos > pos2) {
4836  SIZE_TYPE len = pos-pos2;
4837  if ((mode == eEscSeqRange_FirstByte) && (len > 2)) {
4838  // Take only 2 first hex-digits
4839  len = 2;
4840  pos = pos2 + 2;
4841  }
4842  unsigned int value =
4843  StringToUInt(CTempString(str, pos2, len), 0, 16);
4844  if ((mode != eEscSeqRange_Standard) && (value > 255)) {
4845  // eEscSeqRange_Standard -- by default
4846  switch (mode) {
4848  // Already have right value
4849  break;
4850  case eEscSeqRange_Throw:
4851  NCBI_THROW2(CStringException, eFormat,
4852  "Escape sequence '" + NStr::PrintableString(CTempString(str, pos2, len)) +
4853  "' is out of range [0-255]", pos2);
4854  break;
4855  case eEscSeqRange_Errno:
4856  CNcbiError::SetErrno(errno = ERANGE, str);
4857  is_error = true;
4858  continue;
4859  case eEscSeqRange_User:
4860  value = (unsigned)user_char;
4861  break;
4862  default:
4863  NCBI_THROW2(CStringException, eFormat, "Wrong set of flags", pos2);
4864  }
4865  }
4866  out += static_cast<char>(value);
4867  } else {
4868  NCBI_THROW2(CStringException, eFormat,
4869  "\\x followed by no hexadecimal digits", pos);
4870  }
4871  }}
4872  continue;
4873  case '0': case '1': case '2': case '3':
4874  case '4': case '5': case '6': case '7':
4875  {{
4876  pos = pos2;
4877  unsigned char c = (unsigned char)(str[pos++] - '0');
4878  while (pos < pos2 + 3 && pos < str.size()
4879  && str[pos] >= '0' && str[pos] <= '7') {
4880  c = (unsigned char)((c << 3) | (str[pos++] - '0'));
4881  }
4882  out += c;
4883  }}
4884  continue;
4885  case '\n':
4886  // quoted EOL means no EOL
4887  break;
4888  default:
4889  out += str[pos2];
4890  break;
4891  }
4892  pos = pos2 + 1;
4893  }
4894  if (mode == eEscSeqRange_Errno) {
4895  if (is_error) {
4896  return kEmptyStr;
4897  }
4898  errno = 0;
4899  }
4900  return out;
4901 }
4902 
4903 
4904 CTempString s_Unquote(const CTempString str, size_t* n_read)
4905 {
4906  const char* str_pos = str.data();
4907  char quote_char;
4908 
4909  if (str.empty() || ((quote_char = *str_pos) != '"' && quote_char != '\'')) {
4910  NCBI_THROW2(CStringException, eFormat,
4911  "The source string must start with a quote", 0);
4912  }
4913 
4914  const char* str_end = str_pos + str.length();
4915  bool escaped = false;
4916 
4917  while (++str_pos < str_end) {
4918  if (*str_pos == quote_char && !escaped) {
4919  size_t pos = str_pos - str.data();
4920  if (n_read != NULL)
4921  *n_read = pos + 1;
4922  return CTempString(str.data() + 1, pos - 1);
4923  } else {
4924  escaped = *str_pos == '\\' ? !escaped : false;
4925  }
4926  }
4928  "Unterminated quoted string", str.length());
4929 }
4930 
4931 
4932 string NStr::ParseQuoted(const CTempString str, size_t* n_read /*= NULL*/)
4933 {
4934  return ParseEscapes(s_Unquote(std::move(str), n_read));
4935 }
4936 
4937 
4938 // An adjusted copy-paste of NStr::ParseEscapes
4940 {
4941  string out;
4942  out.reserve(str.size()); // result string can only be smaller
4943  SIZE_TYPE pos = 0;
4944 
4945  while (pos < str.size()) {
4946  SIZE_TYPE pos2 = str.find('\\', pos);
4947  if (pos2 == NPOS) {
4948  //~ out += str.substr(pos);
4949  CTempString sub(str, pos);
4950  out += sub;
4951  break;
4952  }
4953  //~ out += str.substr(pos, pos2 - pos);
4954  CTempString sub(str, pos, pos2-pos);
4955  out += sub;
4956  if (++pos2 == str.size()) {
4957  NCBI_THROW2(CStringException, eFormat,
4958  "Unterminated escape sequence", pos2);
4959  }
4960  switch (str[pos2]) {
4961  case '"':
4962  case '\\':
4963  case '/': out += str[pos2]; break;
4964  case 'b': out += '\b'; break;
4965  case 'f': out += '\f'; break;
4966  case 'n': out += '\n'; break;
4967  case 'r': out += '\r'; break;
4968  case 't': out += '\t'; break;
4969  case 'u':
4970  pos = ++pos2;
4971  while (pos < str.size() && isxdigit((unsigned char) str[pos])) {
4972  pos++;
4973  }
4974  if (auto len = pos - pos2) {
4975  if (len < 4) {
4976  NCBI_THROW2(CStringException, eFormat, "Invalid JSON escape sequence", pos2);
4977  } else if (len > 4) {
4978  len = 4;
4979  pos = pos2 + 4;
4980  }
4981  unsigned int value = NStr::StringToUInt(CTempString(str, pos2, len), 0, 16);
4982  if (value > 0xff) {
4983  NCBI_THROW2(CStringException, eConvert,
4984  "Escaped UTF-8 characters after '\\u00ff' are not supported", pos2);
4985  }
4986  out += static_cast<char>(value);
4987  continue;
4988  } else {
4989  NCBI_THROW2(CStringException, eFormat, "\\u followed by no hexadecimal digits", pos);
4990  }
4991  default:
4992  NCBI_THROW2(CStringException, eFormat, "Invalid JSON escape sequence", pos2);
4993  }
4994  pos = pos2 + 1;
4995  }
4996  return out;
4997 }
4998 
4999 
5000 string NStr::JsonDecode(const CTempString str, size_t* n_read /*= NULL*/)
5001 {
5002  return s_ParseJsonEncodeEscapes(s_Unquote(std::move(str), n_read));
5004 
5005 
5006 // Determines the end of an HTML <...> tag, accounting for attributes
5007 // and comments (the latter allowed only within <!...>).
5008 static SIZE_TYPE s_EndOfTag(const string& str, SIZE_TYPE start)
5009 {
5010  _ASSERT(start < str.size() && str[start] == '<');
5011  bool comments_ok = (start + 1 < str.size() && str[start + 1] == '!');
5012  for (SIZE_TYPE pos = start + 1; pos < str.size(); ++pos) {
5013  switch (str[pos]) {
5014  case '>': // found the end
5015  return pos;
5016 
5017  case '\"': // start of "string"; advance to end
5018  pos = str.find('\"', pos + 1);
5019  if (pos == NPOS) {
5020  NCBI_THROW2(CStringException, eFormat,
5021  "Unclosed string in HTML tag", start);
5022  // return pos;
5023  }
5024  break;
5025 
5026  case '-': // possible start of -- comment --; advance to end
5027  if (comments_ok && pos + 1 < str.size()
5028  && str[pos + 1] == '-') {
5029  pos = str.find("--", pos + 2);
5030  if (pos == NPOS) {
5031  NCBI_THROW2(CStringException, eFormat,
5032  "Unclosed comment in HTML tag", start);
5033  // return pos;
5034  } else {
5035  ++pos;
5036  }
5037  }
5038  }
5039  }
5040  NCBI_THROW2(CStringException, eFormat, "Unclosed HTML tag", start);
5041  // return NPOS;
5043 
5044 
5045 // Determines the end of an HTML &foo; character/entity reference
5046 // (which might not actually end with a semicolon :-/ , but we ignore that case)
5047 static SIZE_TYPE s_EndOfReference(const string& str, SIZE_TYPE start)
5048 {
5049  _ASSERT(start < str.size() && str[start] == '&');
5050 
5051  SIZE_TYPE pos = str.find_first_not_of
5052  ("#0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
5053  start + 1);
5054  if (pos != NPOS && str[pos] == ';') {
5055  // found terminating semicolon, so it's valid, and we return that
5056  return pos;
5057  } else {
5058  // We consider it just a '&' by itself since it's invalid
5059  return start;
5060  }
5061 }
5062 
5063 
5064 static SIZE_TYPE s_VisibleHtmlWidth(const string& str)
5065 {
5066  SIZE_TYPE width = 0, pos = 0;
5067  for (;;) {
5068  SIZE_TYPE pos2 = str.find_first_of("<&", pos);
5069  if (pos2 == NPOS) {
5070  width += str.size() - pos;
5071  break;
5072  } else {
5073  width += pos2 - pos;
5074  if (str[pos2] == '&') {
5075  ++width;
5076  pos = s_EndOfReference(str, pos);
5077  } else {
5078  pos = s_EndOfTag(str, pos);
5079  }
5080  if (pos == NPOS) {
5081  break;
5082  } else {
5083  ++pos;
5084  }
5085  }
5086  }
5087  return width;
5088 }
5089 
// Locale-independent whitespace test: true for the ASCII characters
// 0x09-0x0D (TAB, LF, VT, FF, CR) and 0x20 (space), false otherwise.
static
inline bool _isspace(unsigned char c)
{
    return ((c>=0x09 && c<=0x0D) || (c==0x20));
}
5095 
5096 template<typename _D>
5097 void NStr::WrapIt(const string& str, SIZE_TYPE width,
5098  _D& dest, TWrapFlags flags,
5099  const string* prefix,
5100  const string* prefix1)
5101 {
5102  if (prefix == 0) {
5103  prefix = &kEmptyStr;
5104  }
5105 
5106  if (prefix1 == 0)
5107  prefix1 = prefix;
5108 
5109  SIZE_TYPE pos = 0, len = str.size(), nl_pos = 0;
5110 
5111  const bool is_html = flags & fWrap_HTMLPre ? true : false;
5112  const bool do_flat = (flags & fWrap_FlatFile) != 0;
5113  string temp_back; temp_back.reserve(width);
5114 
5115  enum EScore { // worst to best
5116  eForced,
5117  ePunct,
5118  eComma,
5119  eSpace,
5120  eNewline
5121  };
5122 
5123  // To avoid copying parts of str when we need to store a
5124  // substr of str, we store the substr as a pair
5125  // representing start (inclusive) and end (exclusive).
5126  typedef pair<SIZE_TYPE, SIZE_TYPE> TWrapSubstr;
5127 
5128  // This variable is used for HTML links that cross line boundaries.
5129  // Since it's aesthetically displeasing for a link to cross a boundary, we
5130  // close it at the end of each line and re-open it after the next line's
5131  // prefix
5132  // (This is needed in, e.g. AE017351)
5133  TWrapSubstr best_link(0, 0); // last link found before current best_pos
5134  TWrapSubstr latest_link(0, 0); // last link found at all
5135 
5136  while (pos < len) {
5137  bool hyphen = false; // "-" or empty
5138  SIZE_TYPE column = is_html ? s_VisibleHtmlWidth(*prefix1) : prefix1->size();
5139  SIZE_TYPE column0 = column;
5140  // the next line will start at best_pos
5141  SIZE_TYPE best_pos = NPOS;
5142  EScore best_score = eForced;
5143 
5144  // certain logic can be skipped if this part has no backspace,
5145  // which is, by far, the most common case
5146  bool thisPartHasBackspace = false;
5147 
5148  temp_back = *prefix1;
5149 
5150  // append any still-open links from previous lines
5151  if (is_html && best_link.second != 0) {
5152  temp_back.append(
5153  str.begin() + best_link.first,
5154  str.begin() + best_link.second);
5155  }
5156 
5157  SIZE_TYPE pos0 = pos;
5158 
5159  // we can't do this in HTML mode because we might have to deal with
5160  // link tags that go across lines.
5161  if (!is_html) {
5162  if (nl_pos <= pos) {
5163  nl_pos = str.find('\n', pos);
5164  if (nl_pos == NPOS) {
5165  nl_pos = len;
5166  }
5167  }
5168  if (column + (nl_pos - pos) <= width) {
5169  pos0 = nl_pos;
5170  }
5171  }
5172 
5173  for (SIZE_TYPE pos2 = pos0; pos2 < len && column <= width;
5174  ++pos2, ++column) {
5175  EScore score = eForced;
5176  SIZE_TYPE score_pos = pos2;
5177  const char c = str[pos2];
5178 
5179  if (c == '\n') {
5180  best_pos = pos2;
5181  best_score = eNewline;
5182  best_link = latest_link;
5183  break;
5184  }
5185  else if (_isspace((unsigned char)c)) {
5186  if (!do_flat && pos2 > 0 &&
5187  _isspace((unsigned char)str[pos2 - 1])) {
518