NCBI C++ ToolKit
ncbistr_util.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef CORELIB___STR_UTIL__HPP
2 #define CORELIB___STR_UTIL__HPP
3 
4 /* $Id: ncbistr_util.hpp 82095 2018-05-01 14:45:23Z ivanov $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Eugene Vasilchenko, Aaron Ucko, Denis Vakatov, Anatoliy Kuznetsov
30  *
31  * File Description:
32  * String algorithms
33  *
34  */
35 
36 /// @file ncbistr_util.hpp
37 /// Algorithms for string processing
38 
39 #include <corelib/ncbistr.hpp>
40 #include <corelib/ncbiexpt.hpp>
41 
42 
44 
45 
46 /** @addtogroup String
47  *
48  * @{
49  */
50 
51 class CStrTokenizeBase;
52 
53 /// Do-nothing token position container
54 ///
56 {
57  void push_back(string::size_type /*pos*/) {}
58  void reserve(string::size_type) {}
59 };
60 
61 /// Do nothing token counter
62 ///
64 {
65  static
66  size_t Count(CStrTokenizeBase& /*tokenizer*/)
67  {
68  return 0;
69  }
70 };
71 
72 /// Do nothing target reservation trait
73 ///
74 /// applies for list<> or cases when reservation only makes things slower
75 /// (may be the case if we use deque<> as a target container and tokenize
76 /// large text)
77 ///
78 template<class TV, class TP>
80 {
81  static
82  void Reserve(CStrTokenizeBase& /*tokenizer*/,
83  TV& /*target*/,
84  TP& /*token_pos*/)
85  {}
86 };
87 
88 
89 /// Singly-linked list of substrings that will constitute a single
90 /// Split/Tokenize piece, optimized for the typical one-node case.
92 {
93 public:
95  : m_LastNode(NULL), m_Storage(storage) { }
96 
97  void Add(const CTempString& s);
98  void Clear(void);
99  void Join(string* s) const;
100  void Join(CTempString* s) const;
101  void Join(CTempStringEx* s) const;
102  size_t GetSize(void) const;
103 
104 private:
105  struct SNode
106  {
107  SNode() { }
108  SNode(const CTempString& s) : str(s) { }
109 
111  unique_ptr<SNode> next;
112  };
113 
117 };
118 
119 inline
121 {
122  if (m_LastNode == NULL) {
123  m_FirstNode.str = s;
125  } else {
126  m_LastNode->next.reset(new SNode(s));
127  m_LastNode = m_LastNode->next.get();
128  }
129 }
130 
131 inline
133 {
135  m_FirstNode.next.reset();
136  m_LastNode = NULL;
137 }
138 
140 {
141 public:
143 
144  CStrTokenizeBase(const CTempString& str, const CTempString& delim,
145  TFlags flags, CTempString_Storage* storage);
146 
147  SIZE_TYPE GetPos(void) const { return m_Pos; }
148  bool AtEnd (void) const { return m_Pos == NPOS; }
149 
150  /// Return TRUE if it found some text and put it into collector.
151  /// The collector can have empty strings, even if function returns FALSE.
152  /// @note
153  /// This method don't honor NStr::fSplit_Truncate_Begin flag,
154  /// due it's stream nature, so you should process it in the calling code.
155  /// NStr::fSplit_Truncate_Begin is honored.
156  bool Advance(CTempStringList* part_collector) {
157  return Advance(part_collector, NULL, NULL);
158  };
159  bool Advance(CTempStringList* part_collector,
160  SIZE_TYPE* ptr_part_start /* out */,
161  SIZE_TYPE* ptr_delim_pos /* out */);
162 
163  /// Skip all delimiters starting from current position.
164  void SkipDelims(void) { x_SkipDelims(true); }
165 
166  /// Assumes that we already have a delimiter on the previous
167  /// position, so just skip all subsequent, depending on flags.
168  void MergeDelims(void) { x_SkipDelims(false); }
169 
170  // Set new delimiters.
171  void SetDelim(const CTempString& delim);
172 
173 protected:
179 
180 private:
181  void x_ExtendInternalDelim();
182  void x_SkipDelims(bool force_skip);
183 
186 };
187 
188 inline
190  const CTempString& delim,
191  TFlags flags,
192  CTempString_Storage* storage)
193  : m_Str(str), m_Pos(0), m_Flags(flags), m_Storage(storage)
194 {
195  SetDelim(delim);
196 }
197 
198 inline
200 {
201  m_Delim = delim;
202 
203  if ((m_Flags & NStr::fSplit_ByPattern) == 0) {
205  } else {
207  }
210  }
211 }
212 
213 /// Main tokenization algorithm
214 ///
215 /// TStr - string type (must follow std::string interface)
216 /// TV - container type for tokens (std::vector)
217 /// TP - target container type for token positions (vector<size_t>)
218 /// TCount - token count trait for the target container space reservation
219 /// TReserve- target space reservation trait
220 ///
221 template<class TStr,
222  class TV,
223  class TP = CStrDummyTokenPos,
224  class TCount = CStrDummyTokenCount,
225  class TReserve = CStrDummyTargetReserve<TV, TP> >
227 {
228 public:
229  typedef TStr TString;
230  typedef TV TContainer;
231  typedef TP TPosContainer;
232  typedef TCount TCountTrait;
233  typedef TReserve TReserveTrait;
234 
235  /// Constructor
236  ///
237  /// @param str
238  /// String to be tokenized.
239  /// @param delim
240  /// Set of char delimiters used to tokenize string "str".
241  /// If delimiter is empty, then input string is appended to "arr" as is.
242  /// @param flags
243  /// Flags governing splitting.
244  /// Without NStr::fSplit_MergeDelimiters, delimiters that immediately follow
245  /// each other are treated as separate delimiters - empty string(s)
246  /// appear in the target output.
247  ///
248  CStrTokenize(const TString& str, const TString& delim, TFlags flags, CTempString_Storage* storage)
249  : CStrTokenizeBase(str, delim, flags, storage)
250  { }
251 
252  /// Tokenize the string using the specified set of char delimiters.
253  ///
254  /// @param target
255  /// Output container.
256  /// Tokens defined in "str" by using symbols from "delim" are added
257  /// to the list "target".
258  /// @param token_pos
259  /// Container for the tokens' positions in "str".
260  /// @param empty_str
261  /// String added to target when there are no other characters between
262  /// delimiters
263  ///
264  void Do(TContainer& target,
265  TPosContainer& token_pos,
266  const TString& empty_str = TString())
267  {
268  auto target_initial_size = target.size();
269 
270  // Special cases
271  if (m_Str.empty()) {
272  return;
273  } else if (m_Delim.empty()) {
274  target.push_back(m_Str);
275  token_pos.push_back(0);
276  return;
277  }
278  // Do target space reservation (if applicable)
279  TReserveTrait::Reserve(*this, target, token_pos);
280 
281  // Tokenization
282 
283  CTempStringList part_collector(m_Storage);
284  SIZE_TYPE prev_pos;
285  SIZE_TYPE delim_pos = NPOS;
286  m_Pos = 0;
287  do {
288  Advance(&part_collector, &prev_pos, &delim_pos);
289  target.push_back(empty_str); // reserve space for value added next line
290  part_collector.Join(&target.back());
291  part_collector.Clear();
292  token_pos.push_back(prev_pos);
293  } while ( !AtEnd() );
294 
295  if ( (m_Flags & NStr::fSplit_Truncate_End) == 0 ) {
296  // account training delimiter
297  if (delim_pos != NPOS) {
298  // add empty token after last delimiter
299  target.push_back(empty_str);
300  token_pos.push_back(delim_pos + 1);
301  }
302  }
303  else {
304  // truncate trailing delimiters
305 
306  // number of newly added items
307  SIZE_TYPE num_new = target.size() - target_initial_size;
308  // number of empty trailing items to delete
309  SIZE_TYPE num_del = 0;
310  for (auto i = target.rbegin(); i != target.rend() && num_new--; ++i) {
311  if (!i->empty()) {
312  break;
313  }
314  num_del++;
315  }
316  if (num_del) {
317  target.resize(target.size()-num_del);
318  token_pos.resize(token_pos.size()-num_del);
319  }
320  }
321  }
322 };
323 
324 
325 /// token count trait for std::string
326 ///
328 {
329  static
330  size_t Count(CStrTokenizeBase& tokenizer)
331  {
332  size_t tokens = 0;
333 
334  do {
335  if (tokenizer.Advance(NULL, NULL, NULL)) {
336  ++tokens;
337  }
338  } while ( !tokenizer.AtEnd() );
339 
340  return tokens;
341  }
342 };
343 
344 
345 /// Target reservation trait (applies for vector<>)
346 ///
347 template<class TV, class TP, class TCount>
349 {
350  static
351  void Reserve(CStrTokenizeBase& tokenizer,
352  TV& target,
353  TP& token_pos)
354  {
355  // Reserve vector size only for empty vectors. For vectors which
356  // already have items this has been known to work more slowly.
357  if ( target.empty() ) {
358  size_t tokens = TCount::Count(tokenizer);
359  if (tokens) {
360  token_pos.reserve(tokens);
361  target.reserve(tokens);
362  }
363  }
364  }
365 };
366 
367 
368 /// Adapter for token position container pointer(NULL legal)
369 /// Makes pointer to a container look as a legal container
370 ///
371 template<class TPosContainer>
373 {
374 public:
375  /// If token_pos construction parameter is NULL all calls are ignored
376  CStrTokenPosAdapter(TPosContainer* token_pos)
377  : m_TokenPos(token_pos)
378  {}
379 
380  void push_back(string::size_type pos)
381  {
382  if (m_TokenPos) m_TokenPos->push_back(pos);
383  }
384  void reserve(string::size_type capacity)
385  {
386  if (m_TokenPos) m_TokenPos->reserve(capacity);
387  }
388  string::size_type size()
389  {
390  return m_TokenPos ? m_TokenPos->size() : 0;
391  }
392  void resize(string::size_type newsize)
393  {
394  if (m_TokenPos) m_TokenPos->resize(newsize);
395  }
396 private:
397  TPosContainer* m_TokenPos;
398 };
399 
400 
401 
402  /*
403  * @}
404  */
405 
406 /////////////////////////////////////////////////////////////////////////////
407 
408 
410 
411 #endif // CORELIB___STR_UTIL__HPP
Adapter for token position container pointer(NULL legal) Makes pointer to a container look as a legal...
Main tokenization algorithm.
Singly-linked list of substrings that will constitute a single Split/Tokenize piece,...
Helper class to allocate memory for CTempString[Ex] on demand in the functions which need to modify t...
Definition: tempstr.hpp:1051
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
The NCBI C++ standard methods for dealing with std::string.
static uch flags
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CTempString m_Delim
CTempStringEx m_InternalDelim
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static void Reserve(CStrTokenizeBase &tokenizer, TV &target, TP &token_pos)
const CTempString & m_Str
TReserve TReserveTrait
void push_back(string::size_type pos)
bool AtEnd(void) const
void reserve(string::size_type)
CTempStringList(CTempString_Storage *storage)
CStrTokenizeBase(const CTempString &str, const CTempString &delim, TFlags flags, CTempString_Storage *storage)
void x_ExtendInternalDelim()
Definition: ncbistr.cpp:7546
#define NPOS
Definition: ncbistr.hpp:133
static size_t Count(CStrTokenizeBase &tokenizer)
void x_SkipDelims(bool force_skip)
Definition: ncbistr.cpp:7525
NStr::TSplitFlags TFlags
void resize(string::size_type newsize)
CTempString_Storage * m_Storage
static size_t Count(CStrTokenizeBase &)
size_t GetSize(void) const
Definition: ncbistr.cpp:7383
SNode(const CTempString &s)
int TSplitFlags
Bitwise OR of ESplitFlags.
Definition: ncbistr.hpp:2510
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
CStrTokenize(const TString &str, const TString &delim, TFlags flags, CTempString_Storage *storage)
Constructor.
void reserve(string::size_type capacity)
void SetDelim(const CTempString &delim)
static void Reserve(CStrTokenizeBase &, TV &, TP &)
bool Advance(CTempStringList *part_collector)
Return TRUE if it found some text and put it into collector.
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
void MergeDelims(void)
Assumes that we already have a delimiter on the previous position, so just skip all subsequent,...
CTempString_Storage * m_Storage
void Clear(void)
void Join(string *s) const
Definition: ncbistr.cpp:7341
void Do(TContainer &target, TPosContainer &token_pos, const TString &empty_str=TString())
Tokenize the string using the specified set of char delimiters.
TPosContainer * m_TokenPos
void Add(const CTempString &s)
void push_back(string::size_type)
TCount TCountTrait
string::size_type size()
void SkipDelims(void)
Skip all delimiters starting from current position.
CTempStringEx & assign(const char *str, size_type len)
Assign new values to the content of the a string.
Definition: tempstr.hpp:970
CTempString_Storage m_DelimStorage
SIZE_TYPE GetPos(void) const
CStrTokenPosAdapter(TPosContainer *token_pos)
If token_pos construction parameter is NULL all calls are ignored.
unique_ptr< SNode > next
@ fSplit_CanEscape
Allow \... escaping.
Definition: ncbistr.hpp:2503
@ fSplit_Truncate_End
Truncate trailing delimiters.
Definition: ncbistr.hpp:2500
@ fSplit_CanQuote
Definition: ncbistr.hpp:2506
@ fSplit_ByPattern
Require full delimiter strings.
Definition: ncbistr.hpp:2502
int i
Defines NCBI C++ exception handling.
static const char * str(char *buf, int n)
Definition: stats.c:84
Do nothing target reservation trait.
Do nothing token counter.
Do-nothing token position container.
Target reservation trait (applies for vector<>)
token count trait for std::string
Modified on Fri Dec 01 04:45:25 2023 by modify_doxy.py rev. 669887