64 bool x_IsWordCharacter(
char c) {
65 return (c >=
'0' && c <=
'9') || (c >=
'a' && c <=
'z') || (c >=
'A' && c <=
'Z') || c ==
'_' || c ==
'-';
68 void x_Split(
const string& s, vector<string>& v)
72 for (
i = 0;
i < s.length();
i++) {
75 v.push_back(s.substr(
n,
i -
n));
79 else if (x_IsWordCharacter(s[
i])) {
83 else if (!x_IsWordCharacter(s[
n])) {
84 v.push_back(s.substr(
n,
i -
n));
92 else if (x_IsWordCharacter(s[
n])) {
93 v.push_back(s.substr(
n,
i -
n));
99 v.push_back(s.substr(
n,
i -
n));
103 string x_Assemble(vector<string>& v, vector<bool>& skip)
107 for (
size_t i = 0;
i < v.size();
i++) {
109 if (!
first && x_IsWordCharacter(v[
i][0])) {
126 if (!strip_space && !strip_punct)
129 bool has_stripped =
false;
131 const char* s =
str.data();
132 for (;
i <
str.size();
i++, s++)
138 storage.reserve(
str.size()-1);
140 storage.append(
str.data(),
i);
147 storage.push_back(*s);
158 bool x_DisallowCharacter(
const char ch,
bool disallow_slash)
161 else if (disallow_slash && ch ==
'/')
return true;
171 const auto& callback = [&](
size_t n,
size_t p) {
186 #include "weasel.inc"
188 static const TLocalFSM s_FSM{s_compact, s_hits_init_1, s_hits_init_2, s_states,
nullptr};
232 for (
unsigned i=0;
i<
match.size();
i++) {
264 while (it !=
str.end() && !
isalpha((
unsigned char) (*it))) {
265 if (
isdigit( (
unsigned char) (*it))) {
271 if (it !=
str.end()) {
272 return isalpha((
unsigned char) (*it)) &&
isupper((
unsigned char) (*it));
282 for (
size_t i = 0;
i <
str.size() && rval; ++
i) {
288 }
else if (
str[
i] ==
'-' ){
310 else if (start.
empty() || found == string::npos) {
315 if (x_DisallowCharacter (start[found-1], disallow_slash)) {
319 after_idx = found + match_len;
320 if (after_idx < start.
size() && x_DisallowCharacter(start[after_idx], disallow_slash)) {
333 vector<size_t> match_lens = (*word)->GetMatchLens(
str, pattern, prev_char);
334 if (match_lens.size() > 0) {
335 size_t word_len = (*word)->GetWord().length();
337 size_t this_match = 0;
338 char this_prev_char = 0;
340 this_prev_char =
str.c_str()[(*len) - 1];
342 this_prev_char = prev_char;
344 bool require_end =
false;
349 (!require_end || this_match ==
str.substr(*len).length())) {
351 match_len += this_match;
359 if (pattern.length() == 0) {
363 if (
str.length() == 0) {
390 if (
str[0] == pattern[0]) {
406 size_t match_len = 0;
408 if (ini_target_match_len !=
NULL) {
409 *ini_target_match_len = match_len;
422 size_t match_len = 0;
433 while (!rval && pos <
len) {
438 size_t sub_match_len = 0;
520 return search == pattern;
543 if (
str.original().original().empty()) {
569 cout <<
"eString_location_inlist is not supported!\n";
585 cout << pattern <<
" <===> " << search <<
"\nSelf-weasel case with ignored words is not supported!\n";
591 string s_search, p_search;
604 vector<bool> skip(v.size(),
false);
606 for (
size_t i = 0;
i < v.size();
i++) {
608 unsigned m = (1 << k);
623 string guess = x_Assemble(v, skip);
628 for (
size_t i = 0;
i <
test.size();
i++) {
630 skip[
test[
i]] =
false;
634 skip[
test[
i]] =
true;
635 if (
i ==
test.size() - 1) {
658 size_t match_len = 0;
664 offset += replace.length();
684 }
else if (
Empty()) {
697 size_t match_len = 0;
700 result.append(
val.data()+match_len,
val.length()-match_len);
713 size_t match_len = 0;
static constexpr auto s_WeaselWords
User-defined methods of the data storage class.
const string & uppercase() const
const string & original() const
const string & lowercase() const
CTempString GetNoweaselLC() const
CTempString GetNoweasel() const
CTempString GetNoweaselUC() const
CTempString::size_type m_noweasel_start
CAutoLowerCase m_original
unsigned GetWeaselMask() const
const CAutoLowerCase & original() const
void Search(const char *input, VoidCall1 found_callback) const
CTempString x_GetCompareString(const CMatchString &s, ECase e_case=e_automatic) const
bool x_IsAllSkippable(const CTempString &str) const
bool x_IsWholeWordMatch(const CTempString &start, size_t found, size_t match_len, bool disallow_slash=false) const
CTempString x_GetConstraintString(ECase e_case=e_automatic) const
bool x_MatchFound(CTempString &search, CTempString &pattern) const
bool x_IsFirstCap(const CMatchString &str) const
bool ReplaceStringConstraintPortionInString(string &result, const CMatchString &str, const string &replace) const
bool x_IsAllLowerCase(const CMatchString &str) const
bool x_IsAllPunctuation(const CMatchString &str) const
virtual ~CString_constraint()
bool x_ReplaceContains(string &val, const string &replace) const
bool Match(const CMatchString &str) const
bool x_DoesSingleStringMatchConstraint(const CMatchString &str) const
bool x_IsAllCaps(const CMatchString &str) const
bool x_IsFirstEachCap(const CMatchString &str) const
bool x_AdvancedStringCompare(const string &str, const string &str_match, const char prev_char, size_t *ini_target_match_len=0) const
bool x_IsSkippable(const char ch) const
bool x_AdvancedStringMatch(const string &str, const string &tmp_match) const
bool x_PartialCompare(const string &str, const string &pattern, char prev_char, size_t &match_len) const
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
#define test(a, b, c, d, e)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
uint8_t Uint1
1-byte (8-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
const char * const_iterator
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
size_type size(void) const
Return the length of the represented array.
static string & ToLower(string &str)
Convert string to lower case – string& version.
TCase_sensitive GetCase_sensitive(void) const
Get the Case_sensitive member data.
const TIgnore_words & GetIgnore_words(void) const
Get the Ignore_words member data.
bool IsSetMatch_location(void) const
Check if a value has been assigned to Match_location data member.
TIs_first_each_cap GetIs_first_each_cap(void) const
Get the Is_first_each_cap member data.
TIgnore_space GetIgnore_space(void) const
Get the Ignore_space member data.
TMatch_location GetMatch_location(void) const
Get the Match_location member data.
bool IsSetCase_sensitive(void) const
Check if a value has been assigned to Case_sensitive data member.
TIs_all_caps GetIs_all_caps(void) const
Get the Is_all_caps member data.
TIs_first_cap GetIs_first_cap(void) const
Get the Is_first_cap member data.
bool IsSetNot_present(void) const
Check if a value has been assigned to Not_present data member.
TWhole_word GetWhole_word(void) const
Get the Whole_word member data.
TIgnore_weasel GetIgnore_weasel(void) const
Get the Ignore_weasel member data.
TIgnore_punct GetIgnore_punct(void) const
Get the Ignore_punct member data.
EString_location
simple constraints
const TMatch_text & GetMatch_text(void) const
Get the Match_text member data.
TNot_present GetNot_present(void) const
Get the Not_present member data.
list< CRef< CWord_substitution > > Tdata
TIs_all_punct GetIs_all_punct(void) const
Get the Is_all_punct member data.
TIs_all_lower GetIs_all_lower(void) const
Get the Is_all_lower member data.
bool CanGetMatch_text(void) const
Check if it is safe to call GetMatch_text method.
bool CanGetIgnore_words(void) const
Check if it is safe to call GetIgnore_words method.
bool IsSetIgnore_words(void) const
Check if a value has been assigned to Ignore_words data member.
@ eString_location_inlist
@ eString_location_equals
@ eString_location_contains
@ eString_location_starts
const TYPE & Get(const CNamedParameterList *param)
constexpr bool empty(list< Ts... >) noexcept
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)