85 for (
int i = 0;
i < 256; ++
i) {
107 string clause_ends(
".?!;:\"{}[]()");
108 ITERATE (
string, it, clause_ends) {
139 bool is_alpha =
false;
162 for ( ;
i < s.size(); ++
i) {
167 return (
i == s.size() ? string::npos :
i);
173 for ( ;
i < s.size(); ++
i) {
178 return (
i == s.size() ? string::npos :
i);
184 for ( ;
i < s.size(); ++
i) {
189 return (
i == s.size() ? string::npos :
i);
201 if (iter->first[0] ==
'p' && iter->first.find(
"phrase: ") == 0) {
202 phrase_out.
insert(phrase_out.
end(), *iter);
218 freq.
Add(iter->first, iter->second);
224 const string& prefix,
228 if (iter->first.find_first_of(
":") != string::npos) {
234 freq.
Add(prefix +
": " + iter->first, iter->second);
340 _TRACE(
"CTextUtil::GetWordFrequencies(): text = " <<
text);
341 string::size_type clause_start = 0;
342 string::size_type clause_end =
text.size();
344 list<string> prev_words;
350 while (clause_start != clause_end) {
351 clause_end =
text.size();
353 if (pos != string::npos) {
358 _TRACE(
"clause: |" <<
text.substr(clause_start, clause_end - clause_start) <<
"|");
359 for ( ; clause_start != clause_end; clause_start = pos) {
363 if (clause_start == clause_end) {
368 pos =
min(clause_end,
376 word.assign(
text, clause_start, pos - clause_start);
379 string::size_type pos1 =
380 word.find_first_not_of(
"0123456789");
381 if (pos1 == string::npos) {
390 string::iterator copy_to = word.begin();
392 if (*copy_from ==
'\'') {
397 if (copy_to != word.end()) {
398 word.erase(copy_to, word.end());
409 _TRACE(
" word: " << word);
413 typedef pair<string, string> TDiphPair;
414 static const TDiphPair sc_DiphPairs[] = {
415 TDiphPair(
"oe",
"e"),
420 for (
size_t i = 0;
i <
sizeof(sc_DiphPairs) /
sizeof(TDiphPair); ++
i) {
421 if (word.find(sc_DiphPairs[
i].first) != string::npos) {
424 sc_DiphPairs[
i].second,
441 prev_words.push_back(word);
444 prev_words.push_back(stem);
449 while (prev_words.size() > 3) {
450 prev_words.pop_front();
452 if (prev_words.size() > 1) {
453 list<string>::iterator pit = prev_words.begin();
454 list<string>::iterator end = prev_words.end();
456 for ( ; pit != end; ++pit) {
476 _TRACE(
" phrase: |" << phrase <<
"|");
495 if (clause_start == string::npos) {
502 _TRACE(
" word: " << it->first <<
" count: " << it->second);
514 if (iter->first.find_first_of(
":") != string::npos) {
521 if (it != stem_freq.
end()) {
522 it->second += iter->second;
655 return (iter != sc_StopWords.end());
668 for ( ; stop_it != stop_end && it != end; ) {
669 if (it->first == *stop_it) {
673 if (it->first < *stop_it) {
685 string::size_type pos = 0;
686 while ( (pos = title.find_first_of(
".,[](){};:'\"/?<>", pos)) != string::npos) {
695 vector<unsigned char>&
data)
716 const vector<unsigned char>&
data)
723 const vector<char>&
data)
737 const void*
data,
size_t data_len)
static void Stem(const string &in_str, string *out_str)
Compute the Porter stem for a given word.
iterator find(const Key &key)
pair< iterator, bool > insert(const value_type &val)
void Add(Key idx, Score weight=Score(1))
TVector::value_type value_type
TVector::iterator iterator
Reallocable memory buffer (no memory copy overhead) Mimics vector<>, without the overhead of explicit...
TBase::const_iterator const_iterator
static void GetStemFrequencies(const TWordFreq &freq, TWordFreq &stems, TFlags flags=fDefaults)
retrieve stem frequencies from a set of word frequencies
static bool IsStopWord(const string &str)
return true if the provided word is a stop word
static void TrimStopWords(TWordFreq &freq)
eliminate the stop words from a set of word frequencies
static void CleanJournalTitle(string &title)
perform a set of punctuational clean-ups on a string suitable for a journal or book title
static void SplitWordFrequencies(const TWordFreq &wf_in, TWordFreq &wf_out, TWordFreq &phrase_out)
split a set of word frequencies into phrase and non-phrase frequencies this is done to treat the two ...
static void EncodeFreqs(const TWordFreq &freq, vector< char > &data)
static void AddWordFrequencies(TWordFreq &freq, const TWordFreq &wf, TFlags flags=0)
add a set of frequencies into another set
static void GetWordFrequencies(const string &text, TWordFreq &freq, TFlags flags=fDefaults)
retrieve word frequencies for a given piece of text
static void DecodeFreqs(TWordFreq &freq, const vector< char > &data)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
double wf(double lambda, double D_LR, double D_LU, double D_LD, double D_RU, double D_RD, double D_DU)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
uint16_t Uint2
2-byte (16-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static string Int8ToString(Int8 value, TNumToStringFlags flags=0, int base=10)
Convert Int8 to string.
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
unsigned int
A callback function used to compare two keys in a database.
static void text(MDB_val *v)
CStaticArraySet< const char *, PCase_CStr > TStopWords
static string::size_type s_NextClauseStop(const string &s, string::size_type i)
static bool s_IsNumeric(unsigned char c)
static SLoadTokens s_ForceTokenLoad
void s_NumericToFreq(const T &val, CTextUtil::TWordFreq &freq)
static const char *const sc_StopWordArray[]
Stop Word Pruning.
static bool s_IsAlphaNumeric(unsigned char c)
string s_ValToString(Int4 i)
static string::size_type s_NextTokenStart(const string &s, string::size_type i)
static char s_ToLower(unsigned char c)
DEFINE_STATIC_ARRAY_MAP(TStopWords, sc_StopWords, sc_StopWordArray)
static string::size_type s_NextTokenStop(const string &s, string::size_type i)
static Uint2 sc_Tokens[256]
void Encode(const CRawScoreVector< Key, Score > &, vector< char > &)
void Decode(const vector< char > &, CRawScoreVector< Key, Score > &)