NCBI C++ ToolKit
|
Search Toolkit Book for CTextUtil
#include <algo/text/text_util.hpp>
Public Types | |
enum | EOptions { fDiphthongReplace = 0x01 , fPorterStem = 0x02 , fTrimStops = 0x04 , fNoNumeric = 0x08 , fIncludePhrases = 0x10 , fPhrase_NoStems = 0x20 , fPhrase_NoPrefix = 0x40 , fDefaults } |
typedef int | TFlags |
typedef CScoreVector< string, float > | TWordFreq |
typedef for word frequencies More... | |
Static Public Member Functions | |
static void | GetWordFrequencies (const string &text, TWordFreq &freq, TFlags flags=fDefaults) |
retrieve word frequencies for a given piece of text More... | |
static void | GetWordFrequencies (CNcbiIstream &istr, TWordFreq &freq, TFlags flags=fDefaults) |
retrieve word frequencies from a file More... | |
static void | GetWordFrequencies (Int4 i, TWordFreq &freq) |
convert an integer into a set of word frequencies; this maps ints to smaller strings to compress the dictionary More... | |
static void | GetWordFrequencies (Uint4 i, TWordFreq &freq) |
static void | GetWordFrequencies (Int8 i, TWordFreq &freq) |
static void | GetWordFrequencies (Uint8 i, TWordFreq &freq) |
static void | GetWordFrequencies (float i, TWordFreq &freq) |
static void | GetWordFrequencies (double i, TWordFreq &freq) |
static void | SplitWordFrequencies (const TWordFreq &wf_in, TWordFreq &wf_out, TWordFreq &phrase_out) |
split a set of word frequencies into phrase and non-phrase frequencies; this is done to treat the two separately More... | |
static void | GetStemFrequencies (const TWordFreq &freq, TWordFreq &stems, TFlags flags=fDefaults) |
retrieve stem frequencies from a set of word frequencies More... | |
static void | AddWordFrequencies (TWordFreq &freq, const TWordFreq &wf, TFlags flags=0) |
add a set of frequencies into another set More... | |
static void | AddWordFrequencies (TWordFreq &freq, const TWordFreq &wf, const string &prefix, TFlags flags=0) |
static bool | IsStopWord (const string &str) |
return true if the provided word is a stop word More... | |
static void | TrimStopWords (TWordFreq &freq) |
eliminate the stop words from a set of word frequencies More... | |
static void | EncodeFreqs (const TWordFreq &freq, vector< char > &data) |
static void | EncodeFreqs (const TWordFreq &freq, vector< unsigned char > &data) |
encode word frequencies in a serializable blob of data More... | |
static void | EncodeFreqs (const TWordFreq &freq, CSimpleBuffer &data) |
static void | DecodeFreqs (TWordFreq &freq, const vector< char > &data) |
static void | DecodeFreqs (TWordFreq &freq, const vector< unsigned char > &data) |
decode from a serializable blob of data More... | |
static void | DecodeFreqs (TWordFreq &freq, const CSimpleBuffer &data) |
static void | DecodeFreqs (TWordFreq &freq, const void *data, size_t data_len) |
static void | CleanJournalTitle (string &title) |
perform a set of punctuation clean-ups on a string, suitable for a journal or book title More... | |
Definition at line 117 of file text_util.hpp.
typedef int CTextUtil::TFlags |
Definition at line 133 of file text_util.hpp.
typedef CScoreVector<string, float> CTextUtil::TWordFreq |
typedef for word frequencies
Definition at line 136 of file text_util.hpp.
enum CTextUtil::EOptions |
Enumerator | |
---|---|
fDiphthongReplace | |
fPorterStem | |
fTrimStops | |
fNoNumeric | |
fIncludePhrases | |
fPhrase_NoStems | |
fPhrase_NoPrefix | |
fDefaults |
Definition at line 121 of file text_util.hpp.
|
static |
Definition at line 223 of file text_util.cpp.
References CScoreVector< Key, Score >::Add(), flags, fNoNumeric, ITERATE, s_IsNumeric(), and wf().
|
static |
add a set of frequencies into another set
Definition at line 211 of file text_util.cpp.
References CScoreVector< Key, Score >::Add(), flags, fNoNumeric, ITERATE, s_IsNumeric(), and wf().
|
static |
perform a set of punctuation clean-ups on a string, suitable for a journal or book title
Definition at line 683 of file text_util.cpp.
References NStr::ToLower().
|
static |
Definition at line 729 of file text_util.cpp.
|
static |
Definition at line 722 of file text_util.cpp.
|
static |
decode from a serializable blob of data
Definition at line 715 of file text_util.cpp.
|
static |
Definition at line 736 of file text_util.cpp.
|
static |
Definition at line 707 of file text_util.cpp.
Definition at line 700 of file text_util.cpp.
encode word frequencies in a serializable blob of data
Definition at line 694 of file text_util.cpp.
|
static |
retrieve stem frequencies from a set of word frequencies
Definition at line 508 of file text_util.cpp.
References CScoreVector< Key, Score >::end(), CScoreVector< Key, Score >::find(), flags, fTrimStops, CScoreVector< Key, Score >::insert(), ITERATE, CDictionaryUtil::Stem(), and TrimStopWords().
|
static |
retrieve word frequencies from a file
Definition at line 536 of file text_util.cpp.
References flags, GetWordFrequencies(), and NcbiGetlineEOL().
|
static |
retrieve word frequencies for a given piece of text
Definition at line 336 of file text_util.cpp.
References _TRACE, CScoreVector< Key, Score >::Add(), fDiphthongReplace, fIncludePhrases, first(), flags, fNoNumeric, fPhrase_NoPrefix, fPhrase_NoStems, fTrimStops, i, IsStopWord(), ITERATE, min(), NON_CONST_ITERATE, NStr::Replace(), s_NextClauseStop(), s_NextTokenStart(), s_NextTokenStop(), s_NumericToFreq(), s_ToLower(), CDictionaryUtil::Stem(), and text().
Referenced by GetWordFrequencies().
|
static |
Definition at line 324 of file text_util.cpp.
References i, and s_NumericToFreq().
|
static |
Definition at line 330 of file text_util.cpp.
References i, and s_NumericToFreq().
convert an integer into a set of word frequencies; this maps ints to smaller strings to compress the dictionary
Definition at line 300 of file text_util.cpp.
References i, and s_NumericToFreq().
Definition at line 306 of file text_util.cpp.
References i, and s_NumericToFreq().
Definition at line 312 of file text_util.cpp.
References i, and s_NumericToFreq().
Definition at line 318 of file text_util.cpp.
References i, and s_NumericToFreq().
return true if the provided word is a stop word
Definition at line 652 of file text_util.cpp.
References str().
Referenced by GetWordFrequencies().
|
static |
split a set of word frequencies into phrase and non-phrase frequencies; this is done to treat the two separately
Definition at line 197 of file text_util.cpp.
References CScoreVector< Key, Score >::end(), CScoreVector< Key, Score >::insert(), and ITERATE.
|
static |
eliminate the stop words from a set of word frequencies
Definition at line 659 of file text_util.cpp.
References CScoreVector< Key, Score >::begin(), CScoreVector< Key, Score >::end(), and CScoreVector< Key, Score >::erase().
Referenced by GetStemFrequencies().