NCBI C++ ToolKit
Public Types | Static Public Member Functions | List of all members
CTextUtil Class Reference

Search Toolkit Book for CTextUtil

#include <algo/text/text_util.hpp>

Public Types

enum  EOptions {
  fDiphthongReplace = 0x01 , fPorterStem = 0x02 , fTrimStops = 0x04 , fNoNumeric = 0x08 ,
  fIncludePhrases = 0x10 , fPhrase_NoStems = 0x20 , fPhrase_NoPrefix = 0x40 , fDefaults
}
 
typedef int TFlags
 
typedef CScoreVector< string, float > TWordFreq
 typedef for word frequencies More...
 

Static Public Member Functions

static void GetWordFrequencies (const string &text, TWordFreq &freq, TFlags flags=fDefaults)
 retrieve word frequencies for a given piece of text More...
 
static void GetWordFrequencies (CNcbiIstream &istr, TWordFreq &freq, TFlags flags=fDefaults)
 retrieve word frequencies from a file More...
 
static void GetWordFrequencies (Int4 i, TWordFreq &freq)
 convert an integer into a set of word frequencies; this maps ints to smaller strings to compress the dictionary More...
 
static void GetWordFrequencies (Uint4 i, TWordFreq &freq)
 
static void GetWordFrequencies (Int8 i, TWordFreq &freq)
 
static void GetWordFrequencies (Uint8 i, TWordFreq &freq)
 
static void GetWordFrequencies (float i, TWordFreq &freq)
 
static void GetWordFrequencies (double i, TWordFreq &freq)
 
static void SplitWordFrequencies (const TWordFreq &wf_in, TWordFreq &wf_out, TWordFreq &phrase_out)
 split a set of word frequencies into phrase and non-phrase frequencies; this is done to treat the two separately More...
 
static void GetStemFrequencies (const TWordFreq &freq, TWordFreq &stems, TFlags flags=fDefaults)
 retrieve stem frequencies from a set of word frequencies More...
 
static void AddWordFrequencies (TWordFreq &freq, const TWordFreq &wf, TFlags flags=0)
 add a set of frequencies into another set More...
 
static void AddWordFrequencies (TWordFreq &freq, const TWordFreq &wf, const string &prefix, TFlags flags=0)
 
static bool IsStopWord (const string &str)
 return true if the provided word is a stop word More...
 
static void TrimStopWords (TWordFreq &freq)
 eliminate the stop words from a set of word frequencies More...
 
static void EncodeFreqs (const TWordFreq &freq, vector< char > &data)
 
static void EncodeFreqs (const TWordFreq &freq, vector< unsigned char > &data)
 encode word frequencies in a serializable blob of data More...
 
static void EncodeFreqs (const TWordFreq &freq, CSimpleBuffer &data)
 
static void DecodeFreqs (TWordFreq &freq, const vector< char > &data)
 
static void DecodeFreqs (TWordFreq &freq, const vector< unsigned char > &data)
 decode from a serializable blob of data More...
 
static void DecodeFreqs (TWordFreq &freq, const CSimpleBuffer &data)
 
static void DecodeFreqs (TWordFreq &freq, const void *data, size_t data_len)
 
static void CleanJournalTitle (string &title)
 perform a set of punctuation clean-ups on a string suitable for a journal or book title More...
 

Detailed Description

Definition at line 117 of file text_util.hpp.

Member Typedef Documentation

◆ TFlags

Definition at line 133 of file text_util.hpp.

◆ TWordFreq

typedef for word frequencies

Definition at line 136 of file text_util.hpp.

Member Enumeration Documentation

◆ EOptions

Enumerator
fDiphthongReplace 
fPorterStem 
fTrimStops 
fNoNumeric 
fIncludePhrases 
fPhrase_NoStems 
fPhrase_NoPrefix 
fDefaults 

Definition at line 121 of file text_util.hpp.

Member Function Documentation

◆ AddWordFrequencies() [1/2]

void CTextUtil::AddWordFrequencies ( TWordFreq &  freq,
const TWordFreq &  wf,
const string &  prefix,
TFlags  flags = 0 
)
static

◆ AddWordFrequencies() [2/2]

void CTextUtil::AddWordFrequencies ( TWordFreq &  freq,
const TWordFreq &  wf,
TFlags  flags = 0 
)
static

add a set of frequencies into another set

Definition at line 211 of file text_util.cpp.

References CScoreVector< Key, Score >::Add(), flags, fNoNumeric, ITERATE, s_IsNumeric(), and wf().

◆ CleanJournalTitle()

void CTextUtil::CleanJournalTitle ( string &  title)
static

perform a set of punctuation clean-ups on a string suitable for a journal or book title

Definition at line 683 of file text_util.cpp.

References NStr::ToLower().

◆ DecodeFreqs() [1/4]

void CTextUtil::DecodeFreqs ( CTextUtil::TWordFreq &  freq,
const CSimpleBuffer &  data 
)
static

Definition at line 729 of file text_util.cpp.

References Decode().

◆ DecodeFreqs() [2/4]

void CTextUtil::DecodeFreqs ( CTextUtil::TWordFreq &  freq,
const vector< char > &  data 
)
static

Definition at line 722 of file text_util.cpp.

References Decode().

◆ DecodeFreqs() [3/4]

void CTextUtil::DecodeFreqs ( CTextUtil::TWordFreq &  freq,
const vector< unsigned char > &  data 
)
static

decode from a serializable blob of data

Definition at line 715 of file text_util.cpp.

References Decode().

◆ DecodeFreqs() [4/4]

void CTextUtil::DecodeFreqs ( CTextUtil::TWordFreq &  freq,
const void *  data,
size_t  data_len 
)
static

Definition at line 736 of file text_util.cpp.

References Decode().

◆ EncodeFreqs() [1/3]

void CTextUtil::EncodeFreqs ( const TWordFreq &  freq,
CSimpleBuffer &  data 
)
static

Definition at line 707 of file text_util.cpp.

References Encode().

◆ EncodeFreqs() [2/3]

void CTextUtil::EncodeFreqs ( const TWordFreq &  freq,
vector< char > &  data 
)
static

Definition at line 700 of file text_util.cpp.

References Encode().

◆ EncodeFreqs() [3/3]

void CTextUtil::EncodeFreqs ( const TWordFreq &  freq,
vector< unsigned char > &  data 
)
static

encode word frequencies in a serializable blob of data

Definition at line 694 of file text_util.cpp.

References Encode().

◆ GetStemFrequencies()

void CTextUtil::GetStemFrequencies ( const TWordFreq &  freq,
TWordFreq &  stems,
TFlags  flags = fDefaults 
)
static

◆ GetWordFrequencies() [1/8]

void CTextUtil::GetWordFrequencies ( CNcbiIstream &  istr,
TWordFreq &  freq,
TFlags  flags = fDefaults 
)
static

retrieve word frequencies from a file

Definition at line 536 of file text_util.cpp.

References flags, GetWordFrequencies(), and NcbiGetlineEOL().

◆ GetWordFrequencies() [2/8]

void CTextUtil::GetWordFrequencies ( const string &  text,
TWordFreq &  freq,
TFlags  flags = fDefaults 
)
static

◆ GetWordFrequencies() [3/8]

void CTextUtil::GetWordFrequencies ( double  i,
TWordFreq &  freq 
)
static

Definition at line 324 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ GetWordFrequencies() [4/8]

void CTextUtil::GetWordFrequencies ( float  i,
TWordFreq &  freq 
)
static

Definition at line 330 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ GetWordFrequencies() [5/8]

void CTextUtil::GetWordFrequencies ( Int4  i,
TWordFreq &  freq 
)
static

convert an integer into a set of word frequencies; this maps ints to smaller strings to compress the dictionary

Definition at line 300 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ GetWordFrequencies() [6/8]

void CTextUtil::GetWordFrequencies ( Int8  i,
TWordFreq &  freq 
)
static

Definition at line 306 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ GetWordFrequencies() [7/8]

void CTextUtil::GetWordFrequencies ( Uint4  i,
TWordFreq &  freq 
)
static

Definition at line 312 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ GetWordFrequencies() [8/8]

void CTextUtil::GetWordFrequencies ( Uint8  i,
TWordFreq &  freq 
)
static

Definition at line 318 of file text_util.cpp.

References i, and s_NumericToFreq().

◆ IsStopWord()

bool CTextUtil::IsStopWord ( const string &  str)
static

return true if the provided word is a stop word

Definition at line 652 of file text_util.cpp.

References str().

Referenced by GetWordFrequencies().

◆ SplitWordFrequencies()

void CTextUtil::SplitWordFrequencies ( const TWordFreq &  wf_in,
TWordFreq &  wf_out,
TWordFreq &  phrase_out 
)
static

split a set of word frequencies into phrase and non-phrase frequencies; this is done to treat the two separately

Definition at line 197 of file text_util.cpp.

References CScoreVector< Key, Score >::end(), CScoreVector< Key, Score >::insert(), and ITERATE.

◆ TrimStopWords()

void CTextUtil::TrimStopWords ( TWordFreq &  freq)
static

eliminate the stop words from a set of word frequencies

Definition at line 659 of file text_util.cpp.

References CScoreVector< Key, Score >::begin(), CScoreVector< Key, Score >::end(), and CScoreVector< Key, Score >::erase().

Referenced by GetStemFrequencies().


The documentation for this class was generated from the following files:
Modified on Sun Mar 03 03:15:06 2024 by modify_doxy.py rev. 669887