56 #define THIS_FILE "utilfun.cpp"
71 "EST PROTO((expressed sequence tag)",
72 "expressed sequence tag",
73 "EST (expressed sequence tag)",
74 "EST (expressed sequence tags)",
75 "EST(expressed sequence tag)",
76 "transcribed sequence fragment",
82 "GSS (genome survey sequence)",
89 "STS(sequence tagged site)",
90 "STS (sequence tagged site)",
92 "sequence tagged site",
113 "CAGE (Cap Analysis Gene Expression)",
119 "CAGE (Cap Analysis Gene Expression)",
132 "Transcriptome Shotgun Assembly",
141 "Targeted Locus Study",
150 "THIRD PARTY ANNOTATION",
162 "THIRD PARTY ANNOTATION",
173 "Metagenome Assembled Genome",
181 Int4 sign = (m < 0) ? -1 : 1;
184 for (m *= sign; m > 9; m /= 10)
192 std::reverse(res.begin(), res.end());
204 for (
const string& acc : extra_accs) {
208 size_t dash = acc.find(
'-');
209 if (dash == string::npos) {
214 string first(acc.begin(), acc.begin() + dash),
215 last(acc.begin() + dash + 1, acc.end());
216 size_t acclen =
first.size();
219 for (; (*p >=
'A' && *p <=
'Z') || *p ==
'_';)
222 size_t preflen = p -
first.c_str();
229 for (q = p; *p >=
'0' && *p <=
'9';)
233 for (p =
last.c_str() + preflen; *p ==
'0';)
235 for (q = p; *p >=
'0' && *p <=
'9';)
239 ret.push_back(
first);
244 for (num1++; num1 <= num2; num1++) {
246 string num_str =
FTAitoa(num1);
247 size_t j = acclen - preflen - num_str.size();
249 for (
size_t i = 0;
i < j;
i++)
253 ret.push_back(new_acc);
262 return (
'A' <= c && c <=
'Z') || c ==
'_';
269 if (tokens.empty()) {
273 if (tokens.size() <= skip + 1) {
278 auto it = tokens.begin();
283 for (; it != tokens.end(); ++it) {
284 const auto& token = *it;
301 if (first_it ==
first.end()) {
309 if (last_it ==
last.end()) {
314 auto prefixLength = distance(
first.begin(), first_it);
315 if (prefixLength != distance(
last.begin(), last_it) ||
329 it = tokens.insert(it,
"-");
330 it = tokens.insert(it,
last);
359 else if (skip == 1) {
369 for (bad =
false; tbp; tbp = tbpnext) {
380 *
first >
'Z' || *last < 'A' || *last >
'Z') {
386 for (p =
first; (*p >=
'A' && *p <=
'Z') || *p ==
'_';)
388 if (*p < '0' || *p >
'9') {
393 for (q =
last; (*q >=
'A' && *q <=
'Z') || *q ==
'_';)
395 if (*q < '0' || *q >
'9') {
400 size_t preflen = p -
first;
409 for (q = p; *p >=
'0' && *p <=
'9';)
418 for (p =
last + preflen; *p ==
'0';)
420 for (q = p; *p >=
'0' && *p <=
'9';)
505 for (num = 0; *ptr !=
'\0' && *ptr !=
'\r' && *ptr !=
'\n';) {
506 for (bptr = ptr; *ptr !=
delimiter && *ptr !=
'\r' && *ptr !=
'\n' &&
507 *ptr !=
'\t' && *ptr !=
' ' && *ptr !=
'\0';)
519 while (*ptr ==
delimiter || *ptr ==
'\t' || *ptr ==
' ')
687 if (! where || *where ==
'\0' || ! what || *what ==
'\0')
691 for (; *where !=
'\0'; where++) {
692 for (q = what, p = where; *q !=
'\0' && *p !=
'\0'; q++, p++) {
696 if (*q >=
'A' && *q <=
'Z') {
699 }
else if (*q >=
'a' && *q <=
'z') {
705 if (*p ==
'\0' || *q ==
'\0')
709 return const_cast<char*
>(where);
742 string instr(bptr, eptr - bptr);
747 if (bptr + start_col_data >= eptr)
750 size_t size = eptr - bptr;
754 while (bptr < eptr) {
762 bptr += start_col_data;
769 if (*(ptr - 1) !=
'-' || *(ptr - 2) ==
' ') {
786 vector<string> lines;
789 for (
auto line : lines) {
793 replaced += line.substr(
indent);
794 auto last = line.size() - 1;
795 if (line[
last] !=
'-') {
797 }
else if (line[
last - 1] ==
' ') {
810 for (
size_t ret =
len; ret > 0;) {
812 if (c !=
' ' && c !=
'\n' && c !=
'\\' && c !=
',' &&
813 c !=
';' && c !=
'~' && c !=
'.' && c !=
':') {
839 if (!
str || *
str ==
'\0')
874 bptr = retptr = *ptr;
875 if (! retptr || *retptr ==
'\0')
878 while (*retptr !=
'\0' && *retptr !=
' ')
886 while (*retptr !=
'\0' && *retptr ==
' ')
905 while (bptr < eptr && *bptr !=
letter)
923 char*
SrchTheStr(
char* bptr,
char* eptr,
const char* leadstr)
950 new_text_id->SetVersion(text_id->
GetVersion());
952 SetTextId(
id.Which(), *new_id, *new_text_id);
957 ibp->
ids.push_back(new_id);
961 ibp->
ids.push_back(std::move(pId));
976 if (! s || *s ==
'\0')
986 static const vector<string>
months{
987 "JAN",
"FEB",
"MAR",
"APR",
"MAY",
"JUN",
"JUL",
"AUG",
"SEP",
"OCT",
"NOV",
"DEC"
990 auto it = find(
months.begin(),
months.end(), maybe_month);
1001 int parse_month =
int(it -
months.begin()) + 1;
1005 int parse_year = atoi(s);
1007 if (1900 <= parse_year && parse_year <= cur_year) {
1009 }
else if (0 <= parse_year && parse_year <= 99 &&
'0' <= s[1] && s[1] <=
'9') {
1011 (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
1044 SIZE_TYPE keywordCount = keywordList.size();
1046 for (
unsigned i = 0;
i < keywordCount; ++
i) {
1061 for (p = ptr; *p >=
'0' && *p <=
'9';)
1067 auto keywordCount = keywordList.size();
1068 for (
unsigned i = 0;
i < keywordCount;
i++) {
1069 auto keyword = keywordList[
i];
1070 if (
StringEquN(ptr, keyword.c_str(), keyword.size()))
1169 bool kwd_tpa =
false;
1170 bool kwd_party =
false;
1171 bool kwd_inf =
false;
1172 bool kwd_exp =
false;
1173 bool kwd_asm =
false;
1174 bool kwd_spedb =
false;
1185 for (
const string&
key : kwds) {
1189 const char* p =
key.c_str();
1193 else if (
i == 1 ||
i == 2)
1199 else if (
i == 5 ||
i == 6)
1207 }
else if (p[3] !=
'\0' && p[4] !=
'\0') {
1211 if (
i > 2 &&
i < 8 && j < 4) {
1218 if (kwd_tpa && ! kwd_party) {
1221 }
else if (! kwd_tpa && kwd_party) {
1225 if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1228 }
else if (kwd_tpa && kwd_inf ==
false && kwd_exp ==
false &&
1229 kwd_asm ==
false && kwd_spedb ==
false) {
1234 for (
i = 0;
i < j;
i++) {
1249 bool kwd_tsa =
false;
1250 bool kwd_assembly =
false;
1257 for (
const string&
key : kwds) {
1264 kwd_assembly =
true;
1267 kwd_assembly =
true;
1270 if (kwd_tsa && ! kwd_assembly) {
1273 }
else if (! kwd_tsa && kwd_assembly) {
1283 bool kwd_tls =
false;
1284 bool kwd_study =
false;
1291 for (
const string&
key : kwds) {
1304 if (kwd_tls && ! kwd_study) {
1307 }
else if (! kwd_tls && kwd_study) {
1340 void fta_keywords_check(
const char*
str,
bool* estk,
bool* stsk,
bool* gssk,
bool* htck,
bool* flik,
bool* wgsk,
bool* tpak,
bool* envk,
bool* mgak,
bool* tsak,
bool* tlsk)
1399 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1413 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1427 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1442 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1457 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1471 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1481 const list<string> keywordList,
1490 if (keywordList.empty()) {
1493 for (
auto keyword : keywordList) {
1495 keyword.c_str(), &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1522 if (! kwds || ! kwds->
data ||
len < 1)
1527 for (; kwds; kwds = kwds->
next) {
1530 for (p = line; *p !=
'\0'; p++)
1531 if (*p ==
'\n' || *p ==
'\t')
1533 for (p = line; *p ==
' ' || *p ==
'.' || *p ==
';';)
1539 for (q = p; *q !=
'\0';)
1541 for (q--; *q ==
' ' || *q ==
'.' || *q ==
';'; q--)
1543 for (q = p, p = line; *q !=
'\0';) {
1544 if (*q !=
' ' && *q !=
';') {
1549 for (q++; *q ==
' ';)
1556 while (*q ==
' ' || *q ==
';')
1562 for (p = line;; p = q + 1) {
1567 fta_keywords_check(p, &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1570 specialist_db =
true;
1576 experimental =
true;
1597 TKeywordList::const_iterator key_it = kwds.end();
1601 for (TKeywordList::const_iterator
key = kwds.begin();
key != kwds.end(); ++
key) {
1620 for (is_sage =
false, is_cage =
false; key_it != kwds.end(); ++key_it) {
1621 const char* p = key_it->c_str();
1646 for (q = dst, p = src; *p !=
'\0';)
1710 for (
const string&
key : keywords) {
1721 for (
const string&
key : keywords) {
1722 if (
key ==
"HTG" ||
key ==
"HTGS_PHASE0" ||
1723 key ==
"HTGS_PHASE1" ||
key ==
"HTGS_PHASE2" ||
1724 key ==
"HTGS_PHASE3") {
1735 for (TKeywordList::iterator
key = keywords.begin();
key != keywords.end();) {
1736 const char* p =
key->c_str();
1738 (p[10] ==
'0' || p[10] ==
'1' || p[10] ==
'2' ||
1741 key = keywords.erase(
key);
1750 for (
const string&
key : keywords) {
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
The NCBI C++ standard methods for dealing with std::string.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
#define ERR_REFERENCE_IllegalDate
#define ERR_DATE_IllegalDate
std::list< std::string > TKeywordList
char * StringSave(const char *s)
bool StringEquN(const char *s1, const char *s2, size_t n)
void StringCpy(char *d, const char *s)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
void StringCat(char *d, const char *s)
void MemCpy(void *p, const void *q, size_t sz)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
@ eTrunc_End
Truncate trailing spaces only.
int Year(void) const
Get year.
list< string > TExtra_accessions
void SetYear(TYear value)
Assign a value to Year data member.
void SetMonth(TMonth value)
Assign a value to Month data member.
void SetDay(TDay value)
Assign a value to Day data member.
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
TOther & SetOther(void)
Select the variant.
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
TTpg & SetTpg(void)
Select the variant.
TPir & SetPir(void)
Select the variant.
TTpd & SetTpd(void)
Select the variant.
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
TDdbj & SetDdbj(void)
Select the variant.
TPrf & SetPrf(void)
Select the variant.
TGenbank & SetGenbank(void)
Select the variant.
TSwissprot & SetSwissprot(void)
Select the variant.
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_Tpg
Third Party Annot/Seq Genbank.
bool IsSetTechexp(void) const
explanation if tech not enough
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
@ eTech_htc
high throughput cDNA
@ eTech_sts
Sequence Tagged Site.
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_survey
one-pass genomic sequence
@ eTech_fli_cdna
full length insert cDNA
@ eTech_est
Expressed Sequence Tag.
unsigned int
A callback function used to compare two keys in a database.
#define ERR_KEYWORD_MissingTPAKeywords
#define ERR_ACCESSION_Invalid2ndAccRange
#define ERR_ACCESSION_2ndAccPrefixMismatch
#define ERR_KEYWORD_InvalidTPATier
#define ERR_KEYWORD_UnexpectedTPA
#define ERR_KEYWORD_MissingTSAKeywords
#define ERR_KEYWORD_MissingTPATier
#define ERR_KEYWORD_ConflictingTPATiers
#define ERR_KEYWORD_MissingTLSKeywords
#define ERR_ENTRY_InvalidLineType
#define ERR_KEYWORD_MissingMGAKeywords
#define ERR_KEYWORD_ConflictingMGAKeywords
static void text(MDB_val *v)
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Defines: CTimeFormat - storage class for time format.
static const char * prefix[]
static const char delimiter[]
static const char * str(char *buf, int n)
list< SectionPtr > mSections
Int2 MatchArraySubString(const Char **array, const Char *text)
static const char * ParFlat_TLS_kw_array[]
Int2 MatchArrayIString(const Char **array, const Char *text)
bool HasHtg(const TKeywordList &keywords)
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
static const char * ParFlat_STS_kw_array[]
bool HasHtc(const TKeywordList &keywords)
static const char * ParFlat_MGA_kw_array[]
char * GetBlkDataReplaceNewLine(char *bptr, char *eptr, Int2 start_col_data)
char * SrchTheChar(char *bptr, char *eptr, Char letter)
static const char * ParFlat_MAG_kw_array[]
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void RemoveHtgPhase(TKeywordList &keywords)
bool fta_is_tsa_keyword(const char *str)
static bool sIsPrefixChar(char c)
bool fta_is_tls_keyword(const char *str)
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
static const char * ParFlat_TPA_kw_array_to_remove[]
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
void fta_remove_tpa_keywords(TKeywordList &kwds)
static TokenBlkPtr TokenNodeNew(TokenBlkPtr tbp)
Int2 fta_StringMatch(const Char **array, const Char *text)
static const char * ParFlat_FLI_kw_array[]
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
static const char * ParFlat_ENV_kw_array[]
bool fta_is_tpa_keyword(const char *str)
void CleanTailNoneAlphaCharInString(string &str)
static const char * ParFlat_TPA_kw_array[]
const Section * xTrackNodeType(const Entry &entry, int type)
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
void xGetBlkDataReplaceNewLine(string &instr, int indent)
static const char * ParFlat_TSA_kw_array[]
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
static const char * ParFlat_MGA_more_kw_array[]
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
string xGetNodeData(const DataBlk &entry, int nodeType)
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
char * GetTheCurrentToken(char **ptr)
static void InsertTokenVal(TokenBlkPtr *tbp, const char *str)
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Int2 StringMatchIcase(const Char **array, const Char *text)
static const char * ParFlat_WGS_kw_array[]
void fta_StringCpy(char *dst, const char *src)
TokenStatBlkPtr TokenString(char *str, Char delimiter)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
void FreeTokenblk(TokenBlkPtr tbp)
static const char * ParFlat_HTC_kw_array[]
void fta_remove_mag_keywords(TKeywordList &kwds)
void CleanTailNoneAlphaChar(char *str)
static size_t SeekLastAlphaChar(const Char *str, size_t len)
bool IsCancelled(const TKeywordList &keywords)
Int2 MatchArrayString(const char **array, const char *text)
static const char * ParFlat_GSS_kw_array[]
static string FTAitoa(Int4 m)
Char * StringIStr(const Char *where, const Char *what)
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
static const char * ParFlat_EST_kw_array[]
void fta_remove_env_keywords(TKeywordList &kwds)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
bool fta_tpa_keywords_check(const TKeywordList &kwds)
void FreeTokenstatblk(TokenStatBlkPtr tsbp)
char * PointToNextToken(char *ptr)
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Int2 MatchArrayISubString(const Char **array, const Char *text)
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
static Uint4 letter(char c)