56 #define THIS_FILE "utilfun.cpp"
71 "EST PROTO((expressed sequence tag)",
72 "expressed sequence tag",
73 "EST (expressed sequence tag)",
74 "EST (expressed sequence tags)",
75 "EST(expressed sequence tag)",
76 "transcribed sequence fragment",
82 "GSS (genome survey sequence)",
89 "STS(sequence tagged site)",
90 "STS (sequence tagged site)",
92 "sequence tagged site",
113 "CAGE (Cap Analysis Gene Expression)",
119 "CAGE (Cap Analysis Gene Expression)",
132 "Transcriptome Shotgun Assembly",
141 "Targeted Locus Study",
150 "THIRD PARTY ANNOTATION",
162 "THIRD PARTY ANNOTATION",
173 "Metagenome Assembled Genome",
181 Int4 sign = (m < 0) ? -1 : 1;
184 for (m *= sign; m > 9; m /= 10)
192 std::reverse(res.begin(), res.end());
204 for (
const string& acc : extra_accs) {
208 size_t dash = acc.find(
'-');
209 if (dash == string::npos) {
214 string first(acc.begin(), acc.begin() + dash),
215 last(acc.begin() + dash + 1, acc.end());
216 size_t acclen =
first.size();
219 for (; (*p >=
'A' && *p <=
'Z') || *p ==
'_';)
222 size_t preflen = p -
first.c_str();
229 for (q = p; *p >=
'0' && *p <=
'9';)
233 for (p =
last.c_str() + preflen; *p ==
'0';)
235 for (q = p; *p >=
'0' && *p <=
'9';)
239 ret.push_back(
first);
244 for (num1++; num1 <= num2; num1++) {
246 string num_str =
FTAitoa(num1);
247 size_t j = acclen - preflen - num_str.size();
249 for (
size_t i = 0;
i < j;
i++)
253 ret.push_back(new_acc);
262 return (
'A' <= c && c <=
'Z') || c ==
'_';
269 if (tokens.empty()) {
273 if (tokens.size() <= skip + 1) {
278 auto it = tokens.begin();
283 for (; it != tokens.end(); ++it) {
284 const auto& token = *it;
301 if (first_it ==
first.end()) {
309 if (last_it ==
last.end()) {
314 auto prefixLength = distance(
first.begin(), first_it);
315 if (prefixLength != distance(
last.begin(), last_it) ||
329 it = tokens.insert(it,
"-");
330 it = tokens.insert(it,
last);
342 return (
'A' <= c && c <=
'Z');
346 return (
'0' <= c && c <=
'9');
351 auto& tokens = tsbp->
list;
354 if ((
int)skip >= tsbp->
num)
357 auto tbp = tokens.begin();
361 bool bad =
false, msg_issued =
false;
362 for (; tbp != tokens.end(); ++tbp) {
363 const string& token = *tbp;
364 string_view tok_view = token;
367 size_t dash = token.find(
'-');
368 if (dash == string::npos)
370 if (dash == 0 || tok_view.size() != (dash + 1 + dash)) {
375 string_view
first(tok_view.substr(0, dash));
376 string_view
last(tok_view.substr(dash + 1));
388 if (last_it ==
last.end() || !
IsDigit(*last_it)) {
393 size_t preflen = first_it -
first.begin();
394 size_t preflen2 = last_it -
last.begin();
395 string_view first_prefix =
first.substr(0, preflen);
396 string_view last_prefix =
last.substr(0, preflen2);
397 if (first_prefix != last_prefix) {
404 string_view first_digits =
first.substr(preflen);
405 string_view last_digits =
last.substr(preflen);
406 if (! all_of(first_digits.begin(), first_digits.end(),
IsDigit) ||
407 ! all_of(last_digits.begin(), last_digits.end(),
IsDigit)) {
424 tbp = tokens.insert_after(tbp,
"-");
425 tbp = tokens.insert_after(tbp,
tmp);
453 auto tail = token->
list.before_begin();
460 for (num = 0; *ptr !=
'\0' && *ptr !=
'\r' && *ptr !=
'\n';) {
461 for (bptr = ptr; *ptr !=
delimiter && *ptr !=
'\r' && *ptr !=
'\n' &&
462 *ptr !=
'\t' && *ptr !=
' ' && *ptr !=
'\0';)
465 tail = token->
list.insert_after(tail,
string(bptr, ptr));
468 while (*ptr ==
delimiter || *ptr ==
'\t' || *ptr ==
' ')
474 return unique_ptr<TokenStatBlk>(token);
596 if (! where || *where ==
'\0' || ! what || *what ==
'\0')
600 for (; *where !=
'\0'; where++) {
601 for (q = what, p = where; *q !=
'\0' && *p !=
'\0'; q++, p++) {
605 if (*q >=
'A' && *q <=
'Z') {
608 }
else if (*q >=
'a' && *q <=
'z') {
614 if (*p ==
'\0' || *q ==
'\0')
618 return const_cast<char*
>(where);
646 vector<string> lines;
649 for (
auto line : lines) {
653 replaced += line.substr(
indent);
654 auto last = line.size() - 1;
655 if (line[
last] !=
'-') {
657 }
else if (line[
last - 1] ==
' ') {
670 for (
size_t ret =
len; ret > 0;) {
672 if (c !=
' ' && c !=
'\n' && c !=
'\\' && c !=
',' &&
673 c !=
';' && c !=
'~' && c !=
'.' && c !=
':') {
699 if (!
str || *
str ==
'\0')
733 bptr = retptr = *ptr;
734 if (! retptr || *retptr ==
'\0')
737 while (*retptr !=
'\0' && *retptr !=
' ')
742 while (*retptr !=
'\0' && *retptr ==
' ')
761 while (bptr < eptr && *bptr !=
letter)
779 char*
SrchTheStr(
char* bptr,
char* eptr,
const char* leadstr)
806 new_text_id->SetVersion(text_id->
GetVersion());
808 SetTextId(
id.Which(), *new_id, *new_text_id);
813 ibp->
ids.push_back(new_id);
817 ibp->
ids.push_back(std::move(pId));
832 if (! s || *s ==
'\0')
842 static const vector<string>
months{
843 "JAN",
"FEB",
"MAR",
"APR",
"MAY",
"JUN",
"JUL",
"AUG",
"SEP",
"OCT",
"NOV",
"DEC"
846 auto it = find(
months.begin(),
months.end(), maybe_month);
857 int parse_month =
int(it -
months.begin()) + 1;
861 int parse_year = atoi(s);
863 if (1900 <= parse_year && parse_year <= cur_year) {
865 }
else if (0 <= parse_year && parse_year <= 99 &&
'0' <= s[1] && s[1] <=
'9') {
867 (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
900 SIZE_TYPE keywordCount = keywordList.size();
902 for (
unsigned i = 0;
i < keywordCount; ++
i) {
917 for (p = ptr; *p >=
'0' && *p <=
'9';)
923 auto keywordCount = keywordList.size();
924 for (
unsigned i = 0;
i < keywordCount;
i++) {
925 auto keyword = keywordList[
i];
926 if (
StringEquN(ptr, keyword.c_str(), keyword.size()))
1025 bool kwd_tpa =
false;
1026 bool kwd_party =
false;
1027 bool kwd_inf =
false;
1028 bool kwd_exp =
false;
1029 bool kwd_asm =
false;
1030 bool kwd_spedb =
false;
1041 for (
const string&
key : kwds) {
1045 const char* p =
key.c_str();
1049 else if (
i == 1 ||
i == 2)
1055 else if (
i == 5 ||
i == 6)
1063 }
else if (p[3] !=
'\0' && p[4] !=
'\0') {
1067 if (
i > 2 &&
i < 8 && j < 4) {
1074 if (kwd_tpa && ! kwd_party) {
1077 }
else if (! kwd_tpa && kwd_party) {
1081 if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1084 }
else if (kwd_tpa && kwd_inf ==
false && kwd_exp ==
false &&
1085 kwd_asm ==
false && kwd_spedb ==
false) {
1090 for (
i = 0;
i < j;
i++) {
1105 bool kwd_tsa =
false;
1106 bool kwd_assembly =
false;
1113 for (
const string&
key : kwds) {
1120 kwd_assembly =
true;
1123 kwd_assembly =
true;
1126 if (kwd_tsa && ! kwd_assembly) {
1129 }
else if (! kwd_tsa && kwd_assembly) {
1139 bool kwd_tls =
false;
1140 bool kwd_study =
false;
1147 for (
const string&
key : kwds) {
1160 if (kwd_tls && ! kwd_study) {
1163 }
else if (! kwd_tls && kwd_study) {
1196 void fta_keywords_check(
const char*
str,
bool* estk,
bool* stsk,
bool* gssk,
bool* htck,
bool* flik,
bool* wgsk,
bool* tpak,
bool* envk,
bool* mgak,
bool* tsak,
bool* tlsk)
1255 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1269 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1283 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1298 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1313 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1327 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1337 const list<string> keywordList,
1346 if (keywordList.empty()) {
1349 for (
auto keyword : keywordList) {
1351 keyword.c_str(), &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1378 if (! kwds || ! kwds->
data ||
len < 1)
1383 for (; kwds; kwds = kwds->
next) {
1386 for (p = line; *p !=
'\0'; p++)
1387 if (*p ==
'\n' || *p ==
'\t')
1389 for (p = line; *p ==
' ' || *p ==
'.' || *p ==
';';)
1395 for (q = p; *q !=
'\0';)
1397 for (q--; *q ==
' ' || *q ==
'.' || *q ==
';'; q--)
1399 for (q = p, p = line; *q !=
'\0';) {
1400 if (*q !=
' ' && *q !=
';') {
1405 for (q++; *q ==
' ';)
1412 while (*q ==
' ' || *q ==
';')
1418 for (p = line;; p = q + 1) {
1423 fta_keywords_check(p, &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1426 specialist_db =
true;
1432 experimental =
true;
1453 TKeywordList::const_iterator key_it = kwds.end();
1457 for (TKeywordList::const_iterator
key = kwds.begin();
key != kwds.end(); ++
key) {
1476 for (is_sage =
false, is_cage =
false; key_it != kwds.end(); ++key_it) {
1477 const char* p = key_it->c_str();
1502 for (q = dst, p = src; *p !=
'\0';)
1566 for (
const string&
key : keywords) {
1577 for (
const string&
key : keywords) {
1578 if (
key ==
"HTG" ||
key ==
"HTGS_PHASE0" ||
1579 key ==
"HTGS_PHASE1" ||
key ==
"HTGS_PHASE2" ||
1580 key ==
"HTGS_PHASE3") {
1591 for (TKeywordList::iterator
key = keywords.begin();
key != keywords.end();) {
1592 const char* p =
key->c_str();
1594 (p[10] ==
'0' || p[10] ==
'1' || p[10] ==
'2' ||
1597 key = keywords.erase(
key);
1606 for (
const string&
key : keywords) {
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
The NCBI C++ standard methods for dealing with std::string.
#define ERR_REFERENCE_IllegalDate
#define ERR_DATE_IllegalDate
std::list< std::string > TKeywordList
bool StringEquN(const char *s1, const char *s2, size_t n)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
void StringCat(char *d, const char *s)
char * StringNew(size_t sz)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint16_t Uint2
2-byte (16-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
@ fConvErr_NoThrow
Do not throw an exception on error.
int Year(void) const
Get year.
list< string > TExtra_accessions
void SetYear(TYear value)
Assign a value to Year data member.
void SetMonth(TMonth value)
Assign a value to Month data member.
void SetDay(TDay value)
Assign a value to Day data member.
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
TOther & SetOther(void)
Select the variant.
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
TTpg & SetTpg(void)
Select the variant.
TPir & SetPir(void)
Select the variant.
TTpd & SetTpd(void)
Select the variant.
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
TDdbj & SetDdbj(void)
Select the variant.
TPrf & SetPrf(void)
Select the variant.
TGenbank & SetGenbank(void)
Select the variant.
TSwissprot & SetSwissprot(void)
Select the variant.
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_Tpg
Third Party Annot/Seq Genbank.
bool IsSetTechexp(void) const
explanation if tech not enough
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
@ eTech_htc
high throughput cDNA
@ eTech_sts
Sequence Tagged Site.
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_survey
one-pass genomic sequence
@ eTech_fli_cdna
full length insert cDNA
@ eTech_est
Expressed Sequence Tag.
unsigned int
A callback function used to compare two keys in a database.
#define ERR_KEYWORD_MissingTPAKeywords
#define ERR_ACCESSION_Invalid2ndAccRange
#define ERR_ACCESSION_2ndAccPrefixMismatch
#define ERR_KEYWORD_InvalidTPATier
#define ERR_KEYWORD_UnexpectedTPA
#define ERR_KEYWORD_MissingTSAKeywords
#define ERR_KEYWORD_MissingTPATier
#define ERR_KEYWORD_ConflictingTPATiers
#define ERR_KEYWORD_MissingTLSKeywords
#define ERR_ENTRY_InvalidLineType
#define ERR_KEYWORD_MissingMGAKeywords
#define ERR_KEYWORD_ConflictingMGAKeywords
static void text(MDB_val *v)
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Defines: CTimeFormat - storage class for time format.
static const char * prefix[]
static const char delimiter[]
list< SectionPtr > mSections
static const char * ParFlat_TLS_kw_array[]
Int2 MatchArrayIString(const Char **array, const Char *text)
bool HasHtg(const TKeywordList &keywords)
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
static const char * ParFlat_STS_kw_array[]
bool HasHtc(const TKeywordList &keywords)
static const char * ParFlat_MGA_kw_array[]
char * SrchTheChar(char *bptr, char *eptr, Char letter)
static const char * ParFlat_MAG_kw_array[]
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void RemoveHtgPhase(TKeywordList &keywords)
bool fta_is_tsa_keyword(const char *str)
static bool sIsPrefixChar(char c)
bool fta_is_tls_keyword(const char *str)
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
static const char * ParFlat_TPA_kw_array_to_remove[]
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Int2 StringMatchIcase(const Char **array, string_view text)
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Int2 MatchArrayISubString(const Char **array, string_view text)
void fta_remove_tpa_keywords(TKeywordList &kwds)
Int2 MatchArraySubString(const Char **array, string_view text)
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
static const char * ParFlat_FLI_kw_array[]
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
static const char * ParFlat_ENV_kw_array[]
bool fta_is_tpa_keyword(const char *str)
void CleanTailNoneAlphaCharInString(string &str)
static const char * ParFlat_TPA_kw_array[]
const Section * xTrackNodeType(const Entry &entry, int type)
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
static const char * ParFlat_TSA_kw_array[]
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
static const char * ParFlat_MGA_more_kw_array[]
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
string xGetNodeData(const DataBlk &entry, int nodeType)
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
char * GetTheCurrentToken(char **ptr)
Int2 fta_StringMatch(const Char **array, string_view text)
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
bool IsLeadPrefixChar(char c)
static const char * ParFlat_WGS_kw_array[]
void fta_StringCpy(char *dst, const char *src)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
static const char * ParFlat_HTC_kw_array[]
void fta_remove_mag_keywords(TKeywordList &kwds)
void CleanTailNoneAlphaChar(char *str)
static size_t SeekLastAlphaChar(const Char *str, size_t len)
bool IsCancelled(const TKeywordList &keywords)
Int2 MatchArrayString(const char **array, const char *text)
static const char * ParFlat_GSS_kw_array[]
static string FTAitoa(Int4 m)
Char * StringIStr(const Char *where, const Char *what)
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
static const char * ParFlat_EST_kw_array[]
void fta_remove_env_keywords(TKeywordList &kwds)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
bool fta_tpa_keywords_check(const TKeywordList &kwds)
char * PointToNextToken(char *ptr)
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
static Uint4 letter(char c)