56 #define THIS_FILE "utilfun.cpp"
71 "EST PROTO((expressed sequence tag)",
72 "expressed sequence tag",
73 "EST (expressed sequence tag)",
74 "EST (expressed sequence tags)",
75 "EST(expressed sequence tag)",
76 "transcribed sequence fragment",
82 "GSS (genome survey sequence)",
89 "STS(sequence tagged site)",
90 "STS (sequence tagged site)",
92 "sequence tagged site",
113 "CAGE (Cap Analysis Gene Expression)",
119 "CAGE (Cap Analysis Gene Expression)",
132 "Transcriptome Shotgun Assembly",
141 "Targeted Locus Study",
150 "THIRD PARTY ANNOTATION",
162 "THIRD PARTY ANNOTATION",
173 "Metagenome Assembled Genome",
181 Int4 sign = (m < 0) ? -1 : 1;
184 for (m *= sign; m > 9; m /= 10)
192 std::reverse(res.begin(), res.end());
204 for (
const string& acc : extra_accs) {
208 size_t dash = acc.find(
'-');
209 if (dash == string::npos) {
214 string first(acc.begin(), acc.begin() + dash),
215 last(acc.begin() + dash + 1, acc.end());
216 size_t acclen =
first.size();
219 for (; (*p >=
'A' && *p <=
'Z') || *p ==
'_';)
222 size_t preflen = p -
first.c_str();
224 string prefix =
first.substr(0, preflen);
229 for (q = p; *p >=
'0' && *p <=
'9';)
233 for (p =
last.c_str() + preflen; *p ==
'0';)
235 for (q = p; *p >=
'0' && *p <=
'9';)
239 ret.push_back(
first);
244 for (num1++; num1 <= num2; num1++) {
245 string new_acc = prefix;
246 string num_str =
FTAitoa(num1);
247 size_t j = acclen - preflen - num_str.size();
249 for (
size_t i = 0;
i < j;
i++)
253 ret.push_back(new_acc);
262 return (
'A' <= c && c <=
'Z') || c ==
'_';
269 if (tokens.empty()) {
273 if (tokens.size() <= skip + 1) {
278 auto it = tokens.begin();
283 for (; it != tokens.end(); ++it) {
284 const auto& token = *it;
301 if (first_it ==
first.end()) {
309 if (last_it ==
last.end()) {
314 auto prefixLength = distance(
first.begin(), first_it);
315 if (prefixLength != distance(
last.begin(), last_it) ||
329 it = tokens.insert(it,
"-");
330 it = tokens.insert(it,
last);
342 return (
'A' <= c && c <=
'Z');
346 return (
'0' <= c && c <=
'9');
351 auto& tokens = tsbp->
list;
354 if ((
int)skip >= tsbp->
num)
357 auto tbp = tokens.begin();
361 bool bad =
false, msg_issued =
false;
362 for (; tbp != tokens.end(); ++tbp) {
363 const string& token = *tbp;
364 string_view tok_view = token;
367 size_t dash = token.find(
'-');
368 if (dash == string::npos)
370 if (dash == 0 || tok_view.size() != (dash + 1 + dash)) {
375 string_view
first(tok_view.substr(0, dash));
376 string_view
last(tok_view.substr(dash + 1));
388 if (last_it ==
last.end() || !
IsDigit(*last_it)) {
393 size_t preflen = first_it -
first.begin();
394 size_t preflen2 = last_it -
last.begin();
395 string_view first_prefix =
first.substr(0, preflen);
396 string_view last_prefix =
last.substr(0, preflen2);
397 if (first_prefix != last_prefix) {
404 string_view first_digits =
first.substr(preflen);
405 string_view last_digits =
last.substr(preflen);
406 if (! all_of(first_digits.begin(), first_digits.end(),
IsDigit) ||
407 ! all_of(last_digits.begin(), last_digits.end(),
IsDigit)) {
424 tbp = tokens.insert_after(tbp,
"-");
425 tbp = tokens.insert_after(tbp,
tmp);
453 auto tail = token->
list.before_begin();
460 for (num = 0; *ptr !=
'\0' && *ptr !=
'\r' && *ptr !=
'\n';) {
461 for (bptr = ptr; *ptr !=
delimiter && *ptr !=
'\r' && *ptr !=
'\n' &&
462 *ptr !=
'\t' && *ptr !=
' ' && *ptr !=
'\0';)
465 tail = token->
list.insert_after(tail,
string(bptr, ptr));
468 while (*ptr ==
delimiter || *ptr ==
'\t' || *ptr ==
' ')
474 return unique_ptr<TokenStatBlk>(token);
596 if (! where || *where ==
'\0' || ! what || *what ==
'\0')
600 for (; *where !=
'\0'; where++) {
601 for (q = what, p = where; *q !=
'\0' && *p !=
'\0'; q++, p++) {
605 if (*q >=
'A' && *q <=
'Z') {
608 }
else if (*q >=
'a' && *q <=
'z') {
614 if (*p ==
'\0' || *q ==
'\0')
618 return const_cast<char*
>(where);
646 vector<string> lines;
649 for (
auto line : lines) {
653 replaced += line.substr(
indent);
654 auto last = line.size() - 1;
655 if (line[
last] !=
'-') {
657 }
else if (line[
last - 1] ==
' ') {
670 for (
size_t ret =
len; ret > 0;) {
672 if (c !=
' ' && c !=
'\n' && c !=
'\\' && c !=
',' &&
673 c !=
';' && c !=
'~' && c !=
'.' && c !=
':') {
699 if (!
str || *
str ==
'\0')
733 bptr = retptr = *ptr;
734 if (! retptr || *retptr ==
'\0')
737 while (*retptr !=
'\0' && *retptr !=
' ')
742 while (*retptr !=
'\0' && *retptr ==
' ')
761 string_view sv(bptr, eptr - bptr);
764 if (
i != string_view::npos)
779 char*
SrchTheStr(
char* bptr,
char* eptr,
const char* leadstr)
781 string_view sv(bptr, eptr - bptr);
783 auto i = sv.find(leadstr);
784 if (
i != string_view::npos)
805 new_text_id->SetVersion(text_id->
GetVersion());
807 SetTextId(
id.Which(), *new_id, *new_text_id);
812 ibp->
ids.push_back(new_id);
816 ibp->
ids.push_back(std::move(pId));
831 if (! s || *s ==
'\0')
841 static const vector<string>
months{
842 "JAN",
"FEB",
"MAR",
"APR",
"MAY",
"JUN",
"JUL",
"AUG",
"SEP",
"OCT",
"NOV",
"DEC"
845 auto it = find(
months.begin(),
months.end(), maybe_month);
856 int parse_month =
int(it -
months.begin()) + 1;
860 int parse_year = atoi(s);
862 if (1900 <= parse_year && parse_year <= cur_year) {
864 }
else if (0 <= parse_year && parse_year <= 99 &&
'0' <= s[1] && s[1] <=
'9') {
866 (parse_year < 70) ? (parse_year += 2000) : (parse_year += 1900);
899 SIZE_TYPE keywordCount = keywordList.size();
901 for (
unsigned i = 0;
i < keywordCount; ++
i) {
916 for (p = ptr; *p >=
'0' && *p <=
'9';)
922 auto keywordCount = keywordList.size();
923 for (
unsigned i = 0;
i < keywordCount;
i++) {
924 auto keyword = keywordList[
i];
925 if (
StringEquN(ptr, keyword.c_str(), keyword.size()))
1024 bool kwd_tpa =
false;
1025 bool kwd_party =
false;
1026 bool kwd_inf =
false;
1027 bool kwd_exp =
false;
1028 bool kwd_asm =
false;
1029 bool kwd_spedb =
false;
1040 for (
const string&
key : kwds) {
1044 const char* p =
key.c_str();
1048 else if (
i == 1 ||
i == 2)
1054 else if (
i == 5 ||
i == 6)
1062 }
else if (p[3] !=
'\0' && p[4] !=
'\0') {
1066 if (
i > 2 &&
i < 8 && j < 4) {
1073 if (kwd_tpa && ! kwd_party) {
1076 }
else if (! kwd_tpa && kwd_party) {
1080 if (! kwd_tpa && (kwd_inf || kwd_exp)) {
1083 }
else if (kwd_tpa && kwd_inf ==
false && kwd_exp ==
false &&
1084 kwd_asm ==
false && kwd_spedb ==
false) {
1089 for (
i = 0;
i < j;
i++) {
1104 bool kwd_tsa =
false;
1105 bool kwd_assembly =
false;
1112 for (
const string&
key : kwds) {
1119 kwd_assembly =
true;
1122 kwd_assembly =
true;
1125 if (kwd_tsa && ! kwd_assembly) {
1128 }
else if (! kwd_tsa && kwd_assembly) {
1138 bool kwd_tls =
false;
1139 bool kwd_study =
false;
1146 for (
const string&
key : kwds) {
1159 if (kwd_tls && ! kwd_study) {
1162 }
else if (! kwd_tls && kwd_study) {
1195 void fta_keywords_check(
const char*
str,
bool* estk,
bool* stsk,
bool* gssk,
bool* htck,
bool* flik,
bool* wgsk,
bool* tpak,
bool* envk,
bool* mgak,
bool* tsak,
bool* tlsk)
1254 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1268 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1282 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1297 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1312 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1326 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
1336 const list<string> keywordList,
1345 if (keywordList.empty()) {
1348 for (
auto keyword : keywordList) {
1350 keyword.c_str(), &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1377 if (! kwds || ! kwds->
data ||
len < 1)
1382 for (; kwds; kwds = kwds->
next) {
1385 for (p = line; *p !=
'\0'; p++)
1386 if (*p ==
'\n' || *p ==
'\t')
1388 for (p = line; *p ==
' ' || *p ==
'.' || *p ==
';';)
1394 for (q = p; *q !=
'\0';)
1396 for (q--; *q ==
' ' || *q ==
'.' || *q ==
';'; q--)
1398 for (q = p, p = line; *q !=
'\0';) {
1399 if (*q !=
' ' && *q !=
';') {
1404 for (q++; *q ==
' ';)
1411 while (*q ==
' ' || *q ==
';')
1417 for (p = line;; p = q + 1) {
1422 fta_keywords_check(p, &entry->
EST, &entry->
STS, &entry->
GSS, &entry->
HTC,
nullptr,
nullptr, (tpa_check ? &entry->
is_tpa :
nullptr),
nullptr,
nullptr,
nullptr,
nullptr);
1425 specialist_db =
true;
1431 experimental =
true;
1452 TKeywordList::const_iterator key_it = kwds.end();
1456 for (TKeywordList::const_iterator
key = kwds.begin();
key != kwds.end(); ++
key) {
1475 for (is_sage =
false, is_cage =
false; key_it != kwds.end(); ++key_it) {
1476 const char* p = key_it->c_str();
1501 for (q = dst, p = src; *p !=
'\0';)
1565 for (
const string&
key : keywords) {
1576 for (
const string&
key : keywords) {
1577 if (
key ==
"HTG" ||
key ==
"HTGS_PHASE0" ||
1578 key ==
"HTGS_PHASE1" ||
key ==
"HTGS_PHASE2" ||
1579 key ==
"HTGS_PHASE3") {
1590 for (TKeywordList::iterator
key = keywords.begin();
key != keywords.end();) {
1591 const char* p =
key->c_str();
1593 (p[10] ==
'0' || p[10] ==
'1' || p[10] ==
'2' ||
1596 key = keywords.erase(
key);
1605 for (
const string&
key : keywords) {
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
The NCBI C++ standard methods for dealing with std::string.
#define ERR_REFERENCE_IllegalDate
#define ERR_DATE_IllegalDate
std::list< std::string > TKeywordList
bool StringEquN(const char *s1, const char *s2, size_t n)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
void StringCat(char *d, const char *s)
char * StringNew(size_t sz)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
void Reset(void)
Reset reference object.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint16_t Uint2
2-byte (16-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
@ fConvErr_NoThrow
Do not throw an exception on error.
int Year(void) const
Get year.
list< string > TExtra_accessions
void SetYear(TYear value)
Assign a value to Year data member.
void SetMonth(TMonth value)
Assign a value to Month data member.
void SetDay(TDay value)
Assign a value to Day data member.
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TEmbl & SetEmbl(void)
Select the variant.
TOther & SetOther(void)
Select the variant.
const TName & GetName(void) const
Get the Name member data.
TTpe & SetTpe(void)
Select the variant.
TTpg & SetTpg(void)
Select the variant.
TPir & SetPir(void)
Select the variant.
TTpd & SetTpd(void)
Select the variant.
TVersion GetVersion(void) const
Get the Version member data.
TGpipe & SetGpipe(void)
Select the variant.
TDdbj & SetDdbj(void)
Select the variant.
TPrf & SetPrf(void)
Select the variant.
TGenbank & SetGenbank(void)
Select the variant.
TSwissprot & SetSwissprot(void)
Select the variant.
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
void SetChain_id(const TChain_id &value)
Assign a value to Chain_id data member.
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TPdb & SetPdb(void)
Select the variant.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_Tpg
Third Party Annot/Seq Genbank.
bool IsSetTechexp(void) const
explanation if tech not enough
const TTechexp & GetTechexp(void) const
Get the Techexp member data.
void SetTechexp(const TTechexp &value)
Assign a value to Techexp data member.
@ eTech_htc
high throughput cDNA
@ eTech_sts
Sequence Tagged Site.
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_survey
one-pass genomic sequence
@ eTech_fli_cdna
full length insert cDNA
@ eTech_est
Expressed Sequence Tag.
unsigned int
A callback function used to compare two keys in a database.
#define ERR_KEYWORD_MissingTPAKeywords
#define ERR_ACCESSION_Invalid2ndAccRange
#define ERR_ACCESSION_2ndAccPrefixMismatch
#define ERR_KEYWORD_InvalidTPATier
#define ERR_KEYWORD_UnexpectedTPA
#define ERR_KEYWORD_MissingTSAKeywords
#define ERR_KEYWORD_MissingTPATier
#define ERR_KEYWORD_ConflictingTPATiers
#define ERR_KEYWORD_MissingTLSKeywords
#define ERR_ENTRY_InvalidLineType
#define ERR_KEYWORD_MissingMGAKeywords
#define ERR_KEYWORD_ConflictingMGAKeywords
static void text(MDB_val *v)
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Defines: CTimeFormat - storage class for time format.
static const char delimiter[]
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
list< SectionPtr > mSections
static const char * ParFlat_TLS_kw_array[]
Int2 MatchArrayIString(const Char **array, const Char *text)
bool HasHtg(const TKeywordList &keywords)
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
static const char * ParFlat_STS_kw_array[]
bool HasHtc(const TKeywordList &keywords)
static const char * ParFlat_MGA_kw_array[]
char * SrchTheChar(char *bptr, char *eptr, Char letter)
static const char * ParFlat_MAG_kw_array[]
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void RemoveHtgPhase(TKeywordList &keywords)
bool fta_is_tsa_keyword(const char *str)
static bool sIsPrefixChar(char c)
bool fta_is_tls_keyword(const char *str)
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
static const char * ParFlat_TPA_kw_array_to_remove[]
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Int2 StringMatchIcase(const Char **array, string_view text)
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Int2 MatchArrayISubString(const Char **array, string_view text)
void fta_remove_tpa_keywords(TKeywordList &kwds)
Int2 MatchArraySubString(const Char **array, string_view text)
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
static const char * ParFlat_FLI_kw_array[]
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
static const char * ParFlat_ENV_kw_array[]
bool fta_is_tpa_keyword(const char *str)
void CleanTailNoneAlphaCharInString(string &str)
static const char * ParFlat_TPA_kw_array[]
const Section * xTrackNodeType(const Entry &entry, int type)
char * SrchNodeType(DataBlkPtr entry, Int4 type, size_t *len)
static const char * ParFlat_TSA_kw_array[]
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
static const char * ParFlat_MGA_more_kw_array[]
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
string xGetNodeData(const DataBlk &entry, int nodeType)
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
char * GetTheCurrentToken(char **ptr)
Int2 fta_StringMatch(const Char **array, string_view text)
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
bool IsLeadPrefixChar(char c)
static const char * ParFlat_WGS_kw_array[]
void fta_StringCpy(char *dst, const char *src)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
static const char * ParFlat_HTC_kw_array[]
void fta_remove_mag_keywords(TKeywordList &kwds)
void CleanTailNoneAlphaChar(char *str)
static size_t SeekLastAlphaChar(const Char *str, size_t len)
bool IsCancelled(const TKeywordList &keywords)
Int2 MatchArrayString(const char **array, const char *text)
static const char * ParFlat_GSS_kw_array[]
static string FTAitoa(Int4 m)
Char * StringIStr(const Char *where, const Char *what)
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void CpSeqId(InfoBioseqPtr ibp, const CSeq_id &id)
static const char * ParFlat_EST_kw_array[]
void fta_remove_env_keywords(TKeywordList &kwds)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
bool fta_tpa_keywords_check(const TKeywordList &kwds)
char * PointToNextToken(char *ptr)
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
void xCheckEstStsGssTpaKeywords(const list< string > keywordList, bool tpa_check, IndexblkPtr entry)
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
static Uint4 letter(char c)