94 #define THIS_FILE "em_ascii.cpp"
106 "ANN",
"CON",
"PAT",
"EST",
"GSS",
"HTC",
"HTG",
"STS",
"TSA",
113 "FUN",
"INV",
"MAM",
"ORG",
"PHG",
"PLN",
"PRI",
"PRO",
"ROD",
114 "SYN",
"UNA",
"VRL",
"VRT",
"PAT",
"EST",
"STS",
"UNC",
"GSS",
115 "HUM",
"HTG",
"HTC",
"CON",
"ENV",
"MUS",
"TGN",
"TSA",
123 "PLN",
"INV",
"MAM",
"UNA",
"PHG",
"PLN",
"PRI",
"BCT",
"ROD",
124 "SYN",
"UNA",
"VRL",
"VRT",
"PAT",
"EST",
"STS",
"UNA",
"GSS",
125 "PRI",
"HTG",
"HTC",
"CON",
"ENV",
"ROD",
"SYN",
"TSA",
192 "GUIDETOPHARMACOLOGY",
240 "UNIPROT/SWISS-PROT",
242 "UNIPROTKB/SWISS-PROT",
298 if (update.
Empty()) {
311 if (seq_entries.empty()) {
328 if (pp->
qamode && ! seq_entries.empty())
376 for (
const auto&
id : ids) {
377 if (id->IsStr() && id->GetStr() ==
str) {
389 ids.push_back(obj_id);
416 bool valid_biosample;
440 for (eptr = bptr +
len; bptr < eptr; bptr = ptr) {
448 name.assign(bptr, ptr);
470 name =
"UniProtKB/Swiss-Prot";
472 name =
"UniProtKB/TrEMBL";
482 if (ptr && ptr < p) {
483 id.assign(bptr, ptr);
499 if (name ==
"BioSample" && !
id.
empty()) {
500 many_biosample = (!
id.empty() && ! id1.empty());
503 valid_biosample =
false;
504 if (many_biosample || ! valid_biosample) {
509 q =
StringChr(
const_cast<char*
>(drline),
'\n');
515 if (! valid_biosample)
522 for (
const string&
val : dr_biosample) {
532 dr_biosample.push_back(
id);
536 if (!
id.
empty() && ! id1.empty()) {
541 q =
StringChr(
const_cast<char*
>(drline),
'\n');
551 for (
const string&
val : dr_ena) {
561 dr_ena.push_back(
id);
570 new_xref->SetDbname().SetName(name);
578 new_xrefs.push_back(new_xref);
601 if (! new_xrefs.empty())
602 embl.
SetXref().swap(new_xrefs);
609 switch (
id.Which()) {
611 return id.SetGenbank();
617 return id.SetSwissprot();
619 return id.SetOther();
631 return id.SetGpipe();
633 return id.SetNamed_annot_track();
682 const char* bptr = dbp->
mOffset;
683 const char* eptr = bptr + dbp->
len;
686 vector<string> taxLines;
688 for (
auto line : taxLines) {
693 if (! sTaxname.empty()) {
700 if (sTaxname.empty()) {
707 auto openP = sTaxname.find(
'(');
708 if (openP != string::npos) {
709 auto sCommonName = sTaxname.substr(0, openP);
710 auto commonTerm = sCommonName.find_last_not_of(
" \t(");
711 if (commonTerm != string::npos) {
712 sCommonName = sCommonName.substr(0, commonTerm + 1);
725 if (condiv && ibp->
segnum != 0) {
765 bool allow_crossdb_featloc;
779 for (q = p; *q !=
'\0'; q++) {
782 else if (*q ==
'\n') {
784 if (q[1] ==
'C' && q[2] ==
'O' && q[3] ==
' ') {
790 for (q = p,
r = p; *q !=
'\0'; q++)
795 for (q = p; *q !=
'\0'; q++)
796 if ((q[0] ==
',' && q[1] ==
',') || (q[0] ==
'(' && q[1] ==
',') ||
797 (q[0] ==
',' && q[1] ==
')'))
879 for (
i = 0, q = p; *q !=
'\0'; q++) {
951 bool pat_ref =
false;
952 bool est_kwd =
false;
953 bool sts_kwd =
false;
954 bool gss_kwd =
false;
955 bool htc_kwd =
false;
956 bool fli_kwd =
false;
957 bool wgs_kwd =
false;
958 bool tpa_kwd =
false;
959 bool tsa_kwd =
false;
960 bool tls_kwd =
false;
961 bool env_kwd =
false;
962 bool mga_kwd =
false;
997 while (*bptr ==
' ' || *bptr ==
';')
1003 bptr = (
char*)
"CON";
1008 while (*bptr ==
' ' || *bptr ==
';')
1011 dataclass[3] =
'\0';
1016 dataclass[0] =
'\0';
1027 embl->SetKeywords() = keywords;
1040 for (
const string&
key : keywords) {
1041 fta_keywords_check(
key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
1076 const char* p = gbdiv.c_str();
1093 if (!
HasHtg(embl->GetKeywords())) {
1132 if (ibp->
is_mga ==
false) {
1137 }
else if (ibp->
is_mga) {
1147 }
else if (ibp->
is_tpa) {
1153 if (ibp->
is_tsa ==
false) {
1158 }
else if (ibp->
is_tsa) {
1163 if (ibp->
is_tls ==
false) {
1168 }
else if (ibp->
is_tls) {
1173 if (
i == 2 && ibp->
htg > 0 && env_kwd)
1174 ErrPostStr(
SEV_WARNING,
ERR_KEYWORD_HTGPlusENV,
"This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
1175 else if ((
i == 2 && wgs_kwd && tpa_kwd) ||
1176 (
i == 2 && tsa_kwd && tpa_kwd)) {
1177 }
else if (
i != 2 || env_kwd ==
false ||
1178 (est_kwd ==
false && gss_kwd ==
false && wgs_kwd ==
false)) {
1179 ErrPostStr(
SEV_REJECT,
ERR_KEYWORD_ConflictingKeywords,
"This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
1187 wgs_kwd ==
false && tpa_kwd ==
false && env_kwd ==
false) {
1203 if (! est_kwd && kw.find(
"EST") != string::npos) {
1206 if (! sts_kwd && kw.find(
"STS") != string::npos) {
1209 if (! gss_kwd && kw.find(
"GSS") != string::npos) {
1218 check_div(ibp->
is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->
bases, pp->
source, drop);
1227 }
else if (! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"CON")) {
1231 bool is_htc_div = ! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"HTC");
1232 bool has_htc =
HasHtc(embl->GetKeywords());
1234 if (is_htc_div && ! has_htc) {
1238 if (! is_htc_div && has_htc) {
1257 if (*p ==
'm' || *p ==
'r')
1275 if (! gbdiv.empty()) {
1279 }
else if (
StringEqu(gbdiv.c_str(),
"STS")) {
1282 }
else if (
StringEqu(gbdiv.c_str(),
"GSS")) {
1285 }
else if (
StringEqu(gbdiv.c_str(),
"HTC")) {
1289 }
else if (
StringEqu(gbdiv.c_str(),
"SYN") && bio_src &&
1317 for (
const auto& subtype : bio_src->
GetSubtype()) {
1331 embl->SetCreation_date().SetStd(*std_creation_date);
1332 embl->SetUpdate_date().SetStd(*std_update_date);
1344 for (
const string& acc : embl->SetExtra_acc()) {
1346 (acc[0] ==
'C' || acc[0] ==
'U')) {
1384 if (! gbdiv.empty()) {
1389 find_if(begin(subtype), end(subtype), [](
auto pSubSource) {
1395 }
else if (! bio_src ||
1401 if (! gbb->IsSetExtra_accessions() && ! gbb->IsSetKeywords() && ! gbb->IsSetDiv())
1440 for (
i = 0, q = bptr; *q !=
'\0'; q++) {
1452 for (p =
r + 1; *p ==
' ' || *p ==
';';)
1476 GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), bptr, pp, entry, org_ref);
1478 mol_info->ResetBiomol();
1490 if (!
tag || lst.empty())
1497 for (
const string& item : lst) {
1498 field->
SetData().SetStrs().push_back(item);
1509 if (dr_ena.empty() && dr_biosample.empty())
1514 for (
auto& descr : descrs) {
1515 if (! descr->IsUser() || ! descr->GetUser().IsSetType())
1518 const CObject_id& obj_id = descr->GetUser().GetType();
1520 if (obj_id.
IsStr() && obj_id.
GetStr() ==
"DBLink") {
1521 user_obj_ptr = &descr->SetUser();
1528 if (! dr_biosample.empty())
1532 if (! dr_ena.empty()) {
1543 user_obj->
SetType().SetStr(
"DBLink");
1549 user_obj_ptr->
SetData().push_back(field_bs);
1551 user_obj_ptr->
SetData().push_back(field_ena);
1556 descrs.push_back(descr);
1563 dbuop->
Assign(*user_obj_ptr);
1574 for (
const auto& xref : embl_block.
GetXref()) {
1575 if (! xref->IsSetDbname() || ! xref->GetDbname().IsName() ||
1576 !
StringEquN(xref->GetDbname().GetName().c_str(),
"IMGT/", 5))
1580 for (
const auto&
id : xref->GetId()) {
1581 if (id->IsStr() && ! id->GetStr().empty()) {
1591 tag->SetDb(xref->GetDbname().GetName());
1593 string& id_str =
tag->SetTag().SetStr();
1595 bool need_delimiter =
false;
1596 for (
const auto&
id : xref->GetId()) {
1597 if (id->IsStr() && ! id->GetStr().empty()) {
1601 need_delimiter =
true;
1603 id_str +=
id->GetStr();
1607 xrefs.push_back(
tag);
1615 imp.
SetKey(
"misc_feature");
1620 if (annot.empty() || ! (*annot.begin())->IsFtable()) {
1622 new_annot->
SetData().SetFtable().push_back(feat);
1624 annot.push_back(new_annot);
1627 old_annot.
SetData().SetFtable().push_front(feat);
1650 bool is_htg =
false;
1668 for (
size_t pos = 0; pos <
str.size();) {
1669 pos =
str.find(
";;", pos);
1670 if (pos == string::npos)
1674 for (
size_t i = pos;
i <
str.size() &&
str[
i] ==
';'; ++
i)
1679 while (!
str.empty()) {
1680 char c =
str.back();
1681 if (c ==
' ' || c ==
';')
1711 str1 =
"TPA_specdb:";
1718 str.replace(0, 4, str1);
1723 bioseq.
SetDescr().Set().push_back(descr);
1740 (title.empty() || !
StringEquN(title.c_str(),
"TSA:", 4))) {
1746 if (ibp->
is_tls && (title.empty() || !
StringEquN(title.c_str(),
"TLS:", 4))) {
1755 for (; dbp; dbp = dbp->
mpNext) {
1763 bioseq.
SetDescr().Set().push_back(descr);
1768 for (; dbp; dbp = dbp->
mpNext) {
1776 bioseq.
SetDescr().Set().push_back(descr);
1785 for (
auto& descr : bioseq.
SetDescr().Set()) {
1786 if (descr->IsSource()) {
1787 bio_src = &(descr->SetSource());
1789 org_ref = &bio_src->
SetOrg();
1808 ibp->
is_contig && ! mol_info->IsSetTech()) {
1811 mol_info->ResetTech();
1813 mol_info->SetTech(tech);
1816 if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1819 bioseq.
SetDescr().Set().push_back(descr);
1829 if (! dr_ena.empty() || ! dr_biosample.empty())
1832 if (embl_block.
Empty()) {
1842 if (embl_block->GetExtra_acc().empty())
1843 embl_block->ResetExtra_acc();
1853 bool hasEmblBlock =
false;
1857 bioseq.
SetDescr().Set().push_back(descr);
1858 hasEmblBlock =
true;
1885 if (hasEmblBlock && embl_block->IsSetDiv() && embl_block->GetDiv() < 15) {
1892 for (
auto& pAnnot : bioseq.
SetAnnot()) {
1893 if (pAnnot->IsFtable()) {
1894 for (
auto& pFeat : pAnnot->SetData().SetFtable()) {
1895 if (pFeat->IsSetData() && pFeat->SetData().IsBiosrc()) {
1896 auto& biosrc = pFeat->SetData().SetBiosrc();
1897 if (biosrc.IsSetOrg() &&
1898 (! biosrc.GetOrg().IsSetDb() ||
1907 }
else if (gbb && gbb->IsSetDiv()) {
1915 bioseq.
SetDescr().Set().push_back(descr);
1934 for (
auto& user_obj : user_objs) {
1937 bioseq.
SetDescr().Set().push_back(descr);
1943 for (q =
str, p = q; *p !=
'\0';) {
1944 if (*p ==
';' && (p[1] ==
' ' || p[1] ==
'~'))
1946 if (*p ==
'~' || *p ==
' ') {
1948 for (p++; *p ==
' ' || *p ==
'~';)
1959 bioseq.
SetDescr().Set().push_back(descr);
1974 if (std_creation_date.
NotEmpty()) {
1977 bioseq.
SetDescr().Set().push_back(descr);
1983 bioseq.
SetDescr().Set().push_back(descr);
1986 string crdate_str, update_str;
1987 std_creation_date->
GetDate(&crdate_str,
"%2M-%2D-%4Y");
1988 std_update_date->
GetDate(&crdate_str,
"%2M-%2D-%4Y");
2009 for (; dbp; dbp = dbp->
mpNext) {
2014 if (org_ref.
Empty())
2018 bio_src->
SetOrg(*org_ref);
2023 while (taxname_str[off_pos] !=
' ' && off_pos < taxname_str.size())
2025 while (taxname_str[off_pos] ==
' ' && off_pos < taxname_str.size())
2029 taxname_str = taxname_str.substr(off_pos);
2030 if (taxname_str ==
"Unknown.") {
2031 taxname_str = taxname_str.substr(0, taxname_str.size() - 1);
2035 for (; subdbp; subdbp = subdbp->
mpNext) {
2050 for (p = q; *p !=
'\0';)
2057 if (*p !=
' ' && *p !=
'\t' && *p !=
'\n' && *p !=
'.' &&
2079 bioseq.
SetDescr().Set().push_front(descr);
2097 while (*p ==
' ' || *p ==
';')
2110 for (
i = 0, p = entry.
mOffset; *p !=
'\0' &&
i < 4; p++)
2111 if (*p ==
';' && p[1] ==
' ')
2121 for (p++; *p ==
' ';)
2155 bool seq_long =
false;
2160 for (imax = pp->
indx,
i = 0;
i < imax;
i++) {
2173 ebp =
static_cast<EntryBlk*
>(pEntry->mpData);
2174 ptr = pEntry->mOffset;
2176 eptr = ptr + pEntry->len;
2263 else if (ibp->
htg == 4 || ibp->
htg == 1 || ibp->
htg == 2 ||
2266 }
else if (ibp->
gaps)
2270 if (pEntry->mpQscore.empty() && pp->
accver) {
2291 pEntry->mpQscore.clear();
2297 id->SetPatent(*ibp->
psip);
2298 bioseq->
SetId().push_back(
id);
2316 if (ibp->
htg == 4 || ibp->
htg == 1 || ibp->
htg == 2) {
2324 else if (! ibp->
drop)
2345 for (reject_set =
false,
i = 0;
i < imax;
i++) {
2351 if (pp->
limit != 0 && ! reject_set) {
2352 for (seq_long =
false,
i = 0;
i < imax;
i++) {
2355 ibp->
htg != 2 && ibp->
htg != 4) {
2361 for (
i = 0;
i < imax;
i++) {
2364 (ibp->
htg == 1 || ibp->
htg == 2 || ibp->
htg == 4)) {
2380 for (
i = 0;
i < imax;
i++) {
2387 for (
i = 0;
i < imax;
i++) {
2419 bool pat_ref =
false;
2420 bool est_kwd =
false;
2421 bool sts_kwd =
false;
2422 bool gss_kwd =
false;
2423 bool htc_kwd =
false;
2424 bool fli_kwd =
false;
2425 bool wgs_kwd =
false;
2426 bool tpa_kwd =
false;
2427 bool env_kwd =
false;
2428 bool mga_kwd =
false;
2429 bool tsa_kwd =
false;
2430 bool tls_kwd =
false;
2446 embl->SetKeywords().swap(ibp->
keywords);
2451 for (
const string&
key : embl->GetKeywords()) {
2452 fta_keywords_check(
key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
2462 dataclass[0] =
'\0';
2495 const char* p = gbdiv.c_str();
2512 if (!
HasHtg(embl->GetKeywords())) {
2551 if (ibp->
is_mga ==
false) {
2556 }
else if (ibp->
is_mga) {
2567 }
else if (ibp->
is_tpa) {
2573 if (ibp->
is_tsa ==
false) {
2578 }
else if (ibp->
is_tsa) {
2584 if (ibp->
is_tls ==
false) {
2589 }
else if (ibp->
is_tls) {
2595 if (
i == 2 && ibp->
htg > 0 && env_kwd)
2596 ErrPostStr(
SEV_WARNING,
ERR_KEYWORD_HTGPlusENV,
"This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
2597 else if (
i != 2 || env_kwd ==
false ||
2598 (est_kwd ==
false && gss_kwd ==
false && wgs_kwd ==
false)) {
2599 ErrPostStr(
SEV_REJECT,
ERR_KEYWORD_ConflictingKeywords,
"This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
2607 wgs_kwd ==
false && tpa_kwd ==
false && env_kwd ==
false) {
2620 if (! est_kwd &&
StringStr(kw,
"EST")) {
2623 if (! sts_kwd &&
StringStr(kw,
"STS")) {
2626 if (! gss_kwd &&
StringStr(kw,
"GSS")) {
2635 check_div(ibp->
is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->
bases, pp->
source, drop);
2644 }
else if (! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"CON")) {
2648 bool is_htc_div = ! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"HTC");
2649 bool has_htc =
HasHtc(embl->GetKeywords());
2651 if (is_htc_div && ! has_htc) {
2655 if (! is_htc_div && has_htc) {
2664 if (*
r ==
'm' || *
r ==
'r')
2685 if (! gbdiv.empty()) {
2689 }
else if (
StringEqu(gbdiv.c_str(),
"STS")) {
2692 }
else if (
StringEqu(gbdiv.c_str(),
"GSS")) {
2695 }
else if (
StringEqu(gbdiv.c_str(),
"HTC")) {
2699 }
else if (
StringEqu(gbdiv.c_str(),
"SYN") && bio_src &&
2733 embl->SetCreation_date().SetStd(*std_creation_date);
2738 embl->SetUpdate_date().SetStd(*std_update_date);
2742 if (std_update_date.
Empty() && std_creation_date.
NotEmpty())
2743 embl->SetUpdate_date().SetStd(*std_creation_date);
2750 for (
const string& acc : embl->SetExtra_acc()) {
2752 (acc[0] ==
'C' || acc[0] ==
'U')) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
void SeqToDelta(CBioseq &bioseq, Int2 tech)
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
bool fta_if_valid_biosample(const Char *id, bool dblink)
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
bool check_cds(const DataBlk &entry, Parser::EFormat format)
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
bool fta_if_valid_sra(const Char *id, bool dblink)
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
void err_install(const Indexblk *ibp, bool accver)
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
void StripSerialNumbers(TEntryList &seq_entries)
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
void PackEntries(TEntryList &seq_entries)
void fta_set_strandedness(TEntryList &seq_entries)
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
unique_ptr< unsigned char[]> GetDNAConv(void)
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
char * GetDescrComment(char *offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
void fta_sort_descr(TEntryList &seq_entries)
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
void xFreeEntry(DataBlkPtr entry)
list< string > TStringList
void ProcessCitations(TEntryList &seq_entries)
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
void GetDate(string *label, const string &format) const
Append a custom string representation of the date to the label.
@ eCompare_after
*this comes second.
@Imp_feat.hpp User-defined methods of the data storage class.
const list< string > KeywordList() const
namespace ncbi::objects::
static bool IsNa(EMol mol)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
static bool GetEmblInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
static bool CheckEmblContigEverywhere(const IndexblkPtr ibp, Parser::ESource source)
static const char * ParFlat_DRname_array[]
static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList &seq_entries)
static const char * ParFlat_Embl_DIV_array[]
static void GetEmblDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
const char * GetEmblDiv(Uint1 num)
static CRef< CUser_field > fta_create_user_field(const char *tag, TStringList &lst)
static void fta_create_imgt_misc_feat(CBioseq &bioseq, CEMBL_block &embl_block, IndexblkPtr ibp)
static bool s_DuplicatesBiosource(const CBioSource &biosource, const string &gbdiv)
static CTextseq_id & SetTextIdRef(CSeq_id &id)
static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk &entry)
static const char * ParFlat_DBname_array[]
bool GetEmblInstContig(const DataBlk &entry, CBioseq &bioseq, ParserPtr pp)
static void EmblGetDivision(IndexblkPtr ibp, const DataBlk &entry)
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
static CRef< CMolInfo > GetEmblMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
static CRef< CGB_block > GetEmblGBBlock(ParserPtr pp, const DataBlk &entry, const string &gbdiv, CBioSource *bio_src)
static const char * ParFlat_Embl_dataclass_array[]
static CRef< CEMBL_block > GetDescrEmblBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, string &gbdiv, const CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
static void FakeEmblBioSources(const DataBlk &entry, CBioseq &bioseq)
bool EmblAscii(ParserPtr pp)
static void SetXrefObjId(CEMBL_xref &xref, const string &str)
static const char * ParFlat_GBDIV_array[]
static void GetReleaseInfo(const DataBlk &entry)
static CRef< COrg_ref > GetEmblOrgRef(const DataBlkPtr dbp)
static bool s_HasTPAPrefix(const CTempString &line)
static void GetEmblBlockXref(const DataBlk &entry, XmlIndexPtr xip, const char *chentry, TStringList &dr_ena, TStringList &dr_biosample, bool *drop, CEMBL_block &embl)
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
static void GetEmblDate(Parser::ESource source, const DataBlk &entry, CRef< CDate_std > &crdate, CRef< CDate_std > &update)
#define ParFlat_COL_DATA_EMBL
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
void FinalCleanup(TEntryList &seq_entries)
#define ERR_DRXREF_DuplicatedSRA
#define ERR_SEQUENCE_BadData
#define ERR_TPA_TpaSpansMissing
#define ERR_ENTRY_LongSequence
#define ERR_FORMAT_MissingContigFeature
#define ERR_KEYWORD_ShouldNotBeTPA
#define ERR_DIVISION_BadTSADivcode
#define ERR_FORMAT_MissingSequenceData
#define ERR_DIVISION_InvalidHTCKeyword
#define ERR_DRXREF_InvalidSRA
#define ERR_KEYWORD_IllegalForCON
#define ERR_DIVISION_MissingHTGKeywords
#define ERR_QSCORE_FailedToParse
#define ERR_ENTRY_LongHTGSSequence
#define ERR_KEYWORD_MissingTSA
#define ERR_DIVISION_BadTPADivcode
#define ERR_DRXREF_InvalidBioSample
#define ERR_LOCUS_WrongTopology
#define ERR_REFERENCE_No_references
#define ERR_KEYWORD_ShouldNotBeTLS
#define ERR_ENTRY_GBBlock_not_Empty
#define ERR_KEYWORD_HTGPlusENV
#define ERR_DEFINITION_MissingTPA
#define ERR_ENTRY_Skipped
#define ERR_DEFINITION_MissingTLS
#define ERR_KEYWORD_ESTSubstring
#define ERR_KEYWORD_ConflictingKeywords
#define ERR_DIVISION_ConDivLacksContig
#define ERR_LOCATION_ContigHasNull
#define ERR_KEYWORD_ENV_NoMatchingQualifier
#define ERR_KEYWORD_ShouldNotBeTSA
#define ERR_KEYWORD_STSSubstring
#define ERR_DIVISION_UnknownDivCode
#define ERR_KEYWORD_MissingTLS
#define ERR_DEFINITION_ShouldNotBeTSA
#define ERR_SEGMENT_Rejected
#define ERR_DIVISION_MissingHTCKeyword
#define ERR_DIVISION_MappedtoCON
#define ERR_FORMAT_ContigWithSequenceData
#define ERR_DRXREF_UnknownDBname
#define ERR_DRXREF_DuplicatedBioSamples
#define ERR_KEYWORD_NoGeneExpressionKeywords
#define ERR_DEFINITION_MissingTSA
#define ERR_KEYWORD_GSSSubstring
#define ERR_DEFINITION_ShouldNotBeTPA
#define ERR_FORMAT_MissingEnd
#define ERR_KEYWORD_MissingTPA
#define ERR_DIVISION_ConDivInSegset
#define ERR_ENTRY_ParsingComplete
#define ERR_ORGANISM_NoOrganism
#define ERR_DATE_IllegalDate
#define ERR_DIVISION_HTCWrongMolType
#define ERR_KEYWORD_ShouldNotBeCAGE
#define ERR_DEFINITION_ShouldNotBeTLS
#define ERR_TSA_UnexpectedPrimaryAccession
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
#define INSDSEQ_DATABASE_REFERENCE
#define INSDSEQ_CREATE_DATE
#define INSDSEQ_UPDATE_DATE
unique_ptr< string > XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
unique_ptr< string > XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
std::list< std::string > TKeywordList
std::vector< CRef< objects::CUser_object > > TUserObjVector
bool StringEquNI(const char *s1, const char *s2, size_t n)
bool StringEquN(const char *s1, const char *s2, size_t n)
bool StringEqu(const char *s1, const char *s2)
void StringCpy(char *d, const char *s)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
char * StringNew(size_t sz)
void FtaDeletePrefix(int prefix)
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
static const char * str(char *buf, int n)
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
list< CRef< CObject_id > > TId
const TXref & GetXref(void) const
Get the Xref member data.
TXref & SetXref(void)
Assign a value to Xref data member.
TId & SetId(void)
Assign a value to Id data member.
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
list< CRef< CEMBL_xref > > TXref
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
TGenome GetGenome(void) const
Get the Genome member data.
TOrigin GetOrigin(void) const
Get the Origin member data.
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
const TOrg & GetOrg(void) const
Get the Org member data.
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
void SetOrg(TOrg &value)
Assign a value to Org data member.
@ eSubtype_environmental_sample
@ eOrigin_synthetic
purely synthetic
bool IsStr(void) const
Check if variant Str is selected.
void SetYear(TYear value)
Assign a value to Year data member.
void SetMonth(TMonth value)
Assign a value to Month data member.
TStd & SetStd(void)
Select the variant.
void SetDay(TDay value)
Assign a value to Day data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
TYear GetYear(void) const
Get the Year member data.
TMonth GetMonth(void) const
Get the Month member data.
TDay GetDay(void) const
Get the Day member data.
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
const TDiv & GetDiv(void) const
Get the Div member data.
void SetCommon(const TCommon &value)
Assign a value to Common data member.
const TDb & GetDb(void) const
Get the Db member data.
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
vector< CRef< CDbtag > > TDbxref
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
void SetData(TData &value)
Assign a value to Data data member.
void SetKey(const TKey &value)
Assign a value to Key data member.
bool IsMix(void) const
Check if variant Mix is selected.
const TMix & GetMix(void) const
Get the variant data.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_Tpg
Third Party Annot/Seq Genbank.
TRepr GetRepr(void) const
Get the Repr member data.
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeqdesc > > Tdata
TId & SetId(void)
Assign a value to Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TTitle & SetTitle(void)
Select the variant.
TPub & SetPub(void)
Select the variant.
TTopology GetTopology(void) const
Get the Topology member data.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
TGenbank & SetGenbank(void)
Select the variant.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
TTech GetTech(void) const
Get the Tech member data.
TComment & SetComment(void)
Select the variant.
void SetInst(TInst &value)
Assign a value to Inst data member.
void ResetTech(void)
Reset Tech data member.
TSource & SetSource(void)
Select the variant.
void SetTopology(TTopology value)
Assign a value to Topology data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
TUser & SetUser(void)
Select the variant.
TEmbl & SetEmbl(void)
Select the variant.
void SetRepr(TRepr value)
Assign a value to Repr data member.
EStrand
strandedness in living organism
list< CRef< CSeq_annot > > TAnnot
void SetStrand(TStrand value)
Assign a value to Strand data member.
void SetTech(TTech value)
Assign a value to Tech data member.
TMolinfo & SetMolinfo(void)
Select the variant.
TCreate_date & SetCreate_date(void)
Select the variant.
TUpdate_date & SetUpdate_date(void)
Select the variant.
@ eRepr_delta
sequence made by changes (delta) to others
@ eRepr_raw
continuous sequence
@ eTech_htgs_2
ordered High Throughput sequence contig
@ eTech_htc
high throughput cDNA
@ eTech_targeted
targeted locus sets/studies
@ eTech_sts
Sequence Tagged Site.
@ eTech_htgs_3
finished High Throughput sequence
@ eTech_htgs_1
unordered High Throughput sequence contig
@ eTech_tsa
transcriptome shotgun assembly
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_survey
one-pass genomic sequence
@ eTech_htgs_0
single genomic reads for coordination
@ eTech_fli_cdna
full length insert cDNA
@ eTech_est
Expressed Sequence Tag.
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
int fta_if_wgs_acc(string_view accession)
int CheckSTRAND(const string &str)
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Uint2 col_data)
CRef< objects::CSeq_entry > seq_entry
CRef< objects::CPatent_seq_id > psip
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
CKeywordParser & KeywordParser()
char *(* ff_get_qscore)(const char *accession, Int2 v)
bool GetGenomeInfo(CBioSource &bsp, string_view bptr)
void MaybeCutGbblockSource(TEntryList &seq_entries)
bool HasHtg(const TKeywordList &keywords)
bool HasHtc(const TKeywordList &keywords)
char * SrchTheChar(char *bptr, char *eptr, Char letter)
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void RemoveHtgPhase(TKeywordList &keywords)
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
void fta_remove_tpa_keywords(TKeywordList &kwds)
void CleanTailNoneAlphaCharInString(string &str)
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Int2 fta_StringMatch(const Char **array, string_view text)
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
void fta_StringCpy(char *dst, const char *src)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
bool IsCancelled(const TKeywordList &keywords)
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void fta_remove_env_keywords(TKeywordList &kwds)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
bool fta_tpa_keywords_check(const TKeywordList &kwds)
char * PointToNextToken(char *ptr)
CRef< CSeq_loc > xgbparseint_ver(string_view raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)