94 #define THIS_FILE "em_ascii.cpp"
106 "ANN",
"CON",
"PAT",
"EST",
"GSS",
"HTC",
"HTG",
"STS",
"TSA",
113 "FUN",
"INV",
"MAM",
"ORG",
"PHG",
"PLN",
"PRI",
"PRO",
"ROD",
114 "SYN",
"UNA",
"VRL",
"VRT",
"PAT",
"EST",
"STS",
"UNC",
"GSS",
115 "HUM",
"HTG",
"HTC",
"CON",
"ENV",
"MUS",
"TGN",
"TSA",
123 "PLN",
"INV",
"MAM",
"UNA",
"PHG",
"PLN",
"PRI",
"BCT",
"ROD",
124 "SYN",
"UNA",
"VRL",
"VRT",
"PAT",
"EST",
"STS",
"UNA",
"GSS",
125 "PRI",
"HTG",
"HTC",
"CON",
"ENV",
"ROD",
"SYN",
"TSA",
192 "GUIDETOPHARMACOLOGY",
240 "UNIPROT/SWISS-PROT",
242 "UNIPROTKB/SWISS-PROT",
298 if (update.
Empty()) {
311 if (seq_entries.empty()) {
328 if (pp->
qamode && ! seq_entries.empty())
376 for (
const auto&
id : ids) {
377 if (id->IsStr() && id->GetStr() ==
str) {
389 ids.push_back(obj_id);
416 bool valid_biosample;
440 for (eptr = bptr +
len; bptr < eptr; bptr = ptr) {
448 name.assign(bptr, ptr);
470 name =
"UniProtKB/Swiss-Prot";
472 name =
"UniProtKB/TrEMBL";
482 if (ptr && ptr < p) {
483 id.assign(bptr, ptr);
499 if (name ==
"BioSample" && !
id.
empty()) {
500 many_biosample = (!
id.empty() && ! id1.empty());
503 valid_biosample =
false;
504 if (many_biosample || ! valid_biosample) {
509 q =
StringChr(
const_cast<char*
>(drline),
'\n');
515 if (! valid_biosample)
522 for (
const string&
val : dr_biosample) {
532 dr_biosample.push_back(
id);
536 if (!
id.
empty() && ! id1.empty()) {
541 q =
StringChr(
const_cast<char*
>(drline),
'\n');
551 for (
const string&
val : dr_ena) {
561 dr_ena.push_back(
id);
570 new_xref->SetDbname().SetName(name);
578 new_xrefs.push_back(new_xref);
601 if (! new_xrefs.empty())
602 embl.
SetXref().swap(new_xrefs);
609 switch (
id.Which()) {
611 return id.SetGenbank();
617 return id.SetSwissprot();
619 return id.SetOther();
631 return id.SetGpipe();
633 return id.SetNamed_annot_track();
682 const char* bptr = dbp->
mOffset;
683 const char* eptr = bptr + dbp->
len;
686 vector<string> taxLines;
688 for (
auto line : taxLines) {
693 if (! sTaxname.empty()) {
700 if (sTaxname.empty()) {
707 auto openP = sTaxname.find(
'(');
708 if (openP != string::npos) {
709 auto sCommonName = sTaxname.substr(0, openP);
710 auto commonTerm = sCommonName.find_last_not_of(
" \t(");
711 if (commonTerm != string::npos) {
712 sCommonName = sCommonName.substr(0, commonTerm + 1);
725 if (condiv && ibp->
segnum != 0) {
765 bool allow_crossdb_featloc;
779 for (q = p; *q !=
'\0'; q++) {
782 else if (*q ==
'\n') {
784 if (q[1] ==
'C' && q[2] ==
'O' && q[3] ==
' ') {
790 for (q = p,
r = p; *q !=
'\0'; q++)
795 for (q = p; *q !=
'\0'; q++)
796 if ((q[0] ==
',' && q[1] ==
',') || (q[0] ==
'(' && q[1] ==
',') ||
797 (q[0] ==
',' && q[1] ==
')'))
881 for (
i = 0, q = p; *q !=
'\0'; q++) {
955 bool pat_ref =
false;
956 bool est_kwd =
false;
957 bool sts_kwd =
false;
958 bool gss_kwd =
false;
959 bool htc_kwd =
false;
960 bool fli_kwd =
false;
961 bool wgs_kwd =
false;
962 bool tpa_kwd =
false;
963 bool tsa_kwd =
false;
964 bool tls_kwd =
false;
965 bool env_kwd =
false;
966 bool mga_kwd =
false;
1001 while (*bptr ==
' ' || *bptr ==
';')
1007 bptr = (
char*)
"CON";
1012 while (*bptr ==
' ' || *bptr ==
';')
1015 dataclass[3] =
'\0';
1020 dataclass[0] =
'\0';
1031 embl->SetKeywords() = keywords;
1044 for (
const string&
key : keywords) {
1045 fta_keywords_check(
key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
1080 const char* p = gbdiv.c_str();
1097 if (!
HasHtg(embl->GetKeywords())) {
1136 if (ibp->
is_mga ==
false) {
1141 }
else if (ibp->
is_mga) {
1151 }
else if (ibp->
is_tpa) {
1157 if (ibp->
is_tsa ==
false) {
1162 }
else if (ibp->
is_tsa) {
1167 if (ibp->
is_tls ==
false) {
1172 }
else if (ibp->
is_tls) {
1177 if (
i == 2 && ibp->
htg > 0 && env_kwd)
1178 ErrPostEx(
SEV_WARNING,
ERR_KEYWORD_HTGPlusENV,
"This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
1179 else if ((
i == 2 && wgs_kwd && tpa_kwd) ||
1180 (
i == 2 && tsa_kwd && tpa_kwd)) {
1181 }
else if (
i != 2 || env_kwd ==
false ||
1182 (est_kwd ==
false && gss_kwd ==
false && wgs_kwd ==
false)) {
1183 ErrPostEx(
SEV_REJECT,
ERR_KEYWORD_ConflictingKeywords,
"This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
1191 wgs_kwd ==
false && tpa_kwd ==
false && env_kwd ==
false) {
1208 if (kwp && ! est_kwd) {
1212 if (kwp && ! sts_kwd) {
1216 if (kwp && ! gss_kwd) {
1226 check_div(ibp->
is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->
bases, pp->
source, drop);
1235 }
else if (! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"CON")) {
1239 bool is_htc_div = ! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"HTC");
1240 bool has_htc =
HasHtc(embl->GetKeywords());
1242 if (is_htc_div && ! has_htc) {
1246 if (! is_htc_div && has_htc) {
1265 if (*p ==
'm' || *p ==
'r')
1283 if (! gbdiv.empty()) {
1287 }
else if (
StringEqu(gbdiv.c_str(),
"STS")) {
1290 }
else if (
StringEqu(gbdiv.c_str(),
"GSS")) {
1293 }
else if (
StringEqu(gbdiv.c_str(),
"HTC")) {
1297 }
else if (
StringEqu(gbdiv.c_str(),
"SYN") && bio_src &&
1325 for (
const auto& subtype : bio_src->
GetSubtype()) {
1339 embl->SetCreation_date().SetStd(*std_creation_date);
1340 embl->SetUpdate_date().SetStd(*std_update_date);
1352 for (
const string& acc : embl->SetExtra_acc()) {
1354 (acc[0] ==
'C' || acc[0] ==
'U')) {
1392 if (! gbdiv.empty()) {
1397 find_if(begin(subtype), end(subtype), [](
auto pSubSource) {
1403 }
else if (! bio_src ||
1409 if (! gbb->IsSetExtra_accessions() && ! gbb->IsSetKeywords() && ! gbb->IsSetDiv())
1448 for (
i = 0, q = bptr; *q !=
'\0'; q++) {
1460 for (p =
r + 1; *p ==
' ' || *p ==
';';)
1484 GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), bptr, pp, entry, org_ref);
1486 mol_info->ResetBiomol();
1498 if (!
tag || lst.empty())
1505 for (
const string& item : lst) {
1506 field->
SetData().SetStrs().push_back(item);
1517 if (dr_ena.empty() && dr_biosample.empty())
1522 for (
auto& descr : descrs) {
1523 if (! descr->IsUser() || ! descr->GetUser().IsSetType())
1526 const CObject_id& obj_id = descr->GetUser().GetType();
1528 if (obj_id.
IsStr() && obj_id.
GetStr() ==
"DBLink") {
1529 user_obj_ptr = &descr->SetUser();
1536 if (! dr_biosample.empty())
1540 if (! dr_ena.empty()) {
1551 user_obj->
SetType().SetStr(
"DBLink");
1557 user_obj_ptr->
SetData().push_back(field_bs);
1559 user_obj_ptr->
SetData().push_back(field_ena);
1564 descrs.push_back(descr);
1571 dbuop->
Assign(*user_obj_ptr);
1582 for (
const auto& xref : embl_block.
GetXref()) {
1583 if (! xref->IsSetDbname() || ! xref->GetDbname().IsName() ||
1584 !
StringEquN(xref->GetDbname().GetName().c_str(),
"IMGT/", 5))
1588 for (
const auto&
id : xref->GetId()) {
1589 if (id->IsStr() && ! id->GetStr().empty()) {
1599 tag->SetDb(xref->GetDbname().GetName());
1601 string& id_str =
tag->SetTag().SetStr();
1603 bool need_delimiter =
false;
1604 for (
const auto&
id : xref->GetId()) {
1605 if (id->IsStr() && ! id->GetStr().empty()) {
1609 need_delimiter =
true;
1611 id_str +=
id->GetStr();
1615 xrefs.push_back(
tag);
1623 imp.
SetKey(
"misc_feature");
1628 if (annot.empty() || ! (*annot.begin())->IsFtable()) {
1630 new_annot->
SetData().SetFtable().push_back(feat);
1632 annot.push_back(new_annot);
1635 old_annot.
SetData().SetFtable().push_front(feat);
1661 bool is_htg =
false;
1680 for (p =
str, q = p; *q !=
'\0';) {
1689 for (p--; *p ==
' ' || *p ==
';'; p--)
1692 if (*p !=
' ' && *p !=
';')
1721 str1 =
"TPA_specdb:";
1727 if (! str1.empty()) {
1728 str1.append(
str + 4);
1736 bioseq.
SetDescr().Set().push_back(descr);
1755 (title.empty() || !
StringEquN(title.c_str(),
"TSA:", 4))) {
1761 if (ibp->
is_tls && (title.empty() || !
StringEquN(title.c_str(),
"TLS:", 4))) {
1770 for (; dbp; dbp = dbp->
mpNext) {
1778 bioseq.
SetDescr().Set().push_back(descr);
1783 for (; dbp; dbp = dbp->
mpNext) {
1791 bioseq.
SetDescr().Set().push_back(descr);
1800 for (
auto& descr : bioseq.
SetDescr().Set()) {
1801 if (descr->IsSource()) {
1802 bio_src = &(descr->SetSource());
1804 org_ref = &bio_src->
SetOrg();
1823 ibp->
is_contig && ! mol_info->IsSetTech()) {
1826 mol_info->ResetTech();
1828 mol_info->SetTech(tech);
1831 if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1834 bioseq.
SetDescr().Set().push_back(descr);
1844 if (! dr_ena.empty() || ! dr_biosample.empty())
1847 if (embl_block.
Empty()) {
1857 if (embl_block->GetExtra_acc().empty())
1858 embl_block->ResetExtra_acc();
1868 bool hasEmblBlock =
false;
1872 bioseq.
SetDescr().Set().push_back(descr);
1873 hasEmblBlock =
true;
1900 if (hasEmblBlock && embl_block->IsSetDiv() && embl_block->GetDiv() < 15) {
1907 for (
auto& pAnnot : bioseq.
SetAnnot()) {
1908 if (pAnnot->IsFtable()) {
1909 for (
auto& pFeat : pAnnot->SetData().SetFtable()) {
1910 if (pFeat->IsSetData() && pFeat->SetData().IsBiosrc()) {
1911 auto& biosrc = pFeat->SetData().SetBiosrc();
1912 if (biosrc.IsSetOrg() &&
1913 (! biosrc.GetOrg().IsSetDb() ||
1922 }
else if (gbb && gbb->IsSetDiv()) {
1930 bioseq.
SetDescr().Set().push_back(descr);
1949 for (
auto& user_obj : user_objs) {
1952 bioseq.
SetDescr().Set().push_back(descr);
1956 for (q =
str, p = q; *p !=
'\0';) {
1957 if (*p ==
';' && (p[1] ==
' ' || p[1] ==
'~'))
1959 if (*p ==
'~' || *p ==
' ') {
1961 for (p++; *p ==
' ' || *p ==
'~';)
1972 bioseq.
SetDescr().Set().push_back(descr);
1987 if (std_creation_date.
NotEmpty()) {
1990 bioseq.
SetDescr().Set().push_back(descr);
1996 bioseq.
SetDescr().Set().push_back(descr);
1999 string crdate_str, update_str;
2000 std_creation_date->
GetDate(&crdate_str,
"%2M-%2D-%4Y");
2001 std_update_date->
GetDate(&crdate_str,
"%2M-%2D-%4Y");
2023 for (; dbp; dbp = dbp->
mpNext) {
2028 if (org_ref.
Empty())
2032 bio_src->
SetOrg(*org_ref);
2038 while (taxname_str[off_pos] !=
' ' && off_pos < taxname_str.size())
2040 while (taxname_str[off_pos] ==
' ' && off_pos < taxname_str.size())
2044 taxname_str = taxname_str.substr(off_pos);
2045 if (taxname_str ==
"Unknown.") {
2046 taxname_str = taxname_str.substr(0, taxname_str.size() - 1);
2050 for (; subdbp; subdbp = subdbp->
mpNext) {
2068 for (p = q; *p !=
'\0';)
2075 if (*p !=
' ' && *p !=
'\t' && *p !=
'\n' && *p !=
'.' &&
2097 bioseq.
SetDescr().Set().push_front(descr);
2115 while (*p ==
' ' || *p ==
';')
2128 for (
i = 0, p = entry.
mOffset; *p !=
'\0' &&
i < 4; p++)
2129 if (*p ==
';' && p[1] ==
' ')
2139 for (p++; *p ==
' ';)
2173 bool seq_long =
false;
2178 for (imax = pp->
indx,
i = 0;
i < imax;
i++) {
2191 ebp =
static_cast<EntryBlk*
>(pEntry->mpData);
2192 ptr = pEntry->mOffset;
2194 eptr = ptr + pEntry->len;
2281 else if (ibp->
htg == 4 || ibp->
htg == 1 || ibp->
htg == 2 ||
2284 }
else if (ibp->
gaps)
2288 if (pEntry->mpQscore.empty() && pp->
accver) {
2309 pEntry->mpQscore.clear();
2315 id->SetPatent(*ibp->
psip);
2316 bioseq->
SetId().push_back(
id);
2334 if (ibp->
htg == 4 || ibp->
htg == 1 || ibp->
htg == 2) {
2342 else if (! ibp->
drop)
2363 for (reject_set =
false,
i = 0;
i < imax;
i++) {
2369 if (pp->
limit != 0 && ! reject_set) {
2370 for (seq_long =
false,
i = 0;
i < imax;
i++) {
2373 ibp->
htg != 2 && ibp->
htg != 4) {
2379 for (
i = 0;
i < imax;
i++) {
2382 (ibp->
htg == 1 || ibp->
htg == 2 || ibp->
htg == 4)) {
2398 for (
i = 0;
i < imax;
i++) {
2405 for (
i = 0;
i < imax;
i++) {
2439 bool pat_ref =
false;
2440 bool est_kwd =
false;
2441 bool sts_kwd =
false;
2442 bool gss_kwd =
false;
2443 bool htc_kwd =
false;
2444 bool fli_kwd =
false;
2445 bool wgs_kwd =
false;
2446 bool tpa_kwd =
false;
2447 bool env_kwd =
false;
2448 bool mga_kwd =
false;
2449 bool tsa_kwd =
false;
2450 bool tls_kwd =
false;
2466 embl->SetKeywords().swap(ibp->
keywords);
2471 for (
const string&
key : embl->GetKeywords()) {
2472 fta_keywords_check(
key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
2482 dataclass[0] =
'\0';
2515 const char* p = gbdiv.c_str();
2532 if (!
HasHtg(embl->GetKeywords())) {
2571 if (ibp->
is_mga ==
false) {
2576 }
else if (ibp->
is_mga) {
2587 }
else if (ibp->
is_tpa) {
2593 if (ibp->
is_tsa ==
false) {
2598 }
else if (ibp->
is_tsa) {
2604 if (ibp->
is_tls ==
false) {
2609 }
else if (ibp->
is_tls) {
2615 if (
i == 2 && ibp->
htg > 0 && env_kwd)
2616 ErrPostEx(
SEV_WARNING,
ERR_KEYWORD_HTGPlusENV,
"This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
2617 else if (
i != 2 || env_kwd ==
false ||
2618 (est_kwd ==
false && gss_kwd ==
false && wgs_kwd ==
false)) {
2619 ErrPostEx(
SEV_REJECT,
ERR_KEYWORD_ConflictingKeywords,
"This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
2627 wgs_kwd ==
false && tpa_kwd ==
false && env_kwd ==
false) {
2641 if (kwp && est_kwd ==
false) {
2645 if (kwp && sts_kwd ==
false) {
2649 if (kwp && gss_kwd ==
false) {
2658 check_div(ibp->
is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->
bases, pp->
source, drop);
2667 }
else if (! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"CON")) {
2671 bool is_htc_div = ! gbdiv.empty() &&
StringEqu(gbdiv.c_str(),
"HTC");
2672 bool has_htc =
HasHtc(embl->GetKeywords());
2674 if (is_htc_div && ! has_htc) {
2678 if (! is_htc_div && has_htc) {
2687 if (*
r ==
'm' || *
r ==
'r')
2708 if (! gbdiv.empty()) {
2712 }
else if (
StringEqu(gbdiv.c_str(),
"STS")) {
2715 }
else if (
StringEqu(gbdiv.c_str(),
"GSS")) {
2718 }
else if (
StringEqu(gbdiv.c_str(),
"HTC")) {
2722 }
else if (
StringEqu(gbdiv.c_str(),
"SYN") && bio_src &&
2756 embl->SetCreation_date().SetStd(*std_creation_date);
2761 embl->SetUpdate_date().SetStd(*std_update_date);
2765 if (std_update_date.
Empty() && std_creation_date.
NotEmpty())
2766 embl->SetUpdate_date().SetStd(*std_creation_date);
2773 for (
const string& acc : embl->SetExtra_acc()) {
2775 (acc[0] ==
'C' || acc[0] ==
'U')) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
void SeqToDelta(CBioseq &bioseq, Int2 tech)
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
bool fta_if_valid_biosample(const Char *id, bool dblink)
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
bool check_cds(const DataBlk &entry, Parser::EFormat format)
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
bool fta_if_valid_sra(const Char *id, bool dblink)
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
void err_install(const Indexblk *ibp, bool accver)
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
void StripSerialNumbers(TEntryList &seq_entries)
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
char * GetDescrComment(char *offset, size_t len, Int2 col_data, bool is_htg, bool is_pat)
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
void PackEntries(TEntryList &seq_entries)
void fta_set_strandedness(TEntryList &seq_entries)
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
unique_ptr< unsigned char[]> GetDNAConv(void)
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
void fta_sort_descr(TEntryList &seq_entries)
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
void xFreeEntry(DataBlkPtr entry)
list< string > TStringList
void ProcessCitations(TEntryList &seq_entries)
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
void GetDate(string *label, const string &format) const
Append a custom string representation of the date to the label.
@ eCompare_after
*this comes second.
@Imp_feat.hpp User-defined methods of the data storage class.
const list< string > KeywordList() const
namespace ncbi::objects::
static bool IsNa(EMol mol)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
static bool GetEmblInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
static bool CheckEmblContigEverywhere(const IndexblkPtr ibp, Parser::ESource source)
static const char * ParFlat_DRname_array[]
static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList &seq_entries)
static const char * ParFlat_Embl_DIV_array[]
static void GetEmblDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
const char * GetEmblDiv(Uint1 num)
static CRef< CUser_field > fta_create_user_field(const char *tag, TStringList &lst)
static void fta_create_imgt_misc_feat(CBioseq &bioseq, CEMBL_block &embl_block, IndexblkPtr ibp)
static bool s_DuplicatesBiosource(const CBioSource &biosource, const string &gbdiv)
static CTextseq_id & SetTextIdRef(CSeq_id &id)
static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk &entry)
static const char * ParFlat_DBname_array[]
bool GetEmblInstContig(const DataBlk &entry, CBioseq &bioseq, ParserPtr pp)
static void EmblGetDivision(IndexblkPtr ibp, const DataBlk &entry)
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
static CRef< CMolInfo > GetEmblMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
static CRef< CGB_block > GetEmblGBBlock(ParserPtr pp, const DataBlk &entry, const string &gbdiv, CBioSource *bio_src)
static const char * ParFlat_Embl_dataclass_array[]
static CRef< CEMBL_block > GetDescrEmblBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, string &gbdiv, const CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
static void FakeEmblBioSources(const DataBlk &entry, CBioseq &bioseq)
bool EmblAscii(ParserPtr pp)
static void SetXrefObjId(CEMBL_xref &xref, const string &str)
static const char * ParFlat_GBDIV_array[]
static void GetReleaseInfo(const DataBlk &entry)
static CRef< COrg_ref > GetEmblOrgRef(const DataBlkPtr dbp)
static bool s_HasTPAPrefix(const CTempString &line)
static void GetEmblBlockXref(const DataBlk &entry, XmlIndexPtr xip, const char *chentry, TStringList &dr_ena, TStringList &dr_biosample, bool *drop, CEMBL_block &embl)
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
static void GetEmblDate(Parser::ESource source, const DataBlk &entry, CRef< CDate_std > &crdate, CRef< CDate_std > &update)
#define ParFlat_COL_DATA_EMBL
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
void FinalCleanup(TEntryList &seq_entries)
#define ERR_DRXREF_DuplicatedSRA
#define ERR_SEQUENCE_BadData
#define ERR_TPA_TpaSpansMissing
#define ERR_ENTRY_LongSequence
#define ERR_FORMAT_MissingContigFeature
#define ERR_KEYWORD_ShouldNotBeTPA
#define ERR_DIVISION_BadTSADivcode
#define ERR_FORMAT_MissingSequenceData
#define ERR_DIVISION_InvalidHTCKeyword
#define ERR_DRXREF_InvalidSRA
#define ERR_KEYWORD_IllegalForCON
#define ERR_DIVISION_MissingHTGKeywords
#define ERR_QSCORE_FailedToParse
#define ERR_ENTRY_LongHTGSSequence
#define ERR_KEYWORD_MissingTSA
#define ERR_DIVISION_BadTPADivcode
#define ERR_DRXREF_InvalidBioSample
#define ERR_LOCUS_WrongTopology
#define ERR_REFERENCE_No_references
#define ERR_KEYWORD_ShouldNotBeTLS
#define ERR_ENTRY_GBBlock_not_Empty
#define ERR_KEYWORD_HTGPlusENV
#define ERR_DEFINITION_MissingTPA
#define ERR_ENTRY_Skipped
#define ERR_DEFINITION_MissingTLS
#define ERR_KEYWORD_ESTSubstring
#define ERR_KEYWORD_ConflictingKeywords
#define ERR_DIVISION_ConDivLacksContig
#define ERR_LOCATION_ContigHasNull
#define ERR_KEYWORD_ENV_NoMatchingQualifier
#define ERR_KEYWORD_ShouldNotBeTSA
#define ERR_KEYWORD_STSSubstring
#define ERR_DIVISION_UnknownDivCode
#define ERR_KEYWORD_MissingTLS
#define ERR_DEFINITION_ShouldNotBeTSA
#define ERR_SEGMENT_Rejected
#define ERR_DIVISION_MissingHTCKeyword
#define ERR_DIVISION_MappedtoCON
#define ERR_FORMAT_ContigWithSequenceData
#define ERR_DRXREF_UnknownDBname
#define ERR_DRXREF_DuplicatedBioSamples
#define ERR_KEYWORD_NoGeneExpressionKeywords
#define ERR_DEFINITION_MissingTSA
#define ERR_KEYWORD_GSSSubstring
#define ERR_DEFINITION_ShouldNotBeTPA
#define ERR_FORMAT_MissingEnd
#define ERR_KEYWORD_MissingTPA
#define ERR_DIVISION_ConDivInSegset
#define ERR_ENTRY_ParsingComplete
#define ERR_ORGANISM_NoOrganism
#define ERR_DATE_IllegalDate
#define ERR_DIVISION_HTCWrongMolType
#define ERR_KEYWORD_ShouldNotBeCAGE
#define ERR_DEFINITION_ShouldNotBeTLS
#define ERR_TSA_UnexpectedPrimaryAccession
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
char * XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
#define INSDSEQ_DATABASE_REFERENCE
#define INSDSEQ_CREATE_DATE
#define INSDSEQ_UPDATE_DATE
char * XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
std::list< std::string > TKeywordList
std::vector< CRef< objects::CUser_object > > TUserObjVector
char * StringSave(const char *s)
bool StringEquNI(const char *s1, const char *s2, size_t n)
bool StringEquN(const char *s1, const char *s2, size_t n)
bool StringEqu(const char *s1, const char *s2)
void StringCpy(char *d, const char *s)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
void FtaDeletePrefix(int prefix)
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
list< CRef< CObject_id > > TId
const TXref & GetXref(void) const
Get the Xref member data.
TXref & SetXref(void)
Assign a value to Xref data member.
TId & SetId(void)
Assign a value to Id data member.
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
list< CRef< CEMBL_xref > > TXref
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
TGenome GetGenome(void) const
Get the Genome member data.
TOrigin GetOrigin(void) const
Get the Origin member data.
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
const TOrg & GetOrg(void) const
Get the Org member data.
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
void SetOrg(TOrg &value)
Assign a value to Org data member.
@ eSubtype_environmental_sample
@ eOrigin_synthetic
purely synthetic
bool IsStr(void) const
Check if variant Str is selected.
void SetYear(TYear value)
Assign a value to Year data member.
void SetMonth(TMonth value)
Assign a value to Month data member.
TStd & SetStd(void)
Select the variant.
void SetDay(TDay value)
Assign a value to Day data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
TYear GetYear(void) const
Get the Year member data.
TMonth GetMonth(void) const
Get the Month member data.
TDay GetDay(void) const
Get the Day member data.
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
const TDiv & GetDiv(void) const
Get the Div member data.
void SetCommon(const TCommon &value)
Assign a value to Common data member.
const TDb & GetDb(void) const
Get the Db member data.
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
vector< CRef< CDbtag > > TDbxref
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
void SetData(TData &value)
Assign a value to Data data member.
void SetKey(const TKey &value)
Assign a value to Key data member.
bool IsMix(void) const
Check if variant Mix is selected.
const TMix & GetMix(void) const
Get the variant data.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
@ e_Tpe
Third Party Annot/Seq EMBL.
@ e_Tpd
Third Party Annot/Seq DDBJ.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_Tpg
Third Party Annot/Seq Genbank.
TRepr GetRepr(void) const
Get the Repr member data.
void SetData(TData &value)
Assign a value to Data data member.
list< CRef< CSeqdesc > > Tdata
TId & SetId(void)
Assign a value to Id data member.
const TInst & GetInst(void) const
Get the Inst member data.
TTitle & SetTitle(void)
Select the variant.
TPub & SetPub(void)
Select the variant.
TTopology GetTopology(void) const
Get the Topology member data.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
TGenbank & SetGenbank(void)
Select the variant.
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
TTech GetTech(void) const
Get the Tech member data.
TComment & SetComment(void)
Select the variant.
void SetInst(TInst &value)
Assign a value to Inst data member.
void ResetTech(void)
Reset Tech data member.
TSource & SetSource(void)
Select the variant.
void SetTopology(TTopology value)
Assign a value to Topology data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
TUser & SetUser(void)
Select the variant.
TEmbl & SetEmbl(void)
Select the variant.
void SetRepr(TRepr value)
Assign a value to Repr data member.
EStrand
strandedness in living organism
list< CRef< CSeq_annot > > TAnnot
void SetStrand(TStrand value)
Assign a value to Strand data member.
void SetTech(TTech value)
Assign a value to Tech data member.
TMolinfo & SetMolinfo(void)
Select the variant.
TCreate_date & SetCreate_date(void)
Select the variant.
TUpdate_date & SetUpdate_date(void)
Select the variant.
@ eRepr_delta
sequence made by changes (delta) to others
@ eRepr_raw
continuous sequence
@ eTech_htgs_2
ordered High Throughput sequence contig
@ eTech_htc
high throughput cDNA
@ eTech_targeted
targeted locus sets/studies
@ eTech_sts
Sequence Tagged Site.
@ eTech_htgs_3
finished High Throughput sequence
@ eTech_htgs_1
unordered High Throughput sequence contig
@ eTech_tsa
transcriptome shotgun assembly
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_survey
one-pass genomic sequence
@ eTech_htgs_0
single genomic reads for coordination
@ eTech_fli_cdna
full length insert cDNA
@ eTech_est
Expressed Sequence Tag.
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
int CheckSTRAND(const string &str)
int fta_if_wgs_acc(const CTempString &accession)
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
static const char * str(char *buf, int n)
CRef< objects::CSeq_entry > seq_entry
CRef< objects::CPatent_seq_id > psip
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
CKeywordParser & KeywordParser()
char *(* ff_get_qscore)(const char *accession, Int2 v)
void MaybeCutGbblockSource(TEntryList &seq_entries)
bool GetGenomeInfo(CBioSource &bsp, const Char *bptr)
bool HasHtg(const TKeywordList &keywords)
bool HasHtc(const TKeywordList &keywords)
char * GetBlkDataReplaceNewLine(char *bptr, char *eptr, Int2 start_col_data)
char * SrchTheChar(char *bptr, char *eptr, Char letter)
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void RemoveHtgPhase(TKeywordList &keywords)
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
void fta_remove_tpa_keywords(TKeywordList &kwds)
Int2 fta_StringMatch(const Char **array, const Char *text)
void CleanTailNoneAlphaCharInString(string &str)
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
void fta_StringCpy(char *dst, const char *src)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
bool IsCancelled(const TKeywordList &keywords)
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
void fta_remove_env_keywords(TKeywordList &kwds)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
bool fta_tpa_keywords_check(const TKeywordList &kwds)
char * PointToNextToken(char *ptr)
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)