85 #define THIS_FILE "sp_ascii.cpp"
92 "ALTERNATIVE PRODUCTS:",
93 "BIOPHYSICOCHEMICAL PROPERTIES:",
95 "CATALYTIC ACTIVITY:",
99 "DEVELOPMENTAL STAGE:",
101 "DISRUPTION PHENOTYPE:",
103 "ENZYME REGULATION:",
107 "MASS SPECTROMETRY:",
116 "SUBCELLULAR LOCATION:",
118 "TISSUE SPECIFICITY:",
170 {
"MOD_RES",
ParFlatSPSites, 10,
"5-glutamyl glycerylphosphorylethanolamine" },
227 {
"MOD_RES",
ParFlatSPSites, 13,
"Glutamate methyl ester (Gln)" },
228 {
"MOD_RES",
ParFlatSPSites, 13,
"Glutamate methyl ester (Glu)" },
246 {
"MOD_RES",
ParFlatSPSites, 13,
"N6,N6,N6-trimethyl-5-hydroxylysine" },
251 {
"MOD_RES",
ParFlatSPSites, 13,
"N6-poly(methylaminopropyl)lysine" },
273 {
"MOD_RES",
ParFlatSPSites, 14,
"(3R,4R)-3,4-dihydroxyproline" },
274 {
"MOD_RES",
ParFlatSPSites, 14,
"(3R,4R)-4,5-dihydroxyisoleucine" },
275 {
"MOD_RES",
ParFlatSPSites, 14,
"(3R,4S)-3,4-dihydroxyproline" },
279 {
"MOD_RES",
ParFlatSPSites, 14,
"(3S,4R)-3,4-dihydroxyisoleucine" },
283 {
"MOD_RES",
ParFlatSPSites, 14,
"3',4'-dihydroxyphenylalanine" },
296 {
"MOD_RES",
ParFlatSPSites, 14,
"5-hydroxy-3-methylproline (Ile)" },
304 {
"MOD_RES",
ParFlatSPSites, 14,
"N6-(3,6-diaminohexanoyl)-5-hydroxylysine" },
311 {
"MOD_RES",
ParFlatSPSites, 17,
"Pyrrolidone carboxylic acid (Glu)" },
359 {
"MOTIF",
ParFlatSPRegions, -1,
"Short sequence motif of biological interest" },
367 {
nullptr, 0, 0,
nullptr }
372 #define ParFlatSPSitesModB 9
373 #define ParFlatSPSitesModE 174
375 #define COPYRIGHT "This Swiss-Prot entry is copyright."
376 #define COPYRIGHT1 "Copyrighted by the UniProt Consortium,"
378 #define SPDE_RECNAME 000001
379 #define SPDE_ALTNAME 000002
380 #define SPDE_SUBNAME 000004
381 #define SPDE_FLAGS 000010
382 #define SPDE_INCLUDES 000020
383 #define SPDE_CONTAINS 000040
384 #define SPDE_FULL 000100
385 #define SPDE_SHORT 000200
386 #define SPDE_EC 000400
387 #define SPDE_ALLERGEN 001000
388 #define SPDE_BIOTECH 002000
389 #define SPDE_CD_ANTIGEN 004000
390 #define SPDE_INN 010000
471 "STRAIN",
"SUBSTRAIN",
"TYPE",
"SUBTYPE",
"VAR.",
"SEROTYPE",
472 "SEROGROUP",
"SEROVAR",
"CULTIVAR",
"PATHOVAR",
"CHEMOVAR",
"BIOVAR",
473 "BIOTYPE",
"GROUP",
"SUBGROUP",
"ISOLATE",
"ACRONYM",
"DOSAGE",
474 "NAT_HOST",
"SUBSP.",
nullptr
478 "2DBASE-ECOLI",
"AARHUS/GHENT-2DPAGE",
"AGD",
479 "ANU-2DPAGE",
"BURULIST",
"CARBBANK",
480 "CMR",
"CORNEA-2DPAGE",
"DICTYDB",
481 "DOMO",
"ECO2DBASE",
"GCRDB",
482 "GENEVESTIGATOR",
"GENEW",
"GENOMEREVIEWS",
483 "GERMONLINE",
"HIV",
"HSC-2DPAGE",
484 "HSSP",
"IPI",
"LINKHUB",
485 "LISTILIST",
"MAIZE-2DPAGE",
"MENDEL",
486 "MGD",
"MYPULIST",
"NMPDR",
487 "PATHWAY_INTERACTION_DB",
"PHCI-2DPAGE",
"PHOSSITE",
488 "PPTASEDB",
"PROTCLUSTDB",
"PHOTOLIST",
489 "PMMA-2DPAGE",
"RAT-HEART-2DPAGE",
"RZPD-PROTEXP",
490 "SAGALIST",
"SIENA-2DPAGE",
"STYGENE",
491 "SUBTILIST",
"TIGR",
"TRANSFAC",
492 "WORMPEP",
"YEPD",
"YPD",
497 "ALLERGOME",
"ARACHNOSERVER",
"ARAPORT",
498 "ARRAYEXPRESS",
"BEEBASE",
"BGD",
499 "BGEE",
"BINDINGDB",
"BIOCYC",
500 "BIOGRID",
"BIOMUTA",
"BRENDA",
501 "CAZY",
"CCDS",
"CDD",
502 "CGD",
"CHEMBL",
"CHITARS",
503 "CLEANEX",
"COLLECTF",
"COMPLUYEAST-2DPAGE",
504 "CONOSERVER",
"CTD",
"CYGD",
505 "DBSNP",
"DEPOD",
"DICTYBASE",
506 "DIP",
"DISGENET",
"DISPROT",
507 "DMDM",
"DNASU",
"DOSAC-COBS-2DPAGE",
508 "DRUGBANK",
"ECHOBASE",
"ECOGENE",
509 "EGGNOG",
"EMBL",
"ENSEMBL",
510 "ENSEMBLBACTERIA",
"ENSEMBLFUNGI",
"ENSEMBLMETAZOA",
511 "ENSEMBLPLANTS",
"ENSEMBLPROTISTS",
"EPD",
512 "ESTHER",
"EUHCVDB",
"EUPATHDB",
513 "EUROPEPMC",
"EVOLUTIONARYTRACE",
"EXPRESSIONATLAS",
514 "FLYBASE",
"GENE3D",
"GENECARDS",
515 "GENEDB",
"GENEDB_SPOMBE",
"GENEFARM",
516 "GENEID",
"GENEREVIEWS",
"GENETREE",
517 "GENEVISIBLE",
"GENEWIKI",
"GENOLIST",
518 "GENOMERNAI",
"GK",
"GLYCOSUITEDB",
519 "GRAINGENES",
"GO",
"GRAMENE",
520 "GUIDETOPHARMACOLOGY",
"H-INVDB",
"HAMAP",
521 "HGNC",
"HOGENOM",
"HOVERGEN",
522 "HPA",
"IMGT/GENE-DB",
"IMGT/HLA",
523 "IMGT/LIGM",
"IMGT_GENE-DB",
"INPARANOID",
524 "INTACT",
"INTERPRO",
"IPD-KIR",
525 "IPTMNET",
"KEGG",
"KO",
526 "LEGIOLIST",
"LEPROMA",
"MAIZEDB",
527 "MAIZEGDB",
"MALACARDS",
"MAXQB",
528 "MEROPS",
"MGI",
"MIM",
529 "MINT",
"MIRBASE",
"MOONPROT",
530 "MYCOCLAP",
"NEXTBIO",
"NEXTPROT",
531 "OGP",
"OMA",
"OPENTARGETS",
532 "ORPHANET",
"ORTHODB",
"PANTHER",
533 "PATRIC",
"PAXDB",
"PDB",
534 "PDBSUM",
"PEPTIDEATLAS",
"PEROXIBASE",
535 "PFAM",
"PHARMGKB",
"PHOSPHOSITE",
536 "PHOSPHOSITEPLUS",
"PHYLOMEDB",
"PIR",
537 "PIRSF",
"PMAP-CUTDB",
"POMBASE",
538 "PR",
"PR2",
"PRIDE",
539 "PRINTS",
"PRO",
"PRODOM",
540 "PROMEX",
"PROSITE",
"PROTEINMODELPORTAL",
541 "PROTEOMES",
"PSEUDOCAP",
"REACTOME",
542 "REBASE",
"REFSEQ",
"REPRODUCTION-2DPAGE",
543 "RGD",
"RZPD",
"SABIO-RK",
544 "SFLD",
"SGD",
"SIGNALINK",
545 "SIGNALLINK",
"SIGNOR",
"SMART",
546 "SMR",
"STRING",
"SUPFAM",
547 "SWISS-2DPAGE",
"SWISSLIPIDS",
"SWISSPALM",
548 "TAIR",
"TCDB",
"TIGRFAMS",
549 "TOPDOWNPROTEOMICS",
"TREEFAM",
"TUBERCULIST",
550 "UCD-2DPAGE",
"UCSC",
"UNICARBKB",
551 "UNIGENE",
"UNILIB",
"UNIPATHWAY",
552 "UNITE",
"VBASE2",
"VECTORBASE",
553 "VEGA-TR",
"VEGA-GN",
"VGNC",
554 "WBPARASITE",
"WORLD-2DPAGE",
"WORMBASE",
555 "XENBASE",
"ZFIN",
nullptr
559 "CHLOROPLAST",
"CYANELLE",
"MITOCHONDRION",
"PLASMID",
"NUCLEOMORPH",
560 "HYDROGENOSOME",
"APICOPLAST",
"CHROMATOPHORE",
561 "ORGANELLAR CHROMATOPHORE",
nullptr
565 "Evidence at protein level",
566 "Evidence at transcript level",
567 "Inferred from homology",
588 if (delim && *delim !=
'\0' && ! dest.empty())
608 tag->SetTag().SetStr(
str);
659 pdb_seq_id->SetChain(chain);
678 if (! mol || ! chain)
682 for (bad =
false, got =
false, q = chain; *q !=
'\0'; q = p) {
683 while (*q ==
' ' || *q ==
',')
685 for (p = q; *p !=
'\0' && *p !=
' ' && *p !=
',';)
695 for (
r = q; *
r !=
'\0';
r++) {
698 if (
r[1] !=
'/' &&
r[1] !=
'\0') {
699 while (*
r !=
'/' && *
r !=
'\0')
737 pdb_seq_id->SetRel(*date);
750 if (choice < 1 || choice > 4)
757 }
else if (choice == 4) {
769 for (; dbp; dbp = dbp->
mpNext)
772 for (; subdbp; subdbp = subdbp->
mpNext)
776 for (p += 8; *p ==
' ';)
797 if (gmod == 7 || gmod == 8)
817 while (*eptr ==
' ' && eptr >
offset)
821 pIndex->
bases = atoi(eptr + 1);
822 while (*eptr ==
' ' && eptr >
offset)
854 for (; dbp; dbp = dbp->
mpNext) {
895 if (str_.find(
"(GENE NAME") != string::npos) {
921 for (p = ptr + shift; *p ==
' ';)
924 if (*p == symb || *p ==
'\0') {
929 while (*p ==
'.' || *p ==
'-' || *p ==
'n' ||
isdigit(*p) != 0)
932 while (*p ==
' ' || *p ==
')')
943 if (ptr[8] ==
'\0') {
952 for (q = ptr + 8;;) {
955 if (! q || (p && q > p))
961 else if (q[9] ==
'\0')
963 else if (q[9] ==
's' || q[9] ==
'S') {
966 else if (q[10] ==
'\0')
979 if (ptr[8] ==
'\0') {
1008 for (p = ptr + 8; *p !=
'\0' && *p !=
')';)
1010 while (*p ==
' ' || *p ==
')')
1019 if (! s.empty() && s.back() ==
'.') {
1021 while (! s.empty() && s.back() ==
' ')
1038 for (
size_t i = 0;
i < dbp->
len;
i++)
1043 for (q = dbp->
mOffset; *q !=
'\0';) {
1048 for (q += 5; *q !=
'\n' && *q !=
'\0'; q++)
1056 while (*p ==
'.' || *p ==
' ' || *p ==
'\t') {
1076 if (! line || line[0] ==
'\0')
1078 for (p = line; *p ==
' ' || *p ==
'\t' || *p ==
'.' || *p ==
',';)
1094 for (
r = p - 1; *
r ==
' ' || *
r ==
'\t';
r--) {
1105 for (p++; *p ==
' ' || *p ==
'\t';)
1108 for (
i = 1; *p !=
'\0'; p++) {
1124 for (
r = p - 1; *
r ==
' ' || *
r ==
'\t';
r--) {
1155 size_t len = taxname.size();
1159 const Char* p = taxname.c_str() +
len - 3;
1160 if ((p[0] ==
' ' || p[0] ==
'\t') && (p[1] ==
's' || p[1] ==
'S') &&
1161 (p[2] ==
'p' || p[2] ==
'P') && p[3] ==
'\0') {
1188 if (sosp->
name && sosp->
name[0] !=
'\0')
1191 for (synsp = sosp->
syn; synsp; synsp = synsp->
next) {
1193 if (! p || *p ==
'\0')
1200 i = (*q ==
'C' || *q ==
'c') ? 5 : 7;
1205 if ((q == p || q[0] ==
' ' || q[0] ==
'\t') &&
1206 (q[
i] ==
' ' || q[
i] ==
'\t' || q[
i] ==
'\0')) {
1219 if ((
StringEquNI(p,
"PV.", 3) && (p[3] ==
' ' || p[3] ==
'\t' || p[3] ==
'\0')) ||
1232 for (q = p; *p !=
'\0' && *p !=
' ' && *p !=
'\t';)
1235 org_ref->
SetSyn().push_back(q);
1240 for (q = p + 1; *q ==
' ' || *q ==
'\t';)
1247 org_ref->
SetSyn().push_back(q);
1267 if (*p ==
' ' && (p[
i] ==
' ' || p[
i] ==
'\t' || p[
i] ==
'\0')) {
1282 if (! taxname.empty())
1307 for (ssp = sosp->
syn; ssp; ssp = tssp) {
1327 for (; dbp; dbp = dbp->
mpNext)
1350 if (!
StringEquNI(line,
"\nOH NCBI_TaxID=", 17)) {
1370 for (p += 17, q = p; *q ==
' ';)
1374 if ((!
r ||
r > p) && p) {
1377 for (p--; *p ==
';' || *p ==
' ';)
1380 for (
r = q; *
r >=
'0' && *
r <=
'9';)
1388 for (p++; *p ==
' ' || *p ==
';';)
1395 while ((*
r ==
' ' || *
r ==
'.' || *
r ==
'\0') &&
r > p)
1397 if (*
r !=
'\0' && *
r !=
'.' && *
r !=
' ')
1440 for (; subdbp; subdbp = subdbp->
mpNext) {
1458 for (q = p; *q ==
' ';)
1464 for (p = line + 16; *p ==
' ';)
1470 for (q = p; *q >=
'0' && *q <=
'9';)
1472 if (*q ==
' ' || *q ==
'\0')
1474 if (taxid <=
ZERO_TAX_ID || (*q !=
' ' && *q !=
'\0')) {
1502 for (dbp = entry; dbp; dbp = dbp->
mpNext) {
1515 if (line_OS && line_OS[0] !=
'\0') {
1517 if (sosp && sosp->
name && sosp->
name[0] !=
'\0') {
1525 if (org_ref.
NotEmpty() && line_OC && line_OC[0] !=
'\0') {
1539 char* eptr =
nullptr;
1545 for (; dbp; dbp = dbp->
mpNext) {
1550 for (; subdbp; subdbp = subdbp->
mpNext) {
1570 for (ptr =
str; *ptr !=
'\n' && *ptr !=
' ';)
1574 plasms.push_back(
string(
str, ptr));
1594 if (! p || *p ==
'\0')
1598 if (*p ==
'\0' || *p ==
'\n')
1600 if ((*p ==
';' || *p ==
'.') && (p[1] ==
' ' || p[1] ==
'\n'))
1604 if (*p ==
'\0' || *p ==
'\n')
1611 while (*p ==
' ' || *p ==
';' || *p ==
'.')
1624 char* end =
nullptr;
1628 if (!
str || *
str ==
'\0')
1631 if (
str[0] ==
'-') {
1636 lID = strtoll(
str + 1, &end, 10);
1637 if ((lID == 0 &&
str + 1 == end) || (lID == LLONG_MAX && errno == ERANGE)) {
1647 }
else if (*
str ==
'E' || *
str ==
'D') {
1650 tag->SetTag().SetStr(
str);
1671 if (
str[0] ==
'-' &&
str[1] ==
'\0')
1675 for (vnp = *
head; vnp; vnp = vnp->
next)
1683 for (vnp = *
head; vnp; vnp = vnp->
next) {
1705 for (CSP_block::TSeqref::iterator cur_ref = refs.begin(); cur_ref != refs.end(); ++cur_ref) {
1706 if ((*cur_ref)->Which() !=
CSeq_id::e_Pdb || (*cur_ref)->GetPdb().IsSetRel())
1712 CSP_block::TSeqref::iterator next_ref = cur_ref;
1714 for (++next_ref; next_ref != refs.end();) {
1716 (*next_ref)->GetPdb().IsSetRel())
1719 const CPDB_seq_id& next_id = (*next_ref)->GetPdb();
1727 if (! got && cur_id.
GetChain() == 32) {
1741 next_ref = refs.erase(next_ref);
1760 if (! embl_acc_list || ! embl_acc_list->
next->
next)
1763 for (vnp = embl_acc_list; vnp; vnp = vnp->
next->
next) {
1767 for (p = q + 1; *p >=
'0' && *p <=
'9';)
1873 bool check_embl_prot;
1896 embl_vnp = embl_acc_list;
1897 check_embl_prot =
false;
1913 if (! token1 || ! token2 || ! token3 ||
1951 }
else if (
AddToList(&acc_list, token2)) {
1953 p ? (
Int2) atoi(p + 1) : 0));
1961 if (token3[0] >=
'A' && token3[0] <=
'Z' &&
1962 token3[1] >=
'A' && token3[1] <=
'Z') {
1966 for (q = p + 1; *q >=
'0' && *q <=
'9';)
1968 if (q == p + 1 || *q !=
'\0')
1980 embl_vnp = embl_vnp->
next;
1982 embl_vnp = embl_vnp->
next;
1986 check_embl_prot =
true;
2007 if (
AddToList(&ens_tran_list, token2)) {
2013 if (!
AddToList(&ens_prot_list, token3)) {
2021 if (token4 &&
AddToList(&ens_gene_list, token4)) {
2028 if (token2[0] >=
'A' && token2[0] <=
'Z' &&
2029 token2[1] >=
'A' && token2[1] <=
'Z') {
2033 for (q = p + 1; *q >=
'0' && *q <=
'9';)
2035 if (q == p + 1 || *q !=
'\0')
2057 token1 =
"Reactome";
2067 if (
tag.NotEmpty()) {
2070 for (
const auto& cur_tag : spb.
SetDbref()) {
2071 if (
tag->Match(*cur_tag)) {
2082 if (embl_acc_list->
next) {
2083 if (check_embl_prot)
2087 delete embl_acc_list;
2101 if (pdbold && pdbnew) {
2106 if (pdbnew && spb.
SetSeqref().size() > 1)
2150 for (q =
offset, tvnp = vnp;;) {
2166 vnp->
next =
nullptr;
2175 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
2190 for (p += 16; *p ==
' ';)
2192 for (q = p; *p >=
'0' && *p <=
'9';)
2194 if (*p ==
'.' && p[1] ==
'\0') {
2205 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
2227 }
else if (
first > 1) {
2230 }
else if (second == 0) {
2233 }
else if (second > 1) {
2236 }
else if (third == 0) {
2239 }
else if (third > 1) {
2242 }
else if (std_crdate.
Empty()) {
2245 }
else if (std_sequpd.
Empty()) {
2248 }
else if (std_annotupd.
Empty()) {
2251 }
else if (ver_num && *ver_num < 1) {
2257 crdate.
SetStd(*std_crdate);
2258 sequpd.SetStd(*std_sequpd);
2259 annotupd.
SetStd(*std_annotupd);
2289 if (reviewed ||
StringEquNI(bptr,
"standard", 8)) {
2291 }
else if (
StringEquNI(bptr,
"preliminary", 11) ||
2305 if (spb->SetExtra_acc().empty())
2306 spb->ResetExtra_acc();
2312 i =
GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(), &ver_num);
2314 i =
GetSPDate(pp, entry, spb->SetCreated(), spb->SetSequpd(), spb->SetAnnotupd(),
nullptr);
2317 if (spb->SetPlasnm().empty())
2326 for (
auto& cur_id : bioseq.
SetId()) {
2327 if (! cur_id->IsSwissprot())
2335 id.SetRelease(
"reviewed");
2337 id.SetRelease(
"reviewed");
2345 bioseq.
SetDescr().Set().push_back(descr);
2364 for (p = line; *p ==
' ';)
2373 while (*p !=
'\0') {
2379 if (p > line && *(p - 1) !=
'-')
2381 for (++p; *p ==
' ';)
2384 for (p += 3; *p ==
' ';)
2391 for (--q; q > com && *q ==
' ';)
2404 descrs.push_back(descr);
2442 for (q = p; q >
offset && *q !=
'\n';)
2470 if (
count == 0 && cla != 2)
2483 for (p =
tmp; p > bptr && *p !=
'\n';)
2504 for (
const auto& cur_ref : spb.
GetSeqref()) {
2505 if (! cur_ref->IsPir())
2509 text_id->Assign(cur_ref->GetPir());
2512 rep_id->
SetPir(*text_id);
2514 rep_ids.push_back(rep_id);
2517 if (rep_ids.empty())
2532 if (! orpname && ! ohname)
2534 if (! orpname || ! ohname)
2537 for (p = orpname, q = ohname; *p !=
'\0' && *q !=
'\0'; p++, q++) {
2539 if (chp >=
'a' && chp <=
'z')
2542 if (chq >=
'a' && chq <=
'z')
2555 if (*q ==
'(' || *q ==
'\0')
2566 bool fragment =
false;
2580 if (! title.empty()) {
2583 descr.
Set().push_back(desc_new);
2596 for (
const string& cur_acc : spb->GetExtra_acc()) {
2601 text_id->SetAccession(cur_acc);
2605 rep_ids.push_back(rep_id);
2608 if (! rep_ids.empty()) {
2614 if (spb->CanGetCreated()) {
2618 descr.
Set().push_back(create_date_descr);
2621 bool has_update_date = spb->CanGetAnnotupd() || spb->CanGetSequpd();
2624 if (has_update_date) {
2625 if (spb->CanGetAnnotupd() && spb->CanGetSequpd()) {
2627 }
else if (spb->CanGetAnnotupd())
2628 upd_date.
Assign(spb->GetAnnotupd());
2630 upd_date.
Assign(spb->GetSequpd());
2635 descr.
Set().push_back(upd_date_descr);
2638 if (spb->CanGetCreated() && has_update_date &&
2640 string upd_date_str, create_date_str;
2642 upd_date.
GetDate(&upd_date_str);
2643 spb->GetCreated().GetDate(&create_date_str);
2662 if (org_ref.
Empty())
2669 bio_src->
SetOrg(*org_ref);
2675 if (bio_src.
Empty()) {
2681 bio_src->
SetOrg(*org_ref);
2689 if (bio_src.
Empty())
2696 for (tvhp = vhp; tvhp; tvhp = vhp) {
2711 if (org_ref_cur.
Empty()) {
2717 vector<Char> org_taxname;
2719 const string& cur_taxname = org_ref_cur->
GetTaxname();
2720 org_taxname.assign(cur_taxname.begin(), cur_taxname.end());
2723 org_taxname.push_back(0);
2728 "OH-line HostName \"%s\" does not match NCBI organism name \"%s\" obtained by lookup of NCBI TaxID \"%d\".",
2741 descr.
Set().push_back(bio_src_desc);
2753 descr.
Set().push_back(mol_info_descr);
2758 for (; dbp; dbp = dbp->
mpNext) {
2765 pub_desc_descr->
SetPub(*pub_desc);
2767 descr.
Set().push_back(pub_desc_descr);
2813 for (; spfip; spfip =
next) {
2822 if (! fip1 && ! fip2)
2825 if (! fip1 || ! fip2 ||
2826 fip1->
key != fip2->
key ||
2828 fip1->
to != fip2->
to ||
2842 if (! spfip || ! spfip->
next)
2845 for (; spfip && spfip->
next; spfip = spfip->
next) {
2847 for (fip = spfip->
next; fip; fip = fipnext) {
2848 fipnext = fip->
next;
2873 if (! p || p == temp ||
2874 (*(p - 1) !=
' ' && *(p - 1) !=
'\n') || (p[2] !=
' ' && p[2] !=
'\n')) {
2880 for (p--; p > temp && (*p ==
' ' || *p ==
'\n');)
2882 if (*p < 'A' || *p >
'Z') {
2889 while (p > temp && (*p ==
'\n' || (*p >=
'A' && *p <=
'Z')))
2896 while (*p >=
'A' && *p <=
'Z' && p < end)
2900 for (q = p; *p ==
'\n'; p++)
2905 while (*p ==
' ' || *p ==
'\n')
2907 for (p += 2; *p ==
' ' || *p ==
'\n';)
2910 if (*p < 'A' || *p >
'Z') {
2916 for (q = p; *q ==
'\n' || (*q >=
'A' && *q <=
'Z');)
2918 if (q > p && *(q - 1) ==
'\n') {
2919 for (q--; *q ==
'\n' && q > p;)
2927 while (*p >=
'A' && *p <=
'Z' && p < end)
2931 for (q = p; *p ==
'\n'; p++)
2936 for (p = temp; *p !=
'\0'; p++)
2959 const char* defdelim;
2990 while (bptr < eptr && (endline =
SrchTheChar(bptr, eptr,
'\n'))) {
2993 for (p = bptr,
i = 0; *p !=
' ' && *p !=
'\n' &&
i < 8;
i++)
2995 temp->
key.assign(bptr, p);
2998 if (temp->
key ==
"VAR_SEQ")
3003 for (bptr += 8; *bptr ==
' ' && bptr <= endline;)
3008 if (((*bptr >=
'a' && *bptr <=
'z') || (*bptr >=
'A' && *bptr <=
'Z')) &&
3010 for (bptr += 7; *bptr >=
'0' && *bptr <=
'9' && bptr <= endline;)
3012 for (; *bptr ==
':' && bptr <= endline;)
3016 for (ptr1 = bptr; *ptr1 ==
'?' || *ptr1 ==
'>' || *ptr1 ==
'<' ||
3017 (*ptr1 >=
'0' && *ptr1 <=
'9');)
3020 if (bptr < ptr1 && ptr1 <= endline) {
3021 temp->
from.assign(bptr, ptr1);
3028 if (! p || (q && q < p))
3040 temp->
from.assign(
"-1");
3041 fromstart =
nullptr;
3047 for (; (*bptr ==
' ' || *bptr ==
'.') && bptr <= endline; bptr++)
3050 for (ptr1 = bptr; *ptr1 ==
'?' || *ptr1 ==
'>' || *ptr1 ==
'<' ||
3051 (*ptr1 >=
'0' && *ptr1 <=
'9');)
3054 p = (
char*)temp->
from.c_str();
3055 if (*p ==
'<' || *p ==
'>')
3058 for (q = ptr1; *q ==
' ';)
3061 if (bptr < ptr1 && ptr1 <= endline) {
3062 if (*q !=
'\n' && new_format && (*p ==
'?' || atoi(p) != -1))
3064 temp->
to.assign(bptr, ptr1);
3065 }
else if (fromstart) {
3066 if (*q !=
'\n' && (*p ==
'?' || atoi(p) != -1))
3068 temp->
to.assign(fromstart, fromend);
3070 if (*q !=
'\n' && (*p ==
'?' || atoi(p) != -1))
3072 temp->
to.assign(
"-1");
3075 q = (
char*)temp->
to.c_str();
3076 if (*q ==
'<' || *q ==
'>')
3078 if (extra_text || (*p !=
'?' && *q !=
'?' && (atoi(p) > atoi(q)))) {
3082 if (! p || (q && q < p))
3091 temp->
from.assign(
"-1");
3094 for (bptr = ptr1; *bptr ==
' ' && bptr <= endline;)
3100 if (*--
str ==
'-' &&
str > bptr)
3103 if (bptr <= endline)
3104 temp->
descrip.assign(bptr, endline);
3106 for (bptr = endline; *bptr ==
' ' || *bptr ==
'\n';)
3111 while (bptr < eptr && (*bptr ==
' '))
3113 while (*bptr ==
' ')
3119 }
else if (
StringEquN(bptr,
"/evidence=\"", 11)) {
3133 for (p = bptr + 1; (*p >=
'a' && *p <=
'z') || (*p >=
'A' && *p <=
'Z') || (*p >=
'0' && *p <=
'9') || *p ==
'_';)
3135 if (*p ==
'=' && p[1] ==
'\"') {
3147 if (p >= bptr && *p ==
'\"')
3154 if (p && p - 1 >= bptr && *(p - 1) ==
'.')
3159 if (p && p - 1 >= bptr && *(p - 1) ==
'.')
3171 if (*--
str ==
'-' &&
str > bptr)
3174 for (bptr = endline; *bptr ==
' ' || *bptr ==
'\n';)
3186 if (*defdelim ==
'\n')
3189 p = (
char*)temp->
from.c_str();
3190 if (*p ==
'<' || *p ==
'>')
3192 if (*p !=
'?' && atoi(p) < 0) {
3197 q = (
char*)temp->
to.c_str();
3198 if (*q ==
'<' || *q ==
'>')
3200 if ((*p !=
'?' && atoi(p) > (
Int4)seqlen) || (*q !=
'?' && atoi(q) > (
Int4)seqlen)) {
3210 current->
next = temp;
3240 bool fuzzfrom =
false;
3241 bool fuzzto =
false;
3242 bool nofrom =
false;
3244 bool pntfuzz =
false;
3248 if (! spfip || spfip->
from.empty() || spfip->
to.empty())
3255 ptr = spfip->
from.c_str();
3259 while (*ptr !=
'\0' &&
isdigit(*ptr) == 0)
3261 from = (
Int4)atoi(ptr);
3266 from = (
Int4)atoi(ptr);
3268 if ((initmet ==
false && from != 0) ||
3269 (initmet && signal && from == 1))
3272 ptr = spfip->
to.c_str();
3275 while (*ptr !=
'\0' &&
isdigit(*ptr) == 0)
3277 to = (
Int4)atoi(ptr);
3282 to = (
Int4)atoi(ptr);
3284 if (initmet ==
false && to != 0)
3307 }
else if (from != to && ! pntfuzz) {
3331 }
else if (fuzzfrom) {
3418 if ((pos != 0 && retstr[pos - 1] !=
' ' && retstr[pos - 1] !=
'.') ||
3419 (retstr[pos +
len] !=
'\0' && retstr[pos +
len] !=
' ' &&
3420 retstr[pos +
len] !=
'.' && retstr[pos +
len] !=
';'))
3464 fbp->
key =
"VAR_SEQ";
3469 fbp->
key =
"SE_CYS";
3472 fbp->
key =
"MOD_RES";
3489 feat->
SetData().SetImp().SetDescr(
"uncertain amino acids");
3501 for (p = loc; *p; p++)
3512 if (! descrip.empty())
3523 if (! descrip.empty())
3548 for (temp = spfip; temp; temp = temp->
next) {
3553 temp->
key =
"VAR_SEQ";
3558 temp->
key =
"SE_CYS";
3561 temp->
key =
"MOD_RES";
3595 feat->
SetData().SetImp().SetDescr(
"uncertain amino acids");
3623 feats.push_back(feat);
3641 for (p =
str; *p ==
' ' || *p ==
'\t';)
3643 for (q = p; *q !=
'\0';)
3646 for (q--; (*q ==
' ' || *q ==
'\t') && q > p;)
3648 if (q == p && (*q ==
' ' || *q ==
'\t'))
3650 for (pp = p; *pp ==
'(';)
3652 for (qq = q; *qq ==
')' && qq >= pp;)
3654 for (
count = 0, left = 0, right = 0,
r = pp;
r <= qq;
r++) {
3657 else if (*
r ==
')') {
3659 count = left - right;
3663 for (; count < 0 && pp > p; pp--)
3665 for (
count = 0,
r = qq;
r >= pp;
r--) {
3672 for (;
count < 0 && qq < q; qq++)
3695 for (p = gname; *p !=
'\0'; p++)
3696 if (! (
isalnum(*p) || *p ==
'_' || *p ==
'-' || *p ==
'.' ||
3697 *p ==
'\'' || *p ==
'`' || *p ==
'/' || *p ==
'(' || *p ==
')'))
3720 for (p =
str; *p !=
'\0';) {
3723 for (q = p; *p !=
'\0' && *p !=
' ';)
3735 gene.
SetSyn().push_back(gname);
3786 for (ptr =
str; *ptr !=
'\0'; ptr++)
3796 feats.push_back(feat);
3821 for (p =
str; p && *p !=
'\0'; p = q) {
3822 while (*p ==
' ' || *p ==
',')
3834 gene.
SetSyn().push_back(p);
3844 (! name && ! syns && ! ltags && ! orfs))
3864 feats.push_back(feat);
3881 if (! pp || pp->
entrylist.empty() || ! bptr)
3894 for (p =
str; p && *p !=
'\0'; p = q) {
3895 while (*p ==
' ' || *p ==
';')
3897 for (
r = p;;
r = q + 1) {
3899 if (! q || q[1] ==
' ' || q[1] ==
'\n' || q[1] ==
'\0')
3922 }
else if (
StringEquNI(p,
"OrderedLocusNames=", 18)) {
3945 if (! name && ! syns && ! ltags && ! orfs)
3948 if (! name && syns) {
3967 if (! name && ! syns && ! ltags && ! orfs)
4069 if (*p ==
'.' || *p ==
'\0') {
4074 while (*p >=
'0' && *p <=
'9')
4076 if (*q ==
'n' && (*p ==
'.' || *p ==
'\0')) {
4089 if (
count != 4 || *p !=
'\0') {
4101 for (; sfp; sfp = sfp->
next) {
4124 for (rcount = 0, scount = 0, tsfp = sfp; tsfp; tsfp = tsfp->
next) {
4131 for (fcount = 0, tsfp = sfp; tsfp; tsfp = tsfp->
next) {
4134 for (tsfp = tsfp->
next; tsfp; tsfp = tsfp->
next) {
4148 }
else if (rcount == 0 && ! is_trembl) {
4153 if (scount > 0 && ! is_trembl) {
4158 if (fcount == 0 && rcount > 0) {
4185 for (
const auto&
id : ids) {
4186 if (! id->IsSwissprot())
4189 if (id->GetSwissprot().IsSetRelease() &&
4196 sfp->
next =
nullptr;
4198 for (tsfp = sfp, p =
str,
count = 0; *p !=
'\0';) {
4201 for (q = p; *p !=
'\0' && *p !=
' ';)
4213 if (tsfp->
tag != 0) {
4214 if (q == tsfp->
start)
4217 for (
r = q - 1; *
r ==
' ' || *
r ==
';';)
4230 for (
r = q + cilp->
len; *
r ==
' ';)
4233 tsfp->
next =
nullptr;
4241 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4244 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4248 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4251 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4255 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4258 for (tsfp = sfp->
next; tsfp; tsfp = tsfp->
next)
4278 for (q =
buf + 2; *q ==
' ';)
4282 for (p++; *p ==
' ';)
4297 qual->
SetQual(
"UniProtKB_evidence");
4299 feat.
SetQual().push_back(qual);
4351 while (! str_.empty()) {
4352 char c = str_.back();
4353 if (c ==
'.' || c ==
';' || c ==
',')
4385 str1.assign(
str, ptr);
4392 for (bptr = ptr; *ptr !=
'\0' && *ptr !=
' ' && *ptr != symb;)
4395 string ecnum(bptr, ptr);
4398 if (! ecnum.empty())
4399 prot.SetEc().push_back(ecnum);
4405 while (*ptr !=
'\0' && (*ptr ==
' ' || *ptr == symb))
4420 str1.assign(
str, ptr);
4428 if (!
prot.IsSetName())
4429 prot.SetName().push_back(
str);
4446 feats.push_back(feat);
4484 for (; spfip; spfip = spfip->
next) {
4485 if (spfip->
key !=
"NON_CONS")
4490 p = spfip->
from.c_str();
4491 if (*p ==
'<' || *p ==
'>' || *p ==
'?')
4494 spslp->
len = atoi(p);
4498 p = spfip->
from.c_str();
4499 if (*p ==
'<' || *p ==
'>' || *p ==
'?')
4501 curspslp->
len = atoi(p) - curspslp->
from;
4505 p = spfip->
from.c_str();
4506 if (*p ==
'<' || *p ==
'>' || *p ==
'?')
4508 spslp->
from = atoi(p);
4509 curspslp->
next = spslp;
4513 for (
auto& descr : bioseq.
SetDescr().Set()) {
4514 if (! descr->IsMolinfo())
4546 for (
count = 0; spfip; spfip = spfip->
next) {
4547 if (spfip->
key !=
"INIT_MET")
4554 p = spfip->
from.c_str();
4555 if (*p ==
'<' || *p ==
'>' || *p ==
'?')
4558 p = spfip->
to.c_str();
4559 if (*p ==
'<' || *p ==
'>' || *p ==
'?')
4563 if ((from != 0 || to != 0) && (from != 1 || to != 1))
4576 if (! temp->
descrip.empty()) {
4584 string& sequence =
data.SetIupacaa().Set();
4591 sequence.insert(sequence.begin(),
'M');
4592 bioseq.
SetInst().SetLength(
static_cast<TSeqPos>(sequence.size()));
4593 }
else if (sequence.empty() || sequence[0] !=
'M')
4622 for (
auto& descr : bioseq.
SetDescr().Set()) {
4623 if (! descr->IsMolinfo())
4626 mol_info = &(descr->SetMolinfo());
4631 for (temp = spfip; temp; temp = temp->
next) {
4632 if (temp->
key ==
"NON_CONS") {
4637 if (temp->
key !=
"NON_TER")
4671 const string& sequence =
data.GetIupacaa().Get();
4673 for (string::const_iterator
value = sequence.begin();
value != sequence.end(); ++
value) {
4674 if (*
value !=
'X') {
4698 for (; spslp; spslp = spslp->
next) {
4700 if (! deltas.
Set().empty()) {
4701 delta->SetLiteral().SetLength(0);
4702 delta->SetLiteral().SetFuzz().SetLim();
4708 delta->SetLiteral().SetLength(spslp->
len);
4711 string data_str = bioseq_data.substr(spslp->
from, spslp->
len);
4713 delta->SetLiteral().SetSeq_data().SetIupacaa().Set(data_str);
4717 if (deltas.
Set().size() > 1) {
4719 bioseq.
SetInst().ResetSeq_data();
4721 bioseq.
SetInst().SetExt().Reset();
4763 if (! feats.empty()) {
4765 annot->
SetData().SetFtable().swap(feats);
4766 bioseq.
SetAnnot().push_back(annot);
4769 for (; spslp; spslp =
next) {
4788 eptr = ptr + entry->
len;
4837 for (total = 0,
i = 0, imax = pp->
indx;
i < imax;
i++) {
4854 pp->
entries.push_back(cur_entry);
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void err_install(const Indexblk *ibp, bool accver)
string tata_save(string_view t)
void StripECO(string &str)
void BuildSubBlock(DataBlkPtr dbp, Int2 subtype, const char *subkw)
void StripSerialNumbers(TEntryList &seq_entries)
unique_ptr< unsigned char[]> GetProteinConv(void)
void GetSequenceOfKeywords(const DataBlk &entry, int type, Uint2 col_data, TKeywordList &keywords)
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
CRef< CSeq_id > MakeAccSeqId(const char *acc, Uint1 seqtype, bool accver, Int2 vernum)
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
void GetLenSubNode(DataBlkPtr dbp)
CRef< CSeq_id > MakeLocusSeqId(const char *locus, CSeq_id::E_Choice seqtype)
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
void ShrinkSpaces(char *line)
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
void fta_sort_biosource(objects::CBioSource &bio)
TSeqPos GetLength(void) const
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
@ eCompare_after
*this comes second.
@Gb_qual.hpp User-defined methods of the data storage class.
@OrgMod.hpp User-defined methods of the data storage class.
@Seq_descr.hpp User-defined methods of the data storage class.
namespace ncbi::objects::
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
#define ERR_GENENAME_IllegalGeneName
#define ERR_SOURCE_HostNameVsTaxIDMissMatch
#define ERR_FEATURE_PartialNoNonTerNonCons
#define ERR_FORMAT_UnexpectedData
#define ERR_FEATURE_ExpectEmptyComment
#define ERR_FORMAT_NoProteinNameCategory
#define ERR_FORMAT_MultipleRecName
#define ERR_SOURCE_InvalidNcbiTaxID
#define ERR_SOURCE_UnknownOHType
#define ERR_GENENAME_DELineGeneName
#define ERR_SOURCE_NcbiTaxIDLookupFailure
#define ERR_FORMAT_InvalidPDBCrossRef
#define ERR_FORMAT_ECNumberNotPresent
#define ERR_FORMAT_MixedPDBXrefs
#define ERR_ENTRY_Skipped
#define ERR_FEATURE_UnEqualEndPoint
#define ERR_SOURCE_OrgNameVsTaxIDMissMatch
#define ERR_FORMAT_MissingCopyright
#define ERR_SOURCE_MissingPlasmidName
#define ERR_FEATURE_Invalid_INIT_MET
#define ERR_FEATURE_InvalidQualifier
#define ERR_FEATURE_BadLocation
#define ERR_REFERENCE_IllegalDate
#define ERR_FORMAT_MissingFullRecName
#define ERR_FORMAT_SwissProtHasSubName
#define ERR_FEATURE_UnknownFeatKey
#define ERR_SOURCE_UnknownOXType
#define ERR_DRXREF_UnknownDBname
#define ERR_SOURCE_NoNcbiTaxIDLookup
#define ERR_FEATURE_ObsoleteFeature
#define ERR_FEATURE_Dropped
#define ERR_ENTRY_ParsingComplete
#define ERR_FEATURE_MissingInitMet
#define ERR_SOURCE_IncorrectOHLine
#define ERR_FORMAT_MissingGeneName
#define ERR_LOCATION_FailedCheck
#define ERR_FORMAT_InvalidECNumber
#define ERR_QUALIFIER_InvalidEvidence
#define ERR_DATE_IllegalDate
#define ERR_FORMAT_UnknownGeneField
#define ERR_FEATURE_NotSeqEndPoint
#define ERR_FEATURE_NoFragment
#define ERR_SPROT_DRLineCrossDBProtein
#define ERR_DATACLASS_UnKnownClass
#define ERR_FORMAT_ExcessGeneFields
#define ERR_FORMAT_MissingRecName
#define ERR_FEATURE_DuplicateRemoved
list< CRef< objects::CSeq_entry > > TEntryList
std::list< CRef< objects::CSeq_id > > TSeqIdList
bool StringEquNI(const char *s1, const char *s2, size_t n)
bool StringEquN(const char *s1, const char *s2, size_t n)
bool StringEqu(const char *s1, const char *s2)
void StringCpy(char *d, const char *s)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
void StringCat(char *d, const char *s)
char * StringRChr(char *s, const char c)
char * StringNew(size_t sz)
void FtaDeletePrefix(int prefix)
void FtaInstallPrefix(int prefix, const char *name, const char *location)
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
CRef< COrg_ref > fta_fix_orgref_byid(ParserPtr pp, TTaxId taxid, bool *drop, bool isoh)
void fta_fix_orgref(ParserPtr pp, COrg_ref &org_ref, bool *drop, char *organelle)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const char * str(char *buf, int n)
static const char location[]
#define GI_FROM(T, value)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define TAX_ID_TO(T, tax_id)
SStrictId_Tax::TId TTaxId
Taxon id type.
#define TAX_ID_FROM(T, value)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.