80 #define THIS_FILE "add.cpp"
132 for (; ftbp; ftbp =
next) {
166 for (
i = 0;
i <
str.length(); ++
i) {
178 while (!
str.empty()) {
180 if (c ==
' ' || c ==
';' || c ==
',' || c ==
'\"' || c ==
'\t')
192 bool no_create =
true;
193 bool no_update =
true;
195 for (
const auto& desc : descrs) {
196 if (desc->IsCreate_date())
198 else if (desc->IsUpdate_date())
201 if (no_create ==
false && no_update ==
false)
208 return (no_create || no_update);
227 for (
const auto& annot : bioseq.
GetAnnot()) {
228 if (! annot->IsFtable())
231 for (
const auto& feat : annot->GetData().GetFtable()) {
232 if (feat->IsSetData() && feat->GetData().IsPub())
236 for (
const auto& feat : annot->GetData().GetFtable()) {
237 if (! feat->IsSetData() || ! feat->GetData().IsImp())
240 const CImp_feat& imp = feat->GetData().GetImp();
241 if (imp.
GetKey() ==
"Site-ref") {
303 if (accver && ibp->
vernum > 0) {
305 temp += to_string(ibp->
vernum);
326 const string& gapType(gfp->
gap_type);
327 if (gapType ==
"unknown" || gapType ==
"within scaffold" || gapType ==
"repeat within scaffold") {
341 CDelta_ext::Tdata::iterator
delta = deltas.begin();
346 if (! (*delta)->IsLiteral())
361 if (*drop || (
delta == deltas.end() && ! gfp))
364 if (
delta == deltas.end() && gfp) {
367 }
else if (
delta != deltas.end() && ! gfp) {
369 if ((*delta)->IsLiteral())
373 if (
delta == deltas.end())
396 if (sequence.empty() || sequence.size() != bioseq.
GetLength())
399 for (prevto = 0, tgfp = gfp; tgfp; tgfp = tgfp->
next) {
401 p = sequence.c_str() + tgfp->
to;
402 for (
i = tgfp->
to + 1; i < tgfp->
next->from; p++,
i++)
414 if (tgfp->
leftNs ==
false && tgfp->
from - prevto > 10) {
415 for (p = sequence.c_str() + tgfp->
from - 11,
i = 0;
i < 10; p++,
i++)
423 if (tgfp->
rightNs ==
false && nextfrom - tgfp->
to > 10) {
424 for (p = sequence.c_str() + tgfp->
to,
i = 0;
i < 10; p++,
i++)
432 for (
i = tgfp->
from - 1, p = sequence.c_str() +
i; i < tgfp->to; p++,
i++)
448 for (prevto = 0, tgfp = gfp;; tgfp = tgfp->
next) {
452 if (tgfp->
from - prevto - 1 > 0) {
455 delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto,
len);
457 deltas.push_back(
delta);
465 delta->SetLiteral().SetFuzz().SetLim();
468 delta->SetLiteral().SetFuzz().SetRange().SetMax(
len);
474 deltas.push_back(
delta);
484 delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto,
len);
486 deltas.push_back(
delta);
492 if (! deltas.empty()) {
493 bioseq.
SetInst().SetExt().SetDelta().Set().swap(deltas);
495 bioseq.
SetInst().ResetSeq_data();
514 if (sequence.empty() || sequence.size() != bioseq.
GetLength())
517 vector<Char>
buf(sequence.begin(), sequence.end());
524 for (q = p; *p !=
'\0';) {
530 for (
r = p, p++,
i = 1; *p ==
'N';
i++)
545 delta->SetLiteral().SetLength(j);
546 delta->SetLiteral().SetSeq_data().SetIupacna().Set(
string(q,
r));
548 deltas.push_back(
delta);
555 delta->SetLiteral().SetLength(
i);
557 delta->SetLiteral().SetFuzz().SetLim();
560 deltas.push_back(
delta);
568 delta->SetLiteral().SetLength(j);
569 delta->SetLiteral().SetSeq_data().SetIupacna().Set(
string(q, p));
571 deltas.push_back(
delta);
574 if (deltas.size() > 1) {
575 bioseq.
SetInst().SetExt().SetDelta().Set().swap(deltas);
577 bioseq.
SetInst().ResetSeq_data();
582 "or more N's to indicate gaps between component contigs. "
583 "This could be an error, or perhaps sequencing is finished "
584 "and this record should not be Phase 1.");
590 "for one contig, and hence gaps are not expected. But "
591 "this record does have one (ore more) gaps, hence it "
592 "may require review.");
595 "of at least 20 N's. They could indicate gaps, "
596 "but have not been treated that way because "
597 "they are below the minimum of 100 N's.");
615 if (extra_accs.empty())
618 if (extra_accs.size() != 2)
621 CGB_block::TExtra_accessions::const_iterator it = extra_accs.begin();
626 acc1 = ppacc1.data();
627 acc2 = ppacc2.data();
629 if (! acc1 && ! acc2)
631 if (! acc1 || ! acc2)
665 for (p = master; *p !=
'\0' && (*p < '0' || *p >
'9');)
674 for (q =
range; *q !=
'\0' && (*q < '0' || *q >
'9');)
683 bool ret = (master ==
range);
699 ext.GetDelta().IsSet()) {
700 const auto&
delta = ext.GetDelta().Get();
701 return any_of(begin(
delta),
703 [](
CRef<CDelta_seq> pDeltaSeq) {
return (pDeltaSeq && pDeltaSeq->IsLoc()); });
711 const auto idType =
id.Which();
726 const auto primaryType = primary.
Which();
731 unique_ptr<string> pPrimaryAccessionString;
733 for (
const auto& pDeltaSeq : delta_ext.
Get()) {
734 if (pDeltaSeq && pDeltaSeq->IsLoc()) {
735 auto pId = pDeltaSeq->GetLoc().GetId();
736 const auto& deltaIdType = pId->Which();
737 if (deltaIdType == primaryType) {
738 if (pId->GetSeqIdString() == primaryString) {
744 auto deltaAccessionHandle = scope.
GetAccVer(deltaHandle);
745 if (! deltaAccessionHandle) {
749 if (deltaAccessionHandle.GetSeqId()->GetSeqIdString() ==
754 if (! pPrimaryAccessionString) {
756 auto primaryAccessionHandle = scope.
GetAccVer(primaryGiHandle);
757 if (! primaryAccessionHandle) {
760 pPrimaryAccessionString =
761 make_unique<string>(primaryAccessionHandle.GetSeqId()->GetSeqIdString());
764 if (*pPrimaryAccessionString == pId->GetSeqIdString()) {
777 auto it = find_if(begin(accession),
779 [](
char c) {
return ! (
isalpha(c) || c ==
'_'); });
782 return int(distance(accession.
begin(), it));
813 vector<string> candidatesAccs;
814 vector<CRef<CSeq_id>> candidatesIds;
815 vector<CSeq_id_Handle> candidatesIdhs;
817 list<CRef<CSeq_id>> replaces;
819 for (
const auto& accessionString : hist) {
820 if (accessionString.empty())
834 if (pri_acc == 0 || pri_acc == 2) {
839 if (prefixLength <= 0) {
843 if ((accessionString.length() <= prefixLength ||
845 !
isdigit(accessionString[prefixLength])) &&
853 candidatesAccs.push_back(accessionString);
854 candidatesIds.push_back(
id);
859 for (
size_t i = 0;
i < candidatesIdhs.size(); ++
i ) {
860 auto& accessionString = candidatesAccs[
i];
861 auto id = candidatesIds[
i];
862 auto idChoice =
id->Which();
863 auto secondaryBsh = secondaryBshs[
i];
864 bool IsConOrScaffold =
false;
872 if (! IsConOrScaffold && pricon && idChoice == acctype) {
876 if (IsConOrScaffold && ! pricon) {
881 replaces.push_back(
id);
886 replaces.push_back(
id);
890 if (! replaces.empty()) {
891 auto& hist_replaces_ids = bioseq.
SetInst().SetHist().SetReplaces().SetIds();
892 hist_replaces_ids.splice(hist_replaces_ids.end(), replaces);
911 for (TKeywordList::iterator
key = kwds.begin();
key != kwds.end();) {
912 bool delnode =
false;
913 bool errpost =
false;
914 if (*
key ==
"HTGS_PHASE0") {
915 if (ibp->
htg != 0 && ibp->
htg != 5) {
917 if (ibp->
htg == 1 || ibp->
htg == 2 || ibp->
htg == 3)
924 }
else if (*
key ==
"HTGS_PHASE1") {
925 if (ibp->
htg != 0 && ibp->
htg != 5) {
927 if (ibp->
htg == 2 || ibp->
htg == 3 || ibp->
htg == 4)
934 }
else if (*
key ==
"HTGS_PHASE2") {
935 if (ibp->
htg != 0 && ibp->
htg != 5) {
937 if (ibp->
htg == 1 || ibp->
htg == 3 || ibp->
htg == 4)
944 }
else if (*
key ==
"HTGS_PHASE3") {
945 if (ibp->
htg != 0 && ibp->
htg != 5) {
947 if (ibp->
htg == 1 || ibp->
htg == 2 || ibp->
htg == 4)
954 }
else if (*
key ==
"HTG") {
987 if (! ftbp || length < 1)
992 ftsp->
to = ftbp->
to1;
993 ftsp->
next =
nullptr;
995 for (tftbp = ftbp; tftbp; tftbp = tftbp->
next) {
998 j = (i2 > i1) ? (i2 - i1) : (i1 - i2);
1001 if (i1 < 3000 && j * 10 > i1) {
1008 if (i1 >= 3000 && j > 300) {
1015 if (tftbp->
from1 <= tftsp->
to + 1) {
1016 if (tftbp->
to1 > tftsp->
to)
1017 tftsp->
to = tftbp->
to1;
1022 tftsp = tftsp->
next;
1024 tftsp->
to = tftbp->
to1;
1027 if (ftsp->
from - 1 > 50) {
1034 for (; ftsp; ftsp = tftsp) {
1036 if (tftsp && tftsp->
from - ftsp->
to - 1 > 50) {
1038 ErrPostEx(
SEV_ERROR,
ERR_TPA_IncompleteCoverage,
"This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->
to + 1, tftsp->
from - 1);
1040 ErrPostEx(
SEV_ERROR,
ERR_TSA_IncompleteCoverage,
"This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->
to + 1, tftsp->
from - 1);
1041 }
else if (! tftsp && length - ftsp->
to > 50) {
1043 ErrPostEx(
SEV_ERROR,
ERR_TPA_IncompleteCoverage,
"This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->
to + 1, length);
1045 ErrPostEx(
SEV_ERROR,
ERR_TSA_IncompleteCoverage,
"This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->
to + 1, length);
1123 const char* bad_accession;
1141 for (p =
offset; *p !=
'\0'; p++)
1164 bad_interval =
false;
1165 bad_accession =
nullptr;
1171 for (p += col_data; *p ==
' ';)
1173 for (
r = p; *p >=
'0' && *p <=
'9';)
1176 bad_interval =
true;
1183 for (
r = p; *p >=
'0' && *p <=
'9';)
1185 if (*p !=
' ' && *p !=
'\n' && *p !=
'\0') {
1186 bad_interval =
true;
1194 bad_interval =
true;
1198 for (ft = ftbp; ft->
next; ft = ft->
next)
1206 tftbp->
from1 = from1;
1211 for (
r = p; *p !=
'\0' && *p !=
' ' && *p !=
'\n';)
1219 for (
t =
r; *
t >=
'0' && *
t <=
'9';)
1236 while (*
r >=
'0' && *
r <=
'9')
1260 for (
r = p; *p >=
'0' && *p <=
'9';)
1263 bad_interval =
true;
1269 for (
r = p; *p >=
'0' && *p <=
'9';)
1271 if (*p !=
' ' && *p !=
'\n' && *p !=
'\0') {
1272 bad_interval =
true;
1277 tftbp->
to2 = atoi(
r);
1280 bad_interval =
true;
1289 for (p++; *p ==
' ';)
1300 if (bad_line || bad_interval || bad_accession) {
1306 }
else if (bad_accession) {
1324 ftbp->
next =
nullptr;
1331 if (! assembly.empty())
1339 for (; tftbp; tftbp = tftbp->
next) {
1340 len1 = tftbp->
to1 - tftbp->
from1 + 1;
1341 len2 = tftbp->
to2 - tftbp->
from2 + 1;
1350 seg.SetNumseg((len1 == len2) ? 1 : 2);
1352 seg.SetStarts().push_back(tftbp->
from1 - 1);
1353 seg.SetStarts().push_back(tftbp->
from2 - 1);
1357 seg.SetStarts().push_back(-1);
1358 seg.SetStarts().push_back(tftbp->
from2 - 1 + len1);
1360 seg.SetStarts().push_back(tftbp->
from1 - 1 + len2);
1361 seg.SetStarts().push_back(-1);
1366 seg.SetLens().push_back(len1);
1367 else if (len1 < len2) {
1368 seg.SetLens().push_back(len1);
1369 seg.SetLens().push_back(len2 - len1);
1371 seg.SetLens().push_back(len2);
1372 seg.SetLens().push_back(len1 - len2);
1376 seg.SetStrands().push_back(tftbp->
strand);
1380 seg.SetStrands().push_back(tftbp->
strand);
1384 text_id->SetAccession(acnum);
1387 text_id->SetVersion(vernum);
1392 seg.SetIds().push_back(
id);
1401 tag.SetTag().SetId(atoi(
r));
1403 tag.SetTag().SetStr(
r);
1406 seg.SetIds().push_back(gen_id);
1409 otext_id->SetAccession(tftbp->
accession);
1412 otext_id->SetVersion(tftbp->
version);
1419 seg.SetIds().push_back(aux_id);
1421 align_set.
Set().push_back(align);
1424 assembly.push_back(root_align);
1434 if (! where || ! what || *where ==
'\0' || *what ==
'\0')
1438 char* res =
nullptr;
1439 for (
char* p = where; *p !=
'\0'; p++)
1459 interval.
SetId(seq_id);
1467 bool bad_format =
false;
1470 if (! p || *p ==
'\0' ||
StringLen(p) < 7)
1472 else if (p[0] !=
'G' || p[1] !=
'C' || (p[2] !=
'F' && p[2] !=
'A') ||
1473 p[3] !=
'_' || p[4] <
'0' || p[4] >
'9')
1476 for (p += 5; *p !=
'\0'; p++)
1477 if (*p < '0' || *p >
'9')
1479 if (*p !=
'.' || p[1] <
'0' || p[1] >
'9')
1482 for (p++; *p !=
'\0'; p++)
1483 if (*p < '0' || *p >
'9')
1498 bool bad_format =
false;
1502 else if (name[0] !=
'P' || name[1] !=
'R' || name[2] !=
'J' ||
1503 (name[3] !=
'E' && name[3] !=
'N' && name[3] !=
'D') ||
1504 name[4] <
'A' || name[4] >
'Z' || name[5] <
'0' || name[5] >
'9')
1507 for (p = name + 6; *p !=
'\0'; p++)
1508 if (*p < '0' || *p >
'9')
1521 (name[3] !=
'N' || name[4] !=
'A')) ||
1523 (name[3] !=
'N' || name[4] !=
'A')))
1540 if (!
str || *
str ==
'\0') {
1545 for (p =
str; *p !=
'\0'; p++)
1546 if (*p ==
';' || *p ==
',' || *p ==
'\t')
1549 for (p =
str; *p ==
' ';)
1559 for (bad =
false, p =
str; *p !=
'\0';) {
1566 for (q = p; *p !=
' ' && *p !=
'\0';)
1572 for (
r = q; *
r >=
'0' && *
r <=
'9';)
1619 bool newstyle =
false;
1622 name =
"GenomeProject:";
1658 for (
auto& descr : descrs) {
1659 if (! descr->IsUser() || ! descr->GetUser().IsSetData())
1662 user_obj_ptr = &(descr->SetUser());
1666 obj_id = &(user_obj_ptr->
SetType());
1668 if (obj_id && obj_id->
IsStr() && obj_id->
GetStr() ==
"DBLink") {
1676 for (
i = 0, tvnp = vnp; tvnp; tvnp = tvnp->
next)
1684 id.SetStr(
"DBLink");
1688 user_field->
SetLabel().SetStr(
"BioProject");
1691 for (tvnp = vnp; tvnp; tvnp = tvnp->
next)
1692 user_field->
SetData().SetStrs().push_back(tvnp->
data);
1694 user_obj_ptr->
SetData().push_back(user_field);
1702 id.SetStr(
"GenomeProjectsDB");
1704 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
1707 user_field->
SetLabel().SetStr(
"ProjectID");
1709 user_obj_ptr->
SetData().push_back(user_field);
1713 user_field->
SetLabel().SetStr(
"ParentID");
1714 user_field->
SetData().SetInt(0);
1715 user_obj_ptr->
SetData().push_back(user_field);
1721 descr->
SetUser(*user_obj_ptr);
1722 descrs.push_back(descr);
1735 (p[0] ==
'E' || p[0] ==
'S' || p[0] ==
'D') && p[1] ==
'R' &&
1736 (p[2] ==
'A' || p[2] ==
'P' || p[2] ==
'R' || p[2] ==
'S' ||
1737 p[2] ==
'X' || p[2] ==
'Z')) {
1738 for (p += 3; *p >=
'0' && *p <=
'9';)
1755 if (p &&
StringLen(p) > 5 && p[0] ==
'S' && p[1] ==
'A' &&
1756 p[2] ==
'M' && (p[3] ==
'N' || p[3] ==
'E' || p[3] ==
'D')) {
1757 if (p[4] ==
'A' || p[4] ==
'G')
1761 while (*p >=
'0' && *p <=
'9')
1795 if (!
str || *
str ==
'\0') {
1800 for (p =
str; *p !=
'\0'; p++)
1801 if (*p ==
';' || *p ==
'\t')
1814 for (p =
str; *p !=
'\0'; got_nl =
false) {
1815 while (*p ==
' ' || *p ==
'\n' || *p ==
':' || *p ==
',') {
1827 if ((! u || u >
t) && (!
r ||
r >
t)) {
1835 !
StringEqu(p,
"Sequence Read Archive:") &&
1836 !
StringEqu(p,
"Trace Assembly Archive:")) {
1842 bioproject =
StringEqu(p,
"BioProject:");
1843 sra =
StringEqu(p,
"Sequence Read Archive:");
1853 for (uvnp = vnp->
next; uvnp; uvnp = uvnp->
next)
1872 while (*p !=
',' && *p !=
'\n' && *p !=
':' && *p !=
'\0')
1875 while (*p !=
'\0' && *p !=
'\n')
1879 while (*
r !=
'\n' &&
r >
str)
1881 while (*
r ==
' ' || *
r ==
'\n')
1895 if (tagvnp && tagvnp->
data) {
1896 for (uvnp = tagvnp->
next; uvnp; uvnp = uvnp->
next) {
1969 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
1978 id.SetStr(
"GenomeProjectsDB");
1983 if (user_obj.
Empty())
1987 if (!
str || *
str ==
'\0')
1991 while (*
str >=
'0' && *
str <=
'9')
2000 user_field->
SetLabel().SetStr(
"ProjectID");
2002 user_obj->
SetData().push_back(user_field);
2005 user_field->
SetLabel().SetStr(
"ParentID");
2006 user_field->
SetData().SetInt(0);
2008 user_obj->
SetData().push_back(user_field);
2018 descrs.push_back(descr);
2025 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
2034 if (user_obj.
Empty()) {
2036 user_obj->
SetType().SetStr(
"DBLink");
2039 for (
i = 0, uvnp = tvnp->
next; uvnp; uvnp = uvnp->
next,
i++)
2045 string lstr(tvnp->
data);
2046 lstr = lstr.substr(0, lstr.size() - 1);
2047 user_field->
SetLabel().SetStr(lstr);
2049 user_field->
SetData().SetStrs();
2051 user_obj->
SetData().push_back(user_field);
2052 }
else if (! inpr && user_obj.
NotEmpty()) {
2053 user_field->
SetData().SetStrs().push_back(tvnp->
data);
2062 descrs.push_back(descr);
2075 bool finished =
true;
2078 if (!
delta->IsLoc())
2115 if (good && finished)
2133 if (! name &&
id.IsGeneral()) {
2135 if (
tag.GetDb() ==
"SeqLit" ||
tag.GetDb() ==
"UnkSeqLit")
2139 if (!
id.IsGenbank() && !
id.IsEmbl() && !
id.IsPir() &&
2140 !
id.IsSwissprot() && !
id.IsOther() && !
id.IsDdbj() && !
id.IsPrf() &&
2141 !
id.IsTpg() && !
id.IsTpe() && !
id.IsTpd()) {
2168 slip->
wgsacc = accession;
2169 }
else if (
i == 7) {
2173 slip->
wgsacc = accession;
2179 if (
type !=
id.Which()) {
2181 new_text_id->Assign(*text_id);
2185 id.SetGeneral().SetDb(
"FlyBase");
2186 id.SetGeneral().SetTag().SetStr(accession);
2189 id.SetPatent(*pat_id);
2201 if (
id.IsGenbank()) {
2206 }
else if (
id.IsEmbl()) {
2211 }
else if (
id.IsPir()) {
2216 }
else if (
id.IsSwissprot()) {
2221 }
else if (
id.IsOther()) {
2226 }
else if (
id.IsDdbj()) {
2231 }
else if (
id.IsPrf()) {
2236 }
else if (
id.IsTpg()) {
2241 }
else if (
id.IsTpe()) {
2246 }
else if (
id.IsTpd()) {
2257 for (
auto& loc : locs) {
2258 if (loc->IsEmpty()) {
2260 }
else if (loc->IsWhole()) {
2262 }
else if (loc->IsInt()) {
2264 }
else if (loc->IsPnt()) {
2266 if (iscon && ! loc->GetPnt().IsSetFuzz()) {
2267 int point = loc->GetPnt().GetPoint();
2270 interval->
SetTo(point);
2272 if (loc->GetPnt().IsSetStrand())
2273 interval->
SetStrand(loc->GetPnt().GetStrand());
2275 interval->
SetId(loc->SetPnt().SetId());
2276 loc->SetInt(*interval);
2278 }
else if (loc->IsPacked_int()) {
2279 for (
auto& interval : loc->SetPacked_int().Set()) {
2282 }
else if (loc->IsPacked_pnt()) {
2284 }
else if (loc->IsMix()) {
2286 }
else if (loc->IsEquiv()) {
2296 const Char* p =
nullptr;
2315 if ((tpa > 0 && non_tpa > 0) || tpa > 1 || non_tpa > 1 ||
2319 if (tpa > 0 && non_tpa > 0) {
2327 if (tpa > 1 || non_tpa > 1) {
2330 p =
"Entry skipped.";
2362 msga[4] = msgb[4] = 0;
2369 if (
i == 3 ||
i == 7) {
2409 if (!
buf || *
buf ==
'\0')
2412 for (p =
buf; *p !=
'\0'; p++) {
2416 for (p++; *p ==
' ' || *p ==
'~'; p++)
2424 for (start =
buf;;) {
2435 for (
r = vnp->
data; *
r !=
'\0';
r++)
2453 for (p = vnp->
data; *p !=
'\0'; p++)
2462 res->
next =
nullptr;
2493 id.SetStr(
"StructuredComment");
2496 field->
SetLabel().SetStr(
"StructuredCommentPrefix");
2499 field->
SetData().SetStr() +=
"-START##";
2501 obj->
SetData().push_back(field);
2503 for (tvnp = vnp; tvnp; tvnp = tvnp->
next) {
2505 if (! p || *p ==
'\0')
2512 if (q > p && *(q - 1) ==
' ')
2515 for (*q++ =
'\0'; *q ==
' ' || *q ==
':';)
2518 if (*p ==
'\0' || *q ==
'\0')
2525 obj->
SetData().push_back(field);
2528 if (obj->
GetData().size() < 2) {
2534 field->
SetLabel().SetStr(
"StructuredCommentSuffix");
2536 field->
SetData().SetStr() +=
"-END##";
2538 obj->
SetData().push_back(field);
2552 char*
tag =
nullptr;
2558 if (!
str || *
str ==
'\0')
2567 if (*q ==
'~' || (*q ==
'#' && q >
str && *--q ==
'#') || q ==
str)
2569 if (q[0] !=
'#' || q[1] !=
'#') {
2603 for (vnp = tagvnp; vnp; vnp = vnp->
next) {
2645 objs.push_back(cur);
2672 while (fgets(
buf, 1023, fd)) {
2673 if (
buf[0] ==
'>' && ret[0] !=
'\0')
2684 if (seq_entry.
IsSeq()) {
2687 }
else if (seq_entry.
IsSet()) {
2695 for (TSeqdescList::iterator descr = descrs->begin(); descr != descrs->end();) {
2696 if (! (*descr)->IsUser()) {
2708 descr = descrs->erase(descr);
2717 bool got_comment =
false;
2718 bool got_dblink =
false;
2720 for (
const auto& descr : bioseq.
GetDescr().
Get()) {
2721 if (! descr->IsUser())
2730 if (user_type_str ==
"StructuredComment")
2732 else if (user_type_str ==
"GenomeProjectsDB")
2734 else if (user_type_str ==
"DBLink") {
2735 for (
const auto& field : user_obj.
GetData()) {
2736 if (! field->IsSetLabel() || ! field->GetLabel().IsStr() ||
2737 field->GetLabel().GetStr() !=
"BioProject")
2765 for (
auto& descr : bioseq.
SetDescr().Set()) {
2766 if (descr->IsMolinfo()) {
2767 mol_info = &descr->SetMolinfo();
2779 bioseq.
SetDescr().Set().push_back(descr);
2789 ErrPostEx(
SEV_INFO,
ERR_SEQUENCE_HasManyComponents,
"An OnlyNearFeatures FeatureFetchPolicy User-object has been added to this record because it is constructed from %d components, which exceeds the threshold of 999 for User-object creation.", num);
2796 field->
SetLabel().SetStr(
"Policy");
2797 field->
SetData().SetStr(
"OnlyNearFeatures");
2801 bsp.
SetDescr().Set().push_back(descr);
2807 for (
size_t i =
str.find(
"{ECO:");
i != string::npos;
i =
str.find(
"{ECO:",
i)) {
2808 size_t j =
str.find(
'}',
i);
2809 if (j == string::npos)
2812 if (
i > 0 &&
str[
i - 1] ==
' ')
2814 if (
i > 0 && j <
str.size()) {
2815 if ((
str[
i - 1] ==
'.' &&
str[j] ==
'.') ||
2816 (
str[
i - 1] ==
';' &&
str[j] ==
';')) {
2820 str.erase(
i, j -
i);
2833 for (
const auto& field : uop->
GetData()) {
2834 if (! field->IsSetData() || ! field->GetData().IsStrs() || ! field->IsSetNum() || field->GetNum() < 1 ||
2835 ! field->IsSetLabel() || ! field->GetLabel().IsStr() || field->GetLabel().GetStr() !=
"Sequence Read Archive")
2839 if (
str.size() > 2 &&
2840 (
str[0] ==
'D' ||
str[0] ==
'E' ||
str[0] ==
'S') &&
str[1] ==
'R' &&
2841 (
str[2] ==
'R' ||
str[2] ==
'X' ||
str[2] ==
'Z')) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static bool s_IsConOrScaffold(CBioseq_Handle bsh)
static void CreateSeqGap(CSeq_literal &seq_lit, GapFeatsPtr gfp)
static void fta_fix_seq_id(CSeq_loc &loc, CSeq_id &id, IndexblkPtr ibp, const char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
bool no_reference(const CBioseq &bioseq)
void SeqToDelta(CBioseq &bioseq, Int2 tech)
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
static void fta_validate_assembly(char *name)
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
static bool fta_ranges_to_hist(const CGB_block::TExtra_accessions &extra_accs)
static int sGetPrefixLength(const CTempString &accession)
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
bool g_DoesNotReferencePrimary(const CDelta_ext &delta_ext, const CSeq_id &primary, CScope &scope)
static bool fta_validate_bioproject(char *name, Parser::ESource source)
bool fta_if_valid_biosample(const Char *id, bool dblink)
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
bool fta_strings_same(const char *s1, const char *s2)
bool check_cds(const DataBlk &entry, Parser::EFormat format)
static bool s_IsAccession(const CSeq_id &id)
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
bool fta_if_valid_sra(const Char *id, bool dblink)
static void fta_do_fix_seq_loc_id(TSeqLocList &locs, IndexblkPtr ibp, const char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
static ValNodePtr fta_tokenize_dblink(char *str, Parser::ESource source)
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
static ValNodePtr fta_vnp_structured_comment(char *buf)
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
char * StringRStr(char *where, const char *what)
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
void fta_get_dblink_user_object(TSeqdescList &descrs, char *offset, size_t len, Parser::ESource source, bool *drop, CRef< CUser_object > &dbuop)
bool fta_number_is_huge(const Char *s)
void err_install(const Indexblk *ibp, bool accver)
string tata_save(string_view t)
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
static void fta_tpa_block_free(FTATpaBlockPtr ftbp)
static ValNodePtr fta_tokenize_project(char *str, Parser::ESource source, bool newstyle)
static CRef< CUser_object > fta_build_structured_comment(char *tag, char *buf)
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
void StripECO(string &str)
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
void ShrinkSpaces(char *line)
TSeqPos GetLength(void) const
@Imp_feat.hpp User-defined methods of the data storage class.
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
#define ParFlat_COL_DATA_EMBL
#define ERR_REFERENCE_Illegalreference
#define ERR_LOCATION_TpaAndNonTpa
#define ERR_TPA_SpanLengthDiff
#define ERR_COMMENT_SameStructuredCommentTags
#define ERR_TSA_SpanLengthDiff
#define ERR_TPA_InvalidPrimarySeqId
#define ERR_TSA_SpanDiffOver300bp
#define ERR_ENTRY_TLSLacksBioProjectLink
#define ERR_FORMAT_InvalidBioProjectAcc
#define ERR_TPA_IncompleteCoverage
#define ERR_DBLINK_InvalidIdentifier
#define ERR_SEQUENCE_HasManyComponents
#define ERR_LOCATION_CrossDatabaseFeatLoc
#define ERR_COMMENT_StructuredCommentLacksDelim
#define ERR_TPA_InvalidPrimarySpan
#define ERR_ENTRY_TSALacksStructuredComment
#define ERR_FORMAT_WrongBioProjectPrefix
#define ERR_LOCATION_SeqIdProblem
#define ERR_SEQUENCE_MultipleWGSProjects
#define ERR_TSA_IncompleteCoverage
#define ERR_ACCESSION_CannotGetDivForSecondary
#define ERR_ENTRY_TSALacksBioProjectLink
#define ERR_TPA_SpanDiffOver300bp
#define ERR_FORMAT_ContigVersusAssemblyGapMissmatch
#define ERR_TSA_InvalidPrimaryBlock
#define ERR_TSA_InvalidPrimarySpan
#define ERR_FEATURE_AllNsBetweenGaps
#define ERR_FEATURE_InvalidGapSequence
#define ERR_FORMAT_IncorrectDBLINK
#define ERR_FEATURE_NsAbutGap
#define ERR_ENTRY_TLSLacksStructuredComment
#define ERR_LOCATION_ContigAndScaffold
#define ERR_ACCESSION_WGSPrefixMismatch
#define ERR_DBLINK_DuplicateIdentifierRemoved
#define ERR_SEQUENCE_HTGPossibleShortGap
#define ERR_TPA_InvalidPrimaryBlock
#define ERR_SEQUENCE_HTGPhaseZeroHasGap
#define ERR_COMMENT_InvalidStructuredComment
#define ERR_KEYWORD_MultipleHTGPhases
#define ERR_SEQUENCE_HTGWithoutGaps
#define ERR_TSA_InvalidPrimarySeqId
std::list< std::string > TKeywordList
std::list< CRef< objects::CSeqdesc > > TSeqdescList
std::vector< CRef< objects::CUser_object > > TUserObjVector
bool StringEquNI(const char *s1, const char *s2, size_t n)
bool StringEquN(const char *s1, const char *s2, size_t n)
bool StringEqu(const char *s1, const char *s2)
void StringNCpy(char *d, const char *s, size_t n)
size_t StringLen(const char *s)
char * StringRChr(char *s, const char c)
void FtaInstallPrefix(int prefix, const char *name, const char *location)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const char * str(char *buf, int n)
static const char location[]
unsigned int TSeqPos
Type for sequence locations and lengths.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
static E_Choice GetAccType(EAccessionInfo info)
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
const CSeq_id & GetSeq_id(void) const
Get seq_id of the current location.
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
CSeq_id_Handle GetAccVer(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get accession.version Seq-id Returns null CSeq_id_Handle if the sequence is not found or if it doesn'...
TBioseqHandles GetBioseqHandles(const TIds &ids)
Get bioseq handles for all ids.
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
bool IsSetInst_Repr(void) const
TInst_Repr GetInst_Repr(void) const
TObjectType * GetNCPointer(void) const THROWS_NONE
Get pointer,.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
NCBI_NS_STD::string::size_type SIZE_TYPE
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
const_iterator begin() const
Return an iterator to the string's starting position.
static const char label[]
list< string > TExtra_accessions
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
Tdata & Set(void)
Assign a value to data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
void SetDim(TDim value)
Assign a value to Dim data member.
void SetType(TType value)
Assign a value to Type data member.
@ eType_partial
mapping pieces together
const TKey & GetKey(void) const
Get the Key member data.
void SetTo(TTo value)
Assign a value to To data member.
bool IsGenbank(void) const
Check if variant Genbank is selected.
TGeneral & SetGeneral(void)
Select the variant.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsTpg(void) const
Check if variant Tpg is selected.
bool IsEmpty(void) const
Check if variant Empty is selected.
ENa_strand
strand of nucleic acid
bool IsPacked_pnt(void) const
Check if variant Packed_pnt is selected.
bool IsTpd(void) const
Check if variant Tpd is selected.
bool IsOther(void) const
Check if variant Other is selected.
void SetId(TId &value)
Assign a value to Id data member.
bool IsEmbl(void) const
Check if variant Embl is selected.
E_Choice Which(void) const
Which variant is currently selected.
void SetFrom(TFrom value)
Assign a value to From data member.
TGi GetGi(void) const
Get the variant data.
bool IsWhole(void) const
Check if variant Whole is selected.
bool IsInt(void) const
Check if variant Int is selected.
void SetStrand(TStrand value)
Assign a value to Strand data member.
bool IsTpe(void) const
Check if variant Tpe is selected.
bool IsPnt(void) const
Check if variant Pnt is selected.
const TAccession & GetAccession(void) const
Get the Accession member data.
bool IsDdbj(void) const
Check if variant Ddbj is selected.
@ e_General
for other databases
@ e_Gi
GenInfo Integrated Database.
@ e_Named_annot_track
Internal named annotation tracking ID.
@ e_not_set
No variant selected.
@ e_Tpg
Third Party Annot/Seq Genbank.
const TSeq & GetSeq(void) const
Get the variant data.
TSet & SetSet(void)
Select the variant.
const TSet & GetSet(void) const
Get the variant data.
bool IsSeq(void) const
Check if variant Seq is selected.
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
TRepr GetRepr(void) const
Get the Repr member data.
list< CRef< CSeq_align > > TAssembly
bool IsSetSeq_data(void) const
the sequence Check if a value has been assigned to Seq_data data member.
const TInst & GetInst(void) const
Get the Inst member data.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TTopology GetTopology(void) const
Get the Topology member data.
const TIupacna & GetIupacna(void) const
Get the variant data.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
bool IsSetExt(void) const
extensions for special types Check if a value has been assigned to Ext data member.
const Tdata & Get(void) const
Get the member data.
void SetType(TType value)
Assign a value to Type data member.
bool IsDelta(void) const
Check if variant Delta is selected.
void SetInst(TInst &value)
Assign a value to Inst data member.
const TExt & GetExt(void) const
Get the Ext member data.
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
const TDelta & GetDelta(void) const
Get the variant data.
TUser & SetUser(void)
Select the variant.
const Tdata & Get(void) const
Get the member data.
list< CRef< CDelta_seq > > Tdata
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
void SetTech(TTech value)
Assign a value to Tech data member.
const TDescr & GetDescr(void) const
Get the Descr member data.
TMolinfo & SetMolinfo(void)
Select the variant.
@ eRepr_delta
sequence made by changes (delta) to others
@ eCompleteness_complete
complete biological entity
@ eTech_htgs_2
ordered High Throughput sequence contig
@ eTech_htgs_3
finished High Throughput sequence
@ eTech_htgs_1
unordered High Throughput sequence contig
@ eTech_wgs
whole genome shotgun sequencing
@ eTech_htgs_0
single genomic reads for coordination
unsigned int
A callback function used to compare two keys in a database.
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
int fta_if_wgs_acc(string_view accession)
bool isSupportedAccession(CSeq_id::E_Choice type)
Int4 IsNewAccessFormat(const Char *acnum)
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
static const BitmapCharRec ch1
static const BitmapCharRec ch2
std::list< SeqLoc > TSeqLocList
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence
objects::CSeq_gap::TType asn_gap_type
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
void fta_StringCpy(char *dst, const char *src)
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
ValNodePtr ValNodeFree(ValNodePtr vnp)
ValNodePtr ValNodeFreeData(ValNodePtr vnp)