80 const CBioseq& bioseq = context.CurrentBioseq();
85 auto bsh = context.GetBioseqHandle(bioseq);
87 sequence::CDeflineGenerator deflineGenerator;
88 auto defline = deflineGenerator.GenerateDefline(bsh, 0);
91 for (
const auto& desc : context.GetSeqdesc()) {
93 m_Objs[defline].Add(*context.SeqdescObjRef(desc));
98 m_Objs[defline].Add(*context.BioseqObjRef());
106 if (m_Objs.empty()) {
109 bool all_unique =
true;
111 for (
auto& it : m_Objs.GetMap()) {
113 if (list.size() == 1) {
116 else if (list.size() > 1) {
125 m_ReportItems =
tmp.Export(*this)->GetSubitems();
133 const CBioseq& bioseq = context.CurrentBioseq();
137 m_Objs[
"[n] sequence[s] [has] terminal Ns"].Fatal().Add(*context.BioseqObjRef());
147 const CBioseq& bioseq = context.CurrentBioseq();
151 m_Objs[
"[n] protein sequences are shorter than 50 aa."].Add(*context.BioseqObjRef(),
false);
161 for (
const auto& desc : context.GetSeqdesc()) {
162 if (desc.IsComment()) {
163 m_Objs[desc.GetComment()].Add(*context.SeqdescObjRef(desc));
171 if (!m_Objs.empty()) {
173 string label = m_Objs.GetMap().size() == 1 ?
"[n] comment descriptor[s] were found (all same)" :
"[n] comment descriptor[s] were found (some different)";
174 for (
auto it : m_Objs.GetMap()) {
175 for (
auto obj : it.second->GetObjects()) {
186 DISCREPANCY_CASE(MRNA_ON_WRONG_SEQUENCE_TYPE, SEQUENCE,
eDisc |
eOncaller,
"Eukaryotic sequences that are not genomic or macronuclear should not have mRNA features")
188 const CBioseq& bioseq = context.CurrentBioseq();
196 const CSeqdesc* biosrc = context.GetBiosource();
200 !context.IsEukaryotic(&biosrc->
GetSource())) {
203 for (
const CSeq_feat* feat : context.FeatMRNAs()) {
204 m_Objs[
"[n] mRNA[s] [is] located on eukaryotic sequence[s] that [does] not have genomic or plasmid source[s]"].Add(*context.SeqFeatObjRef(*feat));
215 const CBioseq& bioseq = context.CurrentBioseq();
218 bool has_gaps = !!sum.
Gaps;
222 if (it->IsFtable()) {
247 const CBioseq& bioseq = context.CurrentBioseq();
249 for (
const auto& desc : context.GetAllSeqdesc()) {
253 for (
const auto& user_field : user.
GetData()) {
254 if (user_field->IsSetLabel() && user_field->GetLabel().IsStr() && user_field->GetLabel().GetStr() ==
"BioProject" && user_field->IsSetData() && user_field->GetData().IsStrs()) {
256 if (!strs.empty() && !strs[0].empty()) {
257 m_Objs[
"[n] sequence[s] contain[S] BioProject IDs"].Add(*context.BioseqObjRef());
273 const CBioseq& bioseq = context.CurrentBioseq();
275 m_Objs[
"[n] bioseq[s] [has] no definition line"].Add(*context.BioseqObjRef());
285 const CBioseq& bioseq = context.CurrentBioseq();
289 m_Objs[
"[n] sequence[s] [has] runs of 15 or more Ns"].Add(*context.BioseqObjRef());
299 const CBioseq& bioseq = context.CurrentBioseq();
303 m_Objs[
"[n] sequence[s] [has] external references"].Add(*context.BioseqObjRef());
313 const double MIN_N_PERCENTAGE = 10.0;
315 const CBioseq& bioseq = context.CurrentBioseq();
318 if (!sum.
HasRef && sum.
N * 100. / sum.
Len > MIN_N_PERCENTAGE) {
319 m_Objs[
"[n] sequence[s] [has] > 10% Ns"].Add(*context.BioseqObjRef());
330 for (
const auto& feat : context.GetFeat()) {
335 m_Objs[
key +
": [n] present"].Info().Incr();
338 const CBioseq& bioseq = context.CurrentBioseq();
344 for (
const auto& feat : context.GetAllFeat()) {
349 key = to_string(feat.GetData().GetSubtype()) +
" " +
key;
360 for (
auto& it : m_Objs[
kEmptyCStr].GetMap()) {
361 if (it.first ==
"N" || it.first ==
"A") {
364 size_t n = it.first.find(
' ');
365 string key = it.first.substr(
n + 1);
367 string label =
key +
": [n] present";
379 for (
auto& obj : m_Objs[
kEmptyStr][it.first].GetObjects()) {
382 for (
auto& pp : obj2num) {
383 m_Objs[
label][
"[n] bioseq[s] [has] [(]" + to_string(pp.second) +
"[)] " +
key +
" features"].Info().Add(*pp.first);
398 if (context.FeatExons().size()) {
408 const CBioseq* seq =
dynamic_cast<const CBioseq*
>(context.FindObject(*obj));
431 const CBioseq& bioseq = context.CurrentBioseq();
440 m_Objs[
kEmptyStr].Add(*context.BioseqObjRef());
452 if (m_Objs.empty()) {
463 size_t num_of_missing = 0,
466 for (
auto it : the_map) {
467 num_of_bioseqs += it.second->GetObjects().
size();
468 if (it.first.empty()) {
469 num_of_missing += it.second->GetObjects().size();
475 else if (tech != it.first) {
480 if (num_of_missing == num_of_bioseqs || (same && !num_of_missing)) {
483 summary += num_of_missing ?
"some missing, " :
"all present, ";
484 summary += same ?
"all same)" :
"some different)";
485 if (num_of_missing) {
486 if (num_of_missing == num_of_bioseqs) {
487 report[summary].
SetCount(num_of_missing);
502 return (ch ==
'A' || ch ==
'T' || ch ==
'G' || ch ==
'C');
508 static const size_t MIN_TITLE_SEQ_LEN = 19;
511 for (string::const_reverse_iterator it = title.rbegin(); it != title.rend(); ++it) {
518 if (count >= MIN_TITLE_SEQ_LEN) {
523 return count >= MIN_TITLE_SEQ_LEN;
529 for (
auto& desc : context.GetSeqdesc()) {
531 m_Objs[
"[n] defline[s] appear[S] to end with sequence characters"].Add(*context.SeqdescObjRef(desc));
542 bool is_genomic =
false;
543 const CBioseq& bioseq = context.CurrentBioseq();
547 auto molinfo = context.GetMolinfo();
551 if (!is_genomic || !is_dna) {
554 for (
auto& annot_it : bioseq.
GetAnnot()) {
555 if (annot_it->IsFtable()) {
563 if (feat->IsSetData()) {
566 m_Objs[
"[n] sequence[s] [has] rRNA or misc_RNA features but [is] not genomic DNA"].Add(*context.BioseqObjRef(
CDiscrepancyContext::eFixSelf));
577 const CBioseq* seq =
dynamic_cast<const CBioseq*
>(context.FindObject(*obj));
582 if (descrs.
IsSet()) {
583 for (
auto descr : descrs.
Set()) {
584 if (descr->IsMolinfo()) {
585 molinfo = &(descr->SetMolinfo());
590 if (molinfo ==
nullptr) {
593 descrs.
Set().push_back(new_descr);
595 if (molinfo ==
nullptr) {
642 const string& object_name,
648 for (
auto& z : obj.second->GetMap()) {
649 collector[field_prefix + z.first][
" [n] " + object_name +
"[s] [is] missing field " + field_prefix + z.first]
660 if (
f->IsSetLabel() &&
f->GetLabel().IsStr() &&
f->IsSetData()) {
661 string field_name = field_prefix +
f->GetLabel().GetStr();
663 if (already_seen && !collector.
Exist(field_name)) {
665 string missing_label =
"[n] " + object_name +
"[s] [is] missing field " + field_name;
669 collector[field_name][missing_label].
Add(*ro);
681 collector[field_prefix + z.first][
" [n] " + object_name +
"[s] [is] missing field " + field_prefix + z.first].
Add(*context.
SeqdescObjRef(*desc));
696 const CBioseq& bioseq = context.CurrentBioseq();
699 auto rep_seq = context.BioseqObjRef();
700 for (
auto& desc : context.GetAllSeqdesc()) {
721 size_t num_values = 0;
724 for (
auto& s : node.
GetMap()) {
731 value = s.first.substr(pos);
739 if (num_values > 1) {
754 for (
auto& s : node.
GetMap()) {
755 bool this_present =
true;
756 bool this_same =
true;
758 all_present &= this_present;
759 all_same &= this_same;
760 if (!all_present && !all_same) {
769 string summary =
"(";
771 summary +=
"all present";
773 summary +=
"some missing";
777 summary +=
"all same";
779 summary +=
"inconsistent";
788 for (
auto& s : original.
GetMap()){
789 for (
auto q : s.second->GetObjects()) {
790 new_home[s.first].
Add(*q);
802 return " " + orig_field_name;
803 }
else if (
NStr::Equal(orig_field_name,
"ProbeDB")) {
804 return " " + orig_field_name;
805 }
else if (
NStr::Equal(orig_field_name,
"Sequence Read Archive")) {
806 return " " + orig_field_name;
807 }
else if (
NStr::Equal(orig_field_name,
"BioProject")) {
808 return " " + orig_field_name;
809 }
else if (
NStr::Equal(orig_field_name,
"Assembly")) {
810 return " " + orig_field_name;
812 return orig_field_name;
821 if (m_Objs.empty()) {
826 bool all_present =
true;
827 bool all_same =
true;
829 if (all_present && all_same) {
833 string top_label =
"DBLink Report " +
GetSummaryLabel(all_present, all_same);
836 while (it != m_Objs.GetMap().end()) {
839 CopyNode(m_Objs[top_label][
" " + it->first], *it->second);
840 it = m_Objs.GetMap().erase(it);
847 bool this_present =
true;
848 bool this_same =
true;
851 for (
auto& s : it2.second->GetMap()){
852 for (
auto& q : s.second->GetObjects()) {
853 m_Objs[top_label][new_label][s.first].Add(*q);
873 const CBioseq& bioseq = context.CurrentBioseq();
875 auto rep_seq = context.BioseqObjRef();
876 for (
auto& desc : context.GetAllSeqdesc()) {
892 m_Objs[
"[n] Bioseq[s] [is] missing " + it.first +
" structured comment"].Add(*rep_seq);
900 m_Objs[
"[n] Bioseq[s] [is] missing " + it.first +
" structured comment"].Add(*ro);
922 if (m_Objs.empty()) {
927 bool all_present =
true;
928 bool all_same =
true;
930 if (all_present && all_same) {
934 string top_label =
"Structured Comment Report " +
GetSummaryLabel(all_present, all_same);
937 while (it != m_Objs.GetMap().end()) {
940 CopyNode(m_Objs[top_label][
" " + it->first], *it->second);
941 it = m_Objs.GetMap().erase(it);
948 bool this_present =
true;
949 bool this_same =
true;
951 string new_label = it2.first +
" " +
GetSummaryLabel(this_present, this_same);
952 for (
auto& s : it2.second->GetMap()) {
953 string sub_label = s.first;
954 if (this_present && this_same) {
957 for (
auto& q : s.second->GetObjects()) {
958 m_Objs[top_label][new_label][sub_label].Add(*q);
972 const CBioseq& bioseq = context.CurrentBioseq();
974 for (
auto& desc : context.GetAllSeqdesc()) {
982 m_Objs[
"[n] sequence[s] [does] not include structured comments."].Add(*context.BioseqObjRef());
991 const CBioseq& bioseq = context.CurrentBioseq();
993 for (
auto& desc : context.GetAllSeqdesc()) {
998 for (
auto& it : user.
GetData()) {
999 if (it->IsSetLabel() && it->GetLabel().IsStr() &&
NStr::Equal(it->GetLabel().GetStr(),
"BioProject")) {
1010 m_Objs[
"[n] sequence[s] [does] not include project."].Add(*context.BioseqObjRef());
1019 const CBioseq& bioseq = context.CurrentBioseq();
1021 for (
auto& desc : context.GetAllSeqdesc()) {
1022 if (desc.IsUser()) {
1025 m_Objs[
"[n] sequence[s] [is] unverified"].Add(*context.BioseqObjRef(),
false);
1040 const CBioseq& bioseq = context.CurrentBioseq();
1051 const CBioseq& bioseq = context.CurrentBioseq();
1055 m_Objs[
"[n] sequence[s] contain[S] nucleotides that are not ATCG or N"].Add(*context.BioseqObjRef());
1067 const CBioseq& bioseq = context.CurrentBioseq();
1071 if (
source &&
source->IsSource() &&
source->GetSource().IsSetOrg() &&
source->GetSource().GetOrg().IsSetTaxname() && title) {
1072 string taxname =
source->GetSource().GetOrg().GetTaxname();
1079 bool no_taxname_in_defline =
false;
1081 if (taxname_pos ==
NPOS) {
1082 no_taxname_in_defline =
true;
1086 no_taxname_in_defline =
NStr::CompareCase(title->
GetTitle().c_str() + taxname_pos, 1, taxname.size() - 1, taxname.c_str() + 1) != 0;
1088 no_taxname_in_defline =
true;
1091 if (no_taxname_in_defline) {
1105 for (
auto field: user.
GetData()) {
1106 if (field->IsSetData() && field->GetData().IsInt() && field->IsSetLabel() && field->GetLabel().IsStr() && field->GetLabel().GetStr() ==
"ProjectID") {
1117 const CBioseq& bioseq = context.CurrentBioseq();
1119 for (
auto& desc : context.GetAllSeqdesc()) {
1120 if (desc.IsUser()) {
1124 if (!proj_id.empty()) {
1125 m_Objs[proj_id][bioseq.
IsNa() ?
"N" :
"A"].Add(*context.BioseqObjRef());
1136 if (m_Objs.empty()) {
1140 string all =
"[n] sequence[s] [has] project IDs ";
1141 string prots =
"[n] protein sequence[s] [has] project IDs ";
1142 string nucs =
"[n] nucleotide sequence[s] [has] project IDs ";
1143 auto& projects = m_Objs.GetMap();
1144 all += projects.size() > 1 ?
"(some different)" :
"(all same)";
1145 size_t count_prots = 0;
1146 size_t count_nucs = 0;
1147 for (
auto it: projects) {
1148 auto& M = it.second->GetMap();
1149 if (M.find(
"A") != M.end()) {
1152 if (M.find(
"N") != M.end()) {
1156 prots += count_prots > 1 ?
"(some different)" :
"(all same)";
1157 nucs += count_nucs > 1 ?
"(some different)" :
"(all same)";
1158 for (
auto it : projects) {
1159 auto& M = it.second->GetMap();
1160 if (M.find(
"A") != M.end()) {
1162 res[
all][prots].
Add(*obj);
1165 if (M.find(
"N") != M.end()) {
1167 res[
all][nucs].
Add(*obj);
1182 auto& cds = context.FeatCDS();
1183 if (cds.size() < 2) {
1186 size_t count_pseudo = 0;
1187 size_t count_disrupt = 0;
1188 for (
auto feat : cds) {
1189 if (feat->IsSetComment() &&
NStr::Find(feat->GetComment(),
"coding region disrupted by sequencing gap") !=
NPOS) {
1192 if (context.IsPseudo(*feat)) {
1196 if (count_disrupt != cds.size() && count_pseudo != cds.size()) {
1197 m_Objs[
"[n] mRNA bioseq[s] [has] multiple CDS features"].Add(*context.BioseqObjRef());
1211 auto& cds = context.FeatCDS();
1212 size_t count_plus = 0;
1213 size_t count_minus = 0;
1214 for (
auto& feat : cds) {
1237 const CBioseq* seq =
dynamic_cast<const CBioseq*
>(context.FindObject(*obj));
1241 for (; feat_ci; ++feat_ci) {
1246 new_inst->Assign(bioseq.
GetInst());
1263 const size_t MAX_N_IN_SEQ = 7;
1264 const CBioseq& bioseq = context.CurrentBioseq();
1267 if (sum.
MinQ > MAX_N_IN_SEQ) {
1268 m_Objs[
"[n] sequence[s] contain[S] low quality region"].Add(*context.BioseqObjRef());
1279 if (
set.IsSetDescr()) {
1280 for (
const auto& descr :
set.GetDescr().Get()) {
1281 if (descr->IsTitle()) {
1282 m_Objs[
"[n] title[s] on sets were found"].Add(*context.SeqdescObjRef(*descr));
1291 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1299 const CSeqdesc* biosrc = context.GetBiosource();
1301 auto&
all = context.FeatAll();
1302 bool has_D_loop =
false;
1303 bool has_misc_feat_with_control_region =
false;
1304 for (
auto& feat :
all) {
1305 if (feat->IsSetData()) {
1312 has_misc_feat_with_control_region =
true;
1318 if (has_D_loop || has_misc_feat_with_control_region) {
1319 m_Objs[
"[n] bioseq[s] [has] D-loop or control region misc_feature, but [is] do not have mitochondrial source"].Add(*context.BioseqObjRef(
CDiscrepancyContext::eFixSet));
1327 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1349 const CBioseq* seq =
dynamic_cast<const CBioseq*
>(context.FindObject(*obj));
1369 const CBioseq& bioseq = context.CurrentBioseq();
1377 m_Objs[
"[n] sequence[s] [is] shorter than 50 nt"].Add(*context.BioseqObjRef());
1386 const CBioseq& bioseq = context.CurrentBioseq();
1396 for (
auto& annot_it : bioseq.
GetAnnot()) {
1397 if (annot_it->IsFtable()) {
1402 m_Objs[
"[n] contig[s] [is] shorter than 200 nt"].Add(*context.BioseqObjRef(fix));
1409 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1415 const CBioseq* seq =
dynamic_cast<const CBioseq*
>(context.FindObject(*obj));
1427 const CBioseq& bioseq = context.CurrentBioseq();
1429 const CSeqdesc* biosrc = context.GetBiosource();
1431 m_Objs[
"[n] RNA bioseq[s] [is] proviral"].Add(*context.BioseqObjRef());
1439 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1451 if ((bio_src.*is_set_fn)()) {
1453 val = (bio_src.*get_fn)();
1455 else if (
val != (bio_src.*get_fn)()) {
1468 if (
mod->IsSetSubtype() &&
mod->GetSubtype() == subtype &&
mod->IsSetSubname()) {
1473 if (
mod->GetSubname() !=
val) {
1489 for (
const auto& subtype : bio_src.
GetSubtype()) {
1504 string taxname, isolate, strain;
1505 bool all_taxname_same =
true, all_isolate_same =
true, all_strain_same =
true;
1506 for (
auto& descr_bio_src : context.GetSetBiosources()) {
1507 const CBioSource& bio_src = descr_bio_src->GetSource();
1508 if (context.HasLineage(bio_src,
"",
"Viruses")) {
1510 m_Objs[
"[n] biosource[s] should have segment qualifier but [does] not"].Add(*context.SeqdescObjRef(*descr_bio_src));
1513 if (all_taxname_same) {
1516 if (all_isolate_same) {
1519 if (all_strain_same) {
1523 if (!all_taxname_same) {
1524 m_Objs[
"Not all biosources have same taxname"];
1526 if (!all_isolate_same) {
1527 m_Objs[
"Not all biosources have same isolate"];
1529 if (!all_strain_same) {
1530 m_Objs[
"Not all biosources have same strain"];
1538 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1548 for (
auto& qual : feat.
GetQual()) {
1561 const CSeqdesc* biosrc = context.GetBiosource();
1570 for (
auto& feat : context.GetFeat()) {
1578 if (!context.IsBioseq()) {
1580 if (
set.IsSetClass()) {
1583 unsigned char flags = context.ReadFlags();
1585 m_Objs[
"[n] unwanted set wrapper[s]"].Add(*context.BioseqSetObjRef());
1602 {
"Agricultutral",
"agricultural",
false },
1603 {
"Bacilllus",
"Bacillus",
false },
1604 {
"Enviromental",
"Environmental",
false },
1605 {
"Insitiute",
"institute",
false },
1606 {
"Instutite",
"institute",
false },
1607 {
"Instutute",
"Institute",
false },
1608 {
"P.R.Chian",
"P.R. China",
false },
1609 {
"PRChian",
"PR China",
false },
1610 {
"Scieces",
"Sciences",
false },
1611 {
"agricultral",
"agricultural",
false },
1612 {
"agriculturral",
"agricultural",
false },
1613 {
"biotechnlogy",
"biotechnology",
false },
1614 {
"Biotechnlogy",
"Biotechnology",
false },
1615 {
"biotechnolgy",
"biotechnology",
false },
1616 {
"biotechology",
"biotechnology",
false },
1617 {
"caputre",
"capture",
true },
1618 {
"casette",
"cassette",
true },
1619 {
"catalize",
"catalyze",
false },
1620 {
"charaterization",
"characterization",
false },
1621 {
"clonging",
"cloning",
false },
1622 {
"consevered",
"conserved",
false },
1623 {
"cotaining",
"containing",
false },
1624 {
"cytochome",
"cytochrome",
true },
1625 {
"diveristy",
"diversity",
true },
1626 {
"enivronment",
"environment",
false },
1627 {
"enviroment",
"environment",
false },
1628 {
"genone",
"genome",
true },
1629 {
"homologue",
"homolog",
true },
1630 {
"hypotethical",
"hypothetical",
false },
1631 {
"hypotetical",
"hypothetical",
false },
1632 {
"hypothetcial",
"hypothetical",
false },
1633 {
"hypothteical",
"hypothetical",
false },
1634 {
"indepedent",
"independent",
false },
1635 {
"insititute",
"institute",
false },
1636 {
"insitute",
"institute",
false },
1637 {
"institue",
"institute",
false },
1638 {
"instute",
"institute",
false },
1639 {
"muesum",
"museum",
true },
1640 {
"musuem",
"museum",
true },
1641 {
"nuclear shutting",
"nuclear shuttling",
true },
1642 {
"phylogentic",
"phylogenetic",
false },
1643 {
"protien",
"protein",
false },
1644 {
"puatative",
"putative",
false },
1645 {
"putaitve",
"putative",
false },
1646 {
"putaive",
"putative",
false },
1647 {
"putataive",
"putative",
false },
1648 {
"putatitve",
"putative",
false },
1649 {
"putatuve",
"putative",
false },
1650 {
"putatvie",
"putative",
false },
1651 {
"pylogeny",
"phylogeny",
false },
1652 {
"resaerch",
"research",
false },
1653 {
"reseach",
"research",
false },
1654 {
"reserach",
"research",
true },
1655 {
"reserch",
"research",
false },
1656 {
"ribosoml",
"ribosomal",
false },
1657 {
"ribossomal",
"ribosomal",
false },
1658 {
"scencies",
"sciences",
false },
1659 {
"scinece",
"science",
false },
1660 {
"simmilar",
"similar",
false },
1661 {
"structual",
"structural",
false },
1662 {
"subitilus",
"subtilis",
false },
1663 {
"sulfer",
"sulfur",
false },
1664 {
"technlogy",
"technology",
false },
1665 {
"technolgy",
"technology",
false },
1666 {
"Technlogy",
"Technology",
false },
1667 {
"Veterinry",
"Veterinary",
false },
1668 {
"Argricultural",
"Agricultural",
false },
1669 {
"transcirbed",
"transcribed",
false },
1670 {
"transcirption",
"transcription",
true },
1671 {
"uiniversity",
"university",
false },
1672 {
"uinversity",
"university",
false },
1673 {
"univercity",
"university",
false },
1674 {
"univerisity",
"university",
false },
1675 {
"univeristy",
"university",
false },
1676 {
"univesity",
"university",
false },
1677 {
"unversity",
"university",
true },
1678 {
"uviversity",
"university",
false },
1679 {
"anaemia",
nullptr,
false },
1680 {
"haem",
nullptr,
false },
1681 {
"haemagglutination",
nullptr,
false },
1682 {
"heam",
nullptr,
false },
1683 {
"mithocon",
nullptr,
false },
1693 #include "FLATFILE_FIND.inc"
1694 static constexpr TLocalFSM s_FSM{s_compact, s_hits_init_1, s_hits_init_2, s_states,
nullptr};
1706 string error =
"String not found: ";
1719 "FLATFILE_FIND_ONCALLER",
1720 "FLATFILE_FIND_ONCALLER_UNFIXABLE",
1721 "FLATFILE_FIND_ONCALLER_FIXABLE"
1725 for (
auto& desc : context.GetAllSeqdesc()) {
1735 m_Objs[fixable][subitem].Add(*context.SeqdescObjRef(desc, &desc));
1739 for (
auto& feat: context.FeatAll()) {
1749 m_Objs[fixable][subitem].Add(*context.SeqFeatObjRef(*feat, feat));
1758 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1781 const CSeqdesc* desc =
dynamic_cast<const CSeqdesc*
>(context.FindObject(*obj));
1827 if (m_Objs.GetMap().find(
kEmptyStr) == m_Objs.GetMap().end()) {
1829 m_Objs[
"No sequences longer than 20,000 nt found"];
1834 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1841 const CBioseq& bioseq = context.CurrentBioseq();
1843 if (m_Objs[
"N"].GetCount()) {
1847 const CSeqdesc* biosrc = context.GetBiosource();
1852 if (!m_Objs[
"F"].GetCount()) {
1854 for (
auto id : bioseq.
GetId()) {
1866 for (
const auto& descr : bioseq.
GetDescr().
Get()) {
1867 if (descr->IsMolinfo() && descr->GetMolinfo().CanGetTech()) {
1887 if (m_Objs[
"C"].GetCount() && !m_Objs[
"N"].GetCount()) {
1898 static CRegexp regexp(
"chromosome|plasmid|mito|chloroplast|apicoplast|plastid|^chr|^lg|\\bNW_|\\bNZ_|\\bNM_|\\bNC_|\\bAC_|CP\\d\\d\\d\\d\\d\\d",
CRegexp::fCompile_ignore_case);
1904 const CBioseq& bioseq = context.CurrentBioseq();
1906 bool report =
false;
1907 for (
const auto&
id : bioseq.
GetId()) {
1908 if (id->IsLocal()) {
1909 if (id->GetLocal().IsStr() &&
SuspiciousId(id->GetLocal().GetStr())) {
1914 else if (id->IsGeneral()) {
1915 if (id->GetGeneral().IsSetDb() &&
SuspiciousId(id->GetGeneral().GetDb())) {
1919 if (id->GetGeneral().IsSetTag() && id->GetGeneral().GetTag().IsStr() &&
SuspiciousId(id->GetGeneral().GetTag().GetStr())) {
1926 m_Objs[
"[n] sequence[s] [has] suspicious identifiers"].Add(*context.BioseqSetObjRef());
1934 m_ReportItems = m_Objs.Export(*
this,
false)->GetSubitems();
1973 if (
set.IsSetSeq_set()) {
1974 for (
const auto& se :
set.GetSeq_set()) {
1975 if (!se->IsSetDescr()) {
1979 for (
const auto& descr : se->GetDescr().Get()) {
1980 if (!descr->IsSource()) {
1983 const CBioSource& bio_src = descr->GetSource();
1995 for (
const auto& subtype : bio_src.
GetSubtype()) {
1997 if (subtype->IsSetSubtype()) {
2001 m_Objs[
"one or more chromosomes are present"];
2006 m_Objs[
"one or more chromosomes are present"];
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
const string & GetTaxname(void) const
bool IsSetOrgMod(void) const
const COrgName & GetOrgname(void) const
bool IsSetTaxname(void) const
TSeqPos GetLength(void) const
bool IsSetLength(void) const
CRef< CDiscrepancyObject > SeqdescObjRef(const CSeqdesc &desc, const CObject *fix=nullptr, const CObject *more=nullptr)
void Search(const char *input, VoidCall1 found_callback) const
virtual vector< CRef< CReportItem > > GetSubitems() const =0
static void Add(TReportObjectList &list, TReportObjectSet &hash, CReportObj &obj, bool unique=true)
TReportObjectList & GetObjects()
CReportNode & Severity(CReportItem::ESeverity s)
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
static bool Exist(TReportObjectSet &hash, CReportObj &obj)
static EFeatureLocationAllowed AllowedFeatureLocation(ESubtype subtype)
@ eFeatureLocationAllowed_NucOnly
@ eFeatureLocationAllowed_ProtOnly
@ eFeatureLocationAllowed_Any
ESubtype GetSubtype(void) const
@Seq_descr.hpp User-defined methods of the data storage class.
CSeq_feat_EditHandle –.
namespace ncbi::objects::
static bool IsAa(EMol mol)
static bool IsNa(EMol mol)
Base class for all serializable objects.
Template class for iteration on objects of class C (non-modifiable version)
Template class for iteration on objects of class C.
@ eObjectType_StructuredComment
EObjectType GetObjectType() const
container_type::iterator iterator
API (CDeflineGenerator) for computing sequences' titles ("definitions").
vector< CRef< CReportObj > > TReportObjectList
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE1(name, type, group, descr,...)
#define DISCREPANCY_CASE0(name, sname, type, group, descr)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
vector< CConstRef< CObject > > GetObjects(CSeq_entry_Handle seh, const string &field, CFieldNamePanel::EFieldType field_type, int subtype, const string &ncRNA_class, CConstRef< objects::CSeq_submit > submit, CRef< CEditingActionConstraint > constraint, vector< CSeq_entry_Handle > *descr_context=nullptr)
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
constexpr size_t ArraySize(const Element(&)[Size])
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
EAccessionInfo
For IdentifyAccession (below)
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
void SetDescr(TDescr &v) const
void SetInst_Mol(TInst_Mol v) const
const CSeqFeatData & GetData(void) const
void Remove(void) const
Remove the feature from Seq-annot.
void SetInst(TInst &v) const
void Remove(ERemoveMode mode=eRemoveSeq_entry) const
bool IsSetData(void) const
const TInst & GetInst(void) const
const CSeq_feat & GetMappedFeature(void) const
Feature mapped to the master sequence.
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
bool IsMatch(CTempString str, TMatch flags=fMatch_default)
Check existence substring which match a specified pattern.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
NCBI_NS_STD::string::size_type SIZE_TYPE
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
static int CompareCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive compare of a substring with another string.
@ eNocase
Case insensitive compare.
static const char label[]
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
TGenome GetGenome(void) const
Get the Genome member data.
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
void SetGenome(TGenome value)
Assign a value to Genome data member.
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
EGenome
biological context
const TStr & GetStr(void) const
Get the variant data.
bool IsSetData(void) const
the object itself. Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetType(void) const
type of object within class. Check if a value has been assigned to Type data member.
bool IsStrs(void) const
Check if variant Strs is selected.
const TStrs & GetStrs(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool IsStr(void) const
Check if variant Str is selected.
const TStr & GetStr(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TType & GetType(void) const
Get the Type member data.
vector< CStringUTF8 > TStrs
const TMod & GetMod(void) const
Get the Mod member data.
bool IsSetData(void) const
the specific data. Check if a value has been assigned to Data data member.
bool IsSetQual(void) const
qualifiers. Check if a value has been assigned to Qual data member.
const TQual & GetQual(void) const
Get the Qual member data.
const TData & GetData(void) const
Get the Data member data.
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
TRepr GetRepr(void) const
Get the Repr member data.
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
const TUser & GetUser(void) const
Get the variant data.
const TInst & GetInst(void) const
Get the Inst member data.
TTopology GetTopology(void) const
Get the Topology member data.
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
bool CanGetTopology(void) const
Check if it is safe to call GetTopology method.
const TTitle & GetTitle(void) const
Get the variant data.
const TSource & GetSource(void) const
Get the variant data.
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
TTech GetTech(void) const
Get the Tech member data.
const Tdata & Get(void) const
Get the member data.
bool IsSetInst(void) const
the sequence data. Check if a value has been assigned to Inst data member.
TLength GetLength(void) const
Get the Length member data.
TMol GetMol(void) const
Get the Mol member data.
bool IsSetLength(void) const
length of sequence in residues. Check if a value has been assigned to Length data member.
TSource & SetSource(void)
Select the variant.
bool IsSetDescr(void) const
descriptors. Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if a value has been assigned to data member.
TBiomol GetBiomol(void) const
Get the Biomol member data.
void SetBiomol(TBiomol value)
Assign a value to Biomol data member.
bool CanGetId(void) const
Check if it is safe to call GetId method.
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
const TFtable & GetFtable(void) const
Get the variant data.
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
const TData & GetData(void) const
Get the Data member data.
bool IsSetId(void) const
equivalent identifiers. Check if a value has been assigned to Id data member.
Tdata & Set(void)
Assign a value to data member.
const TDescr & GetDescr(void) const
Get the Descr member data.
const TMolinfo & GetMolinfo(void) const
Get the variant data.
TMolinfo & SetMolinfo(void)
Select the variant.
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
@ eRepr_delta
sequence made by changes (delta) to others
@ eCompleteness_complete
complete biological entity
@ eTech_targeted
targeted locus sets/studies
@ eTech_tsa
transcriptome shotgun assembly
@ eTech_wgs
whole genome shotgun sequencing
@ eBiomol_pre_RNA
precursor RNA of any sort really
@ e_Source
source of materials, includes Org-ref
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
void ReverseComplementFeature(CSeq_feat &feat, CScope &scope)
Simultaneous search of multiple RegEx patterns in the input string.
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
double f(double x_, const double &y_)
static const char * prefix[]
static const string kFixable
const string kStructuredCommentReport
const string & kPreviouslySeenFields
static const string kMrnaSequenceMinusStrandFeatures
static bool IsSegmentSubtype(const CBioSource &bio_src)
static bool s_areCompatible(CBioSource::EGenome Location, CSubSource::ESubtype Qualifier)
void UnitTest_FLATFILE_FIND()
Checking that FLATFILE_FIND.inc is in sync with kSpellFixes. If the array is changed,...
string AdjustDBLinkFieldName(const string &orig_field_name)
static const CSubSource::ESubtype eSubtype_unknown
const string kMissingDBLink
const string kStructuredCommentObservedPrefixes
const string kSomeIdenticalDeflines
const string &(CBioSource::* FnGet)() const
string GetFieldValueAsString(const CUser_field &field)
const string & kPreviouslySeenObjects
static const string kNonFixable
static bool SuspiciousId(const string &s)
static const string kInconsistentMolinfoTech
void AddUserObjectFieldItems(const CSeqdesc *desc, CReportObj &rep_seq, CReportNode &collector, CReportNode &previously_seen, CDiscrepancyContext &context, const string &object_name, const string &field_prefix=kEmptyStr)
const string kDBLinkObjectList
static const string kInconsistentMolinfoTechSummary
const string kStructuredCommentObservedPrefixesThis
static bool IsATGC(char ch)
string GetSummaryLabel(bool all_present, bool all_same)
static const size_t MIN_SEQUENCE_LEN
const string & kPreviouslySeenFieldsThis
static bool EndsWithSequence(const string &title)
static bool FixTextInObject(CSerialObject *obj, size_t misspell_idx)
void AnalyzeFieldReport(CReportNode &node, bool &all_present, bool &all_same)
const string kSequencesWithGaps
const string kIdenticalDeflines
const string kDeflineExists
static void FindFlatfileText(const char *str, bool *result)
static bool IsMolProd(int biomol)
void AnalyzeField(CReportNode &node, bool &all_present, bool &all_same)
static bool CompareOrGetString(const CBioSource &bio_src, FnIsSet is_set_fn, FnGet get_fn, string &val)
void CopyNode(CReportNode &new_home, CReportNode &original)
const string kStructuredCommentPrevious
const string kNoTaxnameInDefline
static SpellFixData kSpellFixes[]
static bool CompareOrgModValue(const CBioSource &bio_src, COrgMod::TSubtype subtype, string &val)
const string kStructuredCommentFieldPrefix
static const size_t kSpellFixesSize
const string kDBLinkFieldCountTop
static string GetProjectID(const CUser_object &user)
const string kUniqueDeflines
const string kAllUniqueDeflines
static bool IsMicroSatellite(const CSeq_feat &feat)
const string kDBLinkCollect
bool(CBioSource::* FnIsSet)() const
const string kStructuredCommentsSeqs
static const char * str(char *buf, int n)
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
static const char *const features[]