55 using namespace sequence;
58 : m_Feat(feat),
m_Scope(scope), m_Imp(imp), m_ProductIsFar(
false)
68 "The feature is missing a location");
73 bool lowerSev =
false;
77 const CDbtag& dbtag = **it;
78 if ( dbtag.
GetDb() ==
"dbSNP" ) {
86 "Location",
m_Feat, lowerSev);
125 "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
151 if (loc.IsInt() || loc.IsWhole()) {
156 for (CSeq_loc_CI citer(loc); citer; ++citer) {
157 const CSeq_id& this_id = citer.GetSeq_id();
158 if (!
prev || !
prev->Equals(this_id)) {
163 prev.Reset(&this_id);
177 switch (sid.
Which()) {
190 "Feature product should not put an accession in the Textseq-id 'name' slot");
193 "Feature product should not use "
194 "Textseq-id 'name' slot");
209 if (id->Which() == sid.
Which()) {
211 string from_seq =
id->AsFastaString();
216 "Capitalization change from product location on feature to product sequence");
219 switch (id->Which()) {
232 "Protein bioseq has Textseq-id 'name' that "
233 "looks like it is derived from a nucleotide "
237 "Protein bioseq has Textseq-id 'name' and no accession");
254 bool is_seqloc_bond =
false;
258 for (CSeq_loc_CI it(feat.
GetLocation()); it; ++it) {
259 if (it.GetEmbeddingSeq_loc().IsBond()
260 && (!it.GetEmbeddingSeq_loc().GetBond().IsSetA()
261 || it.GetEmbeddingSeq_loc().GetBond().IsSetB())) {
262 is_seqloc_bond =
true;
267 for (CSeq_loc_CI it(feat.
GetLocation()); it; ++it) {
268 if (it.GetEmbeddingSeq_loc().IsBond()) {
269 is_seqloc_bond =
true;
275 for (CSeq_loc_CI it(feat.
GetLocation()); it; ++it) {
276 if (it.GetEmbeddingSeq_loc().IsBond()) {
277 is_seqloc_bond =
true;
282 return is_seqloc_bond;
293 if (both || both_rev) {
295 if (both && both_rev) {
296 suffix =
"(forward and reverse)";
299 }
else if (both_rev) {
306 label +
" may not be on both " +
suffix +
" strands");
315 for (CSeq_loc_CI it(loc); it; ++it) {
316 if (it.IsSetStrand()) {
324 if (both && both_rev) {
333 has_parent_gene_id =
false;
339 has_parent_gene_id =
true;
340 if ((*it)->IsSetTag() && (*it)->GetTag().Equals(
tag)) {
372 bool has_parent_gene_id =
false;
373 if (!
HasGeneIdXref(parent, (*it)->GetTag(), has_parent_gene_id)) {
374 if (has_parent_gene_id ||
380 parent = feat_tree->GetParent(parent);
397 if ((*pi)->IsEquiv()) {
399 "Citation on feature has unexpected internal Pub-equiv");
409 "empty inference string",
410 "bad inference prefix",
411 "bad inference body",
412 "single inference field",
413 "spaces in inference",
414 "possible comment in inference",
415 "same species misused",
416 "the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem =",
417 "bad inference accession version",
418 "accession.version not public",
419 "bad accession type",
420 "unrecognized database",
435 "Qualifier other than replace has just quotation marks");
442 "Inference qualifier problem - empty inference string ()");
449 qual.
GetVal() +
" is not in proper EC_number format");
451 string ec_number = qual.
GetVal();
457 "EC_number " + ec_number +
" was deleted");
462 "EC_number " + ec_number +
" was replaced");
467 if (pos == string::npos || !
isdigit(ec_number.c_str()[pos + 1])) {
469 ec_number +
" is not a legal value for qualifier EC_number");
472 ec_number +
" is not a legal preliminary value for qualifier EC_number");
499 "/pseudogene value should not be '" + qual.
GetVal() +
"'",
m_Feat);
502 bool has_space =
false;
503 bool has_char_after_space =
false;
505 if (
isspace((
unsigned char)(*it))) {
507 }
else if (has_space) {
509 has_char_after_space =
true;
513 if (has_char_after_space) {
515 "Number qualifiers should not contain spaces");
520 "feature qualifier " + qual.
GetVal() +
" has SGML");
533 "Unable to find EC number file 'ecnum_ambiguous.txt' in data directory");
537 "Unable to find EC number file 'ecnum_deleted.txt' in data directory");
541 "Unable to find EC number file 'ecnum_replaced.txt' in data directory");
545 "Unable to find EC number file 'ecnum_specific.txt' in data directory");
554 for (
auto it : errors) {
556 it.first, it.second);
583 "Feature comment may refer to reference by serial number - "
584 "attach reference specific comments to the reference "
585 "REMARK instead.",
m_Feat);
589 "feature comment " + comment +
" has SGML",
617 "On partial Bioseq, SeqFeat.partial should be TRUE");
620 else if (is_partial &&
629 "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
630 "should also be partial");
638 "Gene of 'order' with otherwise complete location should "
639 "have partial flag set");
645 bool is_far_fail =
false;
653 string str(
"Inconsistent: Product= complete, Location= ");
655 str +=
"Feature.partial= ";
656 str += is_partial ?
"TRUE" :
"FALSE";
659 }
else if (is_far_fail) {
667 string str(
"Inconsistent: ");
674 str +=
"Feature.partial= ";
675 str += is_partial ?
"TRUE" :
"FALSE";
688 "5' or 3' partial location should not have unclassified"
689 " partial in product molinfo descriptor");
704 "Bond location should only be on bond features");
709 string prefix =
"Feature";
728 if ((*it)->IsGi() || (*it)->IsGibbsq() || (*it)->IsGibbmt()) {
732 (*it)->WriteAsFasta(os2);
736 "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
744 "Feature on protein indicates negative strand");
751 vector<TSeqPos> gap_starts;
757 "Feature contains more than 50% Ns");
759 for (
auto gap_start : gap_starts) {
766 "Feature inside sequence gap");
771 "Internal interval begins or ends in gap");
775 "Feature crosses gap of unknown length");
780 string(
"Exception while checking for intervals in gaps. EXCEPTION: ") +
782 }
catch (
const std::exception&) {
827 while (map_iter && pos <= stop) {
830 for (; pos < map_end && pos <= stop; pos++) {
889 if ( (*it)->IsLoc() ) {
910 int num_unknown_gap = 0;
911 bool first_in_gap =
false, last_in_gap =
false;
912 bool local_first_gap =
false, local_last_gap =
false;
913 bool startsOrEndsInGap =
false;
916 for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
923 if (id_it->Equals(loc_it.GetSeq_id())) {
938 local_first_gap =
false;
939 local_last_gap =
false;
944 string::iterator it = vec_data.begin();
945 while (it != vec_data.end() && pos <
len) {
947 bool unknown_length =
false;
953 unknown_length =
true;
960 unknown_length =
true;
966 local_first_gap =
true;
967 }
else if (pos ==
len - 1) {
968 local_last_gap =
true;
970 if (unknown_length) {
975 }
else if (*it ==
'N') {
992 first_in_gap = local_first_gap;
995 last_in_gap = local_last_gap;
996 if (local_first_gap || local_last_gap) {
997 startsOrEndsInGap =
true;
1001 if (num_real == 0 && num_n == 0) {
1011 if (num_gap == 0 && num_unknown_gap == 0 && num_n == 0) {
1013 }
else if (first_in_gap || last_in_gap) {
1018 gap_starts.push_back(gap_start);
1022 }
else if (num_real == 0 && num_gap == 0 && num_unknown_gap == 0 && num_n >= 50) {
1024 }
else if (startsOrEndsInGap) {
1026 }
else if (num_unknown_gap > 0) {
1049 unsigned int len = 0;
1050 if ((*it)->IsLiteral()) {
1051 len = (*it)->GetLiteral().GetLength();
1052 }
else if ((*it)->IsLoc()) {
1055 if ((
unsigned int)pos >=
offset && (
unsigned int)pos <
offset +
len) {
1073 for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
1080 if (id_it->Equals(loc_it.GetSeq_id())) {
1096 string::iterator it = vec_data.begin();
1097 while (it != vec_data.end()) {
1105 if ((
unsigned)(*it + 1) <= 256 &&
isalpha(*it)) {
1113 }
catch (
const std::exception& ) {
1118 return (num_n > real_bases);
1129 const CSeq_id* protid =
nullptr;
1154 if (!prot_handle && look_far) {
1167 bool look_far =
false;
1188 "Exception text is present, but exception flag is not set");
1192 "Exception flag is set, but exception text is empty");
1202 if (
text.empty())
return;
1209 bool reasons_in_cit =
false;
1210 bool annotated_by_transcript_or_proteomic =
false;
1211 bool redundant_with_comment =
false;
1212 bool refseq_except =
false;
1213 vector<string> exceptions;
1215 ITERATE(vector<string>, it, exceptions) {
1225 reasons_in_cit =
true;
1227 annotated_by_transcript_or_proteomic =
true;
1232 bool check_refseq =
false;
1234 check_refseq =
true;
1236 check_refseq =
true;
1239 if ((*id_it)->IsOther()) {
1240 check_refseq =
true;
1249 refseq_except =
true;
1262 str +
" is not a legal exception explanation");
1269 redundant_with_comment =
true;
1271 redundant_with_comment =
true;
1275 if (redundant_with_comment) {
1277 "Exception explanation text is also found in feature comment");
1279 if (refseq_except) {
1282 if (!found_just_the_exception) {
1284 "Genome processing exception should not be combined with other explanations");
1290 "Reasons given in citation exception does not have the required citation");
1292 if (annotated_by_transcript_or_proteomic) {
1293 bool has_inference =
false;
1296 has_inference =
true;
1300 if (!has_inference) {
1302 "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
1326 bool is_imp =
false;
1350 const string& qual_str = gbq->GetQual();
1356 auto gbqual = gbqual_and_value.first;
1361 qual_str +
" is improperly capitalized");
1372 "Unknown qualifier " + qual_str);
1387 if (
NStr::Equal(qual_str,
"orig_transcript_id")) {
1391 if (
NStr::Equal(qual_str,
"orig_transcript_id")) {
1401 "Wrong qualifier " + qual_str +
" for feature " +
1407 "feat_class qualifier is only legal for RefSeq");
1412 const string&
val = gbq->GetVal();
1418 "Compound '" +
val +
"' must be split into separate instances of qualifier " + qual_str);
1422 val +
" is not a legal value for qualifier " + qual_str);
1452 val +
" is not a legal value for qualifier " + qual_str);
1464 "Vector Contamination region should be trimmed from sequence");
1473 "A product qualifier is not used on a gene feature");
1481 "locus-tag values should be on genes");
1494 bool multiple_rpt_unit =
false;
1498 }
else if ( *it ==
'(' || *it ==
')' ||
1499 *it ==
',' || *it ==
'.' ||
1500 isdigit((
unsigned char)(*it)) ) {
1501 multiple_rpt_unit =
true;
1511 !multiple_rpt_unit ) {
1513 bool just_nuc_letters =
true;
1514 static const string nuc_letters =
"ACGTNacgtn";
1516 if ( nuc_letters.find(*it) ==
NPOS ) {
1517 just_nuc_letters =
false;
1522 if ( just_nuc_letters ) {
1524 if ( !vec.
empty() ) {
1529 "repeat_region /rpt_unit and underlying "
1530 "sequence do not match");
1536 "Length of rpt_unit_seq is greater than feature length");
1548 const char *cp =
val.c_str();
1549 bool badchars =
false;
1550 while (*cp != 0 && !badchars) {
1553 }
else if (*cp !=
'(' && *cp !=
')'
1555 && *cp !=
',' && *cp !=
';') {
1562 "/rpt_unit_seq has illegal characters");
1570 if (
str.length() > 25) {
1574 if (pos == string::npos) {
1578 int tmp_from, tmp_to;
1586 }
catch (
const std::exception& ) {
1589 if (tmp_from < 0 || tmp_to < 0) {
1601 "/rpt_unit_range is not a base range");
1604 if (from - 1 <
range.GetFrom() || from - 1>
range.GetTo() || to - 1 <
range.GetFrom() || to - 1 >
range.GetTo()) {
1606 "/rpt_unit_range is not within sequence length");
1608 bool nulls_between =
false;
1611 nulls_between =
true;
1614 if (nulls_between) {
1615 bool in_range =
false;
1617 range = it.GetEmbeddingSeq_loc().GetTotalRange();
1618 if (from - 1 <
range.GetFrom() || from - 1>
range.GetTo() || to - 1 <
range.GetFrom() || to - 1 >
range.GetTo()) {
1625 "/rpt_unit_range is not within ordered intervals");
1635 bool only_digits =
true,
1639 if (
isspace((
unsigned char)(*it)) ) {
1642 if ( !
isdigit((
unsigned char)(*it)) ) {
1643 only_digits =
false;
1646 if (only_digits || has_spaces) {
1658 val +
" accession missing version for qualifier compare");
1661 val +
" accession has bad version for qualifier compare");
1664 val +
" is not a legal accession for qualifier compare");
1667 "RefSeq accession " +
val +
" cannot be used for qualifier compare");
1675 const char *src =
str.c_str();
1676 const char *find = consist.c_str();
1679 while (*src != 0 && rval) {
1680 if (strchr (find, *src) ==
NULL) {
1696 val +
" is not a legal value for qualifier " + qual_str
1697 +
" - should only be composed of acgt unambiguous nucleotide bases");
1701 val +
" is not a legal value for qualifier " + qual_str
1702 +
" - should only be composed of acgtmrwsykvhdbn nucleotide bases");
1707 val +
" is not a legal value for qualifier " + qual_str
1708 +
" - should only be composed of acdefghiklmnpqrstuvwy* amino acids");
1713 bool has_fuzz =
false;
1715 if (it.IsPoint() && (it.GetFuzzFrom() || it.GetFuzzTo())) {
1726 "/replace already matches underlying sequence (" +
val +
")");
1729 }
catch (
const std::exception& ) {
1740 field_name +
" contains undesired character");
1744 field_name +
" ends with undesired character");
1749 field_name +
" ends with hyphen");
1770 "feature has exception but passes splice site test");
1791 "Bad sequence at splice donor after exon ending at position "
1795 "Splice donor consensus (GT) not found after exon ending at position "
1806 "Bad sequence at splice acceptor before exon starting at position "
1810 "Splice acceptor consensus (AG) not found before exon starting at position "
1821 for (
auto it = donor_problems.begin(); it != donor_problems.end(); it++) {
1825 for (
auto it = acceptor_problems.begin(); it != acceptor_problems.end(); it++) {
1835 if ((*it)->IsOther() && (*it)->GetOther().IsSetAccession()
1876 if ((*it)->IsOther()) {
1908 " for feature " +
key);
1919 if (strand1 == strand2) {
1947 "Gene cross-reference is not on expected strand");
1955 bool equivalent =
false;
1995 bool has_gene_id_xref =
false;
1998 if ((*xref)->IsSetId() && (*xref)->GetId().IsLocal()) {
2001 if (gene_feats.size() > 0) {
2002 has_gene_id_xref =
true;
2010 if (has_gene_id_xref) {
2022 size_t num_genes = 0;
2024 size_t num_trans_spliced = 0;
2025 bool equivalent =
false;
2037 size_t num_match_by_locus = 0;
2038 size_t num_match_by_locus_tag = 0;
2040 for ( ; gene_it; ++gene_it) {
2044 num_match_by_locus++;
2050 num_match_by_locus_tag++;
2056 "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
2063 if (
len <
max || num_genes == 0) {
2066 num_trans_spliced = 0;
2069 num_trans_spliced++;
2072 prev_gene = gene_it;
2078 num_trans_spliced++;
2087 if (num_genes > 1 &&
2092 }
else if (equivalent) {
2094 "Feature overlapped by "
2096 +
" identical-length equivalent genes but has no cross-reference");
2099 "Feature overlapped by "
2101 +
" identical-length genes but has no cross-reference");
2103 }
else if (num_genes == 1
2109 const CGb_qual& qual = **qual_iter;
2115 "Redundant allele qualifier (" + allele +
2116 ") on gene and feature");
2119 "Mismatched allele qualifier on gene (" + allele +
2120 ") and feature (" + qual.
GetVal() +
")");
2131 const string& allele = gene_xref->
GetAllele();
2134 const CGb_qual& qual = **qual_iter;
2140 "Redundant allele qualifier (" + allele +
2141 ") on gene and feature");
2144 "Mismatched allele qualifier on gene (" + allele +
2145 ") and feature (" + qual.
GetVal() +
")");
2151 if (num_match_by_locus == 0 && num_match_by_locus_tag == 0) {
2163 const CSeq_id*
id = loc.GetId();
2181 "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
2186 "Feature has gene locus cross-reference but no equivalent gene feature exists");
2205 if (it->IsSetQual() &&
NStr::Equal(it->GetQual(),
"old_locus_tag")
2228 for (
auto it : feat.
GetQual()) {
2268 string gene_old_locus_tag;
2271 if ((*it)->IsSetQual() &&
NStr::Equal ((*it)->GetQual(),
"old_locus_tag")
2273 gene_old_locus_tag = (*it)->GetVal();
2280 "Old locus tag on feature (" + old_locus_tag
2281 +
") does not match that on gene (" + gene_old_locus_tag +
")");
2292 "old_locus_tag without inherited locus_tag");
2307 if ( imp_loc.find(
"one-of") != string::npos ) {
2309 "ImpFeat loc " + imp_loc +
2310 " has obsolete 'one-of' text for feature " +
key);
2315 if ( imp_loc != temp_loc ) {
2317 "ImpFeat loc " + imp_loc +
" does not equal feature location " +
2318 temp_loc +
" for feature " +
key);
2351 if ((*it)->IsOther()) {
2374 " for feature " +
key);
2418 if ((*it)->IsOther() && (*it)->GetTextseq_Id()->IsSetAccession()
2437 bool has_sfp_pseudo =
false;
2438 bool has_gene_pseudo =
false;
2441 if (it->IsSetQual() &&
2444 sfp_pseudo = it->GetVal();
2445 has_sfp_pseudo =
true;
2450 for (
auto it : gene->
GetQual()) {
2451 if (it->IsSetQual() &&
2454 gene_pseudo = it->GetVal();
2455 has_gene_pseudo =
true;
2460 if (!has_sfp_pseudo && !has_gene_pseudo) {
2462 }
else if (!has_sfp_pseudo) {
2464 }
else if (has_sfp_pseudo && !has_gene_pseudo) {
2466 msg +=
" has pseudogene qualifier, gene does not";
2470 string msg =
"Different pseudogene values on ";
2472 msg +=
" (" + sfp_pseudo +
") and gene (" + gene_pseudo +
")";
2523 "Gene locus_tag does not match general ID of product");
2534 for (
char ch : src) {
2535 unsigned char chu = ch;
2536 if (chu > 31 && chu < 128) {
2552 const string&
str = *it;
2554 const char& ch = *c_it;
2555 unsigned char chu = ch;
2556 if (ch > 127 || (ch < 32 && ch !=
'\t' && ch !=
'\r' && ch !=
'\n')) {
2573 for (
auto it :
prot.GetName()) {
2574 if (
prot.IsSetEc() && !
prot.IsSetProcessed()
2580 "Unknown or hypothetical protein should not have EC number");
2587 "protein description " +
prot.GetDesc() +
" has SGML");
2593 "Comment has same value as protein description");
2598 "Apparent EC number in protein comment");
2605 if (
prot.IsSetName() &&
prot.GetName().size() > 0) {
2608 "Apparent EC number in protein title");
2613 if (
prot.CanGetDb () ) {
2616 if ( (!
prot.IsSetName() ||
prot.GetName().empty()) &&
2617 (!
prot.IsSetProcessed()
2622 "Protein feature has description but no name");
2623 }
else if (
prot.IsSetActivity() && !
prot.GetActivity().empty()) {
2625 "Protein feature has function but no name");
2626 }
else if (
prot.IsSetEc() && !
prot.GetEc().empty()) {
2628 "Protein feature has EC number but no name");
2631 "Protein feature has no name");
2646 if (
prot.IsSetProcessed() ) {
2647 processed =
prot.GetProcessed();
2653 if (
prot.IsSetName() &&
2654 !
prot.GetName().empty() &&
2655 !
prot.GetName().front().empty() ) {
2658 if (
prot.CanGetDesc() && !
prot.GetDesc().empty() ) {
2661 if (
prot.CanGetEc() && !
prot.GetEc().empty() ) {
2664 if (
prot.CanGetActivity() && !
prot.GetActivity().empty() ) {
2667 if (
prot.CanGetDb() && !
prot.GetDb().empty() ) {
2673 "There is a protein feature where all fields are empty");
2682 "'hypothetical protein",
2685 "alternatively spliced",
2686 "bacteriophage hypothetical protein",
2689 "cnserved hypothetical protein",
2690 "conesrved hypothetical protein",
2691 "conserevd hypothetical protein",
2692 "conserved archaeal protein",
2693 "conserved domain protein",
2694 "conserved hypohetical protein",
2695 "conserved hypotehtical protein",
2696 "conserved hypotheical protein",
2697 "conserved hypothertical protein",
2698 "conserved hypothetcial protein",
2699 "conserved hypothetical",
2700 "conserved hypothetical exported protein",
2701 "conserved hypothetical integral membrane protein",
2702 "conserved hypothetical membrane protein",
2703 "conserved hypothetical phage protein",
2704 "conserved hypothetical prophage protein",
2705 "conserved hypothetical protein",
2706 "conserved hypothetical protein - phage associated",
2707 "conserved hypothetical protein fragment 3",
2708 "conserved hypothetical protein, fragment",
2709 "conserved hypothetical protein, putative",
2710 "conserved hypothetical protein, truncated",
2711 "conserved hypothetical protein, truncation",
2712 "conserved hypothetical protein.",
2713 "conserved hypothetical protein; possible membrane protein",
2714 "conserved hypothetical protein; putative membrane protein",
2715 "conserved hypothetical proteins",
2716 "conserved hypothetical protien",
2717 "conserved hypothetical transmembrane protein",
2718 "conserved hypotheticcal protein",
2719 "conserved hypthetical protein",
2720 "conserved in bacteria",
2721 "conserved membrane protein",
2722 "conserved protein",
2723 "conserved protein of unknown function",
2724 "conserved protein of unknown function ; putative membrane protein",
2725 "conserved unknown protein",
2726 "conservedhypothetical protein",
2727 "conserverd hypothetical protein",
2728 "conservered hypothetical protein",
2729 "consrved hypothetical protein",
2730 "converved hypothetical protein",
2734 "duplicated hypothetical protein",
2739 "homeodomain protein",
2741 "hyopthetical protein",
2743 "hypotheical protein",
2744 "hypothertical protein",
2745 "hypothetcical protein",
2747 "hypothetical protein",
2748 "hypothetical conserved protein",
2749 "hypothetical exported protein",
2750 "hypothetical novel protein",
2752 "hypothetical phage protein",
2753 "hypothetical prophage protein",
2754 "hypothetical protein (fragment)",
2755 "hypothetical protein (multi-domain)",
2756 "hypothetical protein (phage associated)",
2757 "hypothetical protein - phage associated",
2758 "hypothetical protein fragment",
2759 "hypothetical protein fragment 1",
2760 "hypothetical protein predicted by genemark",
2761 "hypothetical protein predicted by glimmer",
2762 "hypothetical protein predicted by glimmer/critica",
2763 "hypothetical protein, conserved",
2764 "hypothetical protein, phage associated",
2765 "hypothetical protein, truncated",
2766 "hypothetical protein-putative conserved hypothetical protein",
2767 "hypothetical protein.",
2768 "hypothetical proteins",
2769 "hypothetical protien",
2770 "hypothetical transmembrane protein",
2771 "hypothetoical protein",
2772 "hypothteical protein",
2773 "identified by sequence similarity; putative; orf located~using blastx/framed",
2774 "identified by sequence similarity; putative; orf located~using blastx/glimmer/genemark",
2776 "membrane protein, putative",
2778 "narrowly conserved hypothetical protein",
2781 "orf, conserved hypothetical protein",
2782 "orf, hypothetical",
2783 "orf, hypothetical protein",
2784 "orf, hypothetical, fragment",
2785 "orf, partial conserved hypothetical protein",
2786 "orf; hypothetical protein",
2787 "orf; unknown function",
2789 "partial cds, hypothetical",
2790 "partially conserved hypothetical protein",
2791 "phage hypothetical protein",
2792 "phage-related conserved hypothetical protein",
2793 "phage-related protein",
2795 "possible hypothetical protein",
2797 "predicted coding region",
2798 "predicted protein",
2799 "predicted protein (pseudogene)",
2800 "predicted protein family",
2801 "product uncharacterised protein family",
2803 "protein of unknown function",
2806 "putative conserved protein",
2807 "putative exported protein",
2808 "putative hypothetical protein",
2809 "putative membrane protein",
2810 "putative orf; unknown function",
2811 "putative phage protein",
2814 "repeats containing protein",
2816 "ribosomal protein",
2819 "small hypothetical protein",
2820 "transmembrane protein",
2823 "trp-repeat protein",
2824 "truncated conserved hypothetical protein",
2825 "truncated hypothetical protein",
2826 "uncharacterized conserved membrane protein",
2827 "uncharacterized conserved protein",
2828 "uncharacterized conserved secreted protein",
2829 "uncharacterized protein",
2830 "uncharacterized protein conserved in archaea",
2831 "uncharacterized protein conserved in bacteria",
2832 "unique hypothetical",
2833 "unique hypothetical protein",
2839 "unknown, conserved protein",
2840 "unknown, hypothetical",
2841 "unknown-related protein",
2842 "unknown; predicted coding region",
2844 "unnamed protein product",
2845 "very hypothetical protein"
2857 if (!
prot.IsSetName()) {
2858 if (!
prot.IsSetProcessed() ||
2862 "Protein name is not set");
2869 if (search.empty()) {
2871 "Protein name is empty");
2872 }
else if (sc_BadProtName.find (search.c_str()) != sc_BadProtName.end()
2880 "Uninformative protein name '" + it +
"'");
2896 (it) +
" is not in proper EC_number format");
2898 const string& ec_number = it;
2904 "EC_number " + ec_number +
" was deleted");
2909 "EC_number " + ec_number +
" was transferred and is no longer valid");
2914 if (pos == string::npos || !
isdigit (ec_number.c_str()[pos + 1])) {
2916 ec_number +
" is not a legal value for qualifier EC_number");
2919 ec_number +
" is not a legal preliminary value for qualifier EC_number");
2935 bool report_name =
true;
2937 if (pos == string::npos) {
2939 }
else if (prot_name.length() - pos < 5) {
2942 report_name =
false;
2947 "Protein name ends with bracket and may contain organism name");
2953 if (id_it->IsOther()
2954 && id_it->GetOther().IsSetAccession()
2956 prot_name.substr(21))) {
2958 "Hypothetical protein reference does not match accession");
2967 "Comment has same value as protein name");
2972 "Protein name has internal PMID");
2978 &&
NStr::FindCase(prot_name,
"methyltransferase") == string::npos
2980 if (
NStr::EqualNocase(prot_name,
"ribulose-1,5-bisphosphate carboxylase/oxygenase")) {
2982 }
else if (!
NStr::EqualNocase(prot_name,
"ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
2983 && !
NStr::EqualNocase(prot_name,
"ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")) {
2985 "Nonstandard ribulose bisphosphate protein name");
2995 "protein name " + prot_name +
" has SGML");
3020 const CSeq_loc& prot_loc =
prot->GetLocation();
3024 bool conflict =
false;
3039 "Molinfo completeness and protein feature partials conflict");
3050 if (
rna.IsSetType()) {
3051 rna_type =
rna.GetType();
3055 if (
rna.CanGetExt() &&
rna.GetExt().IsName()) {
3056 const string& rna_name =
rna.GetExt().GetName();
3060 "rRNA name " + rna_name +
" has SGML");
3069 bool pseudo = feat_pseudo;
3091 rna_typename +
" has no name");
3099 "RNA type 0 (unknown) not supported");
3119 "A pseudo RNA should not have a product");
3120 }
else if (pseudo) {
3122 "An RNA overlapped by a pseudogene should not have a product");
3169 "Type of RNA does not match MolInfo of product Bioseq");
3203 "tRNA data structure on non-tRNA feature");
3210 if ( anticodon_len != 3 ) {
3212 "Anticodon is not 3 bases in length");
3220 "Anticodon location not in tRNA");
3241 "Unparsed anticodon qualifier in tRNA");
3247 "Unparsed product qualifier in tRNA");
3254 if (
rna.IsSetExt() &&
3257 "Unparsed product qualifier in tRNA");
3260 "Missing encoded amino acid qualifier in tRNA");
3265 bool isLessThan100 =
false;
3267 CSeq_loc_CI li(loc);
3269 TSeqPos last_start = li.GetRange().GetFrom();
3270 TSeqPos last_stop = li.GetRange().GetTo();
3272 last_id->
Assign(li.GetSeq_id());
3276 TSeqPos this_start = li.GetRange().GetFrom();
3277 TSeqPos this_stop = li.GetRange().GetTo();
3278 if (
abs ((
int)this_start - (
int)last_stop) < 100 ||
abs ((
int)this_stop - (
int)last_start) < 100) {
3279 if (li.GetSeq_id().Equals(*last_id)) {
3281 isLessThan100 =
true;
3287 for (
auto id_it : last_bsh.
GetId()) {
3288 if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
3289 isLessThan100 =
true;
3296 last_start = this_start;
3297 last_stop = this_stop;
3298 last_id->
Assign(li.GetSeq_id());
3304 if ( grp ==
NULL ) {
3313 if ( !pseudo && grp !=
NULL ) {
3317 if (isLessThan100 && ! pseudo) {
3323 if (
source.IsSetLineage()) {
3324 string lineage =
source.GetLineage();
3327 "tRNA intron in bacteria is less than 100 bp");
3338 bool ordered =
true;
3339 bool adjacent =
false;
3340 bool unmarked_strand =
false;
3341 bool mixed_strand =
false;
3344 for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
3346 if (curr.GetEmbeddingSeq_loc().IsInt()) {
3348 }
else if (curr.GetEmbeddingSeq_loc().IsPnt()) {
3356 curr.GetEmbeddingSeq_loc().GetLabel(&lbl);
3358 "Anticodon location [" + lbl +
"] out of range");
3361 if (
prev && curr &&
3367 if (prev_range.
GetTo() < curr_range.
GetTo()) {
3374 if (prev_range.
GetTo() > curr_range.
GetTo()) {
3384 if ( curr_range == prev_range && curr_strand == prev_strand ) {
3386 "Duplicate anticodon exons in location");
3388 if ( curr_strand != prev_strand ) {
3390 unmarked_strand =
true;
3392 unmarked_strand =
true;
3394 mixed_strand =
true;
3402 "Adjacent intervals in Anticodon");
3406 ENa_strand ac_strand = anticodon.GetStrand();
3409 "Anticodon strand and tRNA strand do not match.");
3412 "Anticodon strand and tRNA strand do not match.");
3416 bool trans_splice =
false;
3419 trans_splice =
true;
3422 if (!trans_splice) {
3424 anticodon.GetLabel(&loc_lbl);
3427 "Mixed strands in Anticodon [" + loc_lbl +
"]");
3429 if (unmarked_strand) {
3431 "Mixed plus and unknown strands in Anticodon [" + loc_lbl +
"]");
3435 "Intervals out of order in Anticodon [" + loc_lbl +
"]");
3441 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
3442 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
3443 84, 85, 86, 87, 88, 89, 90 };
3446 "---",
"Ala",
"Asx",
"Cys",
"Asp",
"Glu",
"Phe",
"Gly",
"His",
"Ile",
3447 "Lys",
"Leu",
"Met",
"Asn",
"Pro",
"Gln",
"Arg",
"Ser",
"Thr",
3448 "Val",
"Trp",
"OTHER",
"Tyr",
"Glx",
"Sec",
"TERM",
"Pyl",
"Xle"
3462 }
catch (
const std::exception& ) {
3471 const list<CRef<CGenetic_code> >& codes = code_table.
Get();
3473 for ( list<
CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end(); code_it != code_it_end; ++code_it ) {
3474 if ((*code_it)->GetId() == gcode) {
3475 return (*code_it)->GetName();
3496 unsigned char aa = 0, orig_aa;
3497 vector<char> seqData;
3540 bool mustbemethionine =
false;
3545 mustbemethionine =
true;
3549 if (mustbemethionine) {
3553 "Initiation tRNA claims to be tRNA-" + aanm +
3554 ", but should be tRNA-Met");
3570 if ( ncbieaa.length() != 64 ) {
3578 string aaname =
buf;
3584 bool modified_codon_recognition =
false;
3585 bool rna_editing =
false;
3589 modified_codon_recognition =
true;
3596 vector<string> recognized_codon_values;
3597 vector<unsigned char> recognized_taa_values;
3600 if (*iter == 255)
continue;
3605 " is greater than maximum 63");
3607 }
else if (*iter < 0) {
3614 if ( !modified_codon_recognition && !rna_editing ) {
3615 unsigned char taa = ncbieaa[*iter];
3617 recognized_codon_values.push_back (codon);
3618 recognized_taa_values.push_back (
taa);
3621 if ( (aa ==
'U') && (
taa ==
'*') && (*iter == 14) ) {
3629 "Codon recognized by tRNA (" + codon +
") does not match amino acid ("
3630 + aaname +
") specified by genetic code ("
3638 string anticodon =
"?";
3639 vector<string> codon_values;
3640 vector<unsigned char> taa_values;
3648 if (codon.length() > 3) {
3649 codon = codon.substr (0, 3);
3655 char ch = anticodon.c_str()[0];
3673 string::iterator str_it = wobble.begin();
3674 while (str_it != wobble.end()) {
3677 if (index < 64 && index > -1) {
3678 unsigned char taa = ncbieaa[index];
3679 taa_values.push_back(
taa);
3680 codon_values.push_back(codon);
3686 if (anticodon.length() > 3) {
3687 anticodon = anticodon.substr(0, 3);
3690 }
catch (
const std::exception& ) {
3693 if (codon_values.size() > 0) {
3696 for (
size_t i = 0;
i < codon_values.size();
i++) {
3702 if (aa ==
'U' &&
NStr::Equal (anticodon,
"UCA")) {
3704 }
else if (aa ==
'O' &&
NStr::Equal (anticodon,
"CUA")) {
3706 }
else if (aa ==
'I' &&
NStr::Equal (anticodon,
"CAU")) {
3712 "Codons predicted from anticodon (" + anticodon
3713 +
") cannot produce amino acid (" + aaname +
")");
3718 if (recognized_codon_values.size() > 0) {
3720 for (
size_t i = 0;
i < codon_values.size() && !
ok;
i++) {
3721 for (
size_t j = 0; j < recognized_codon_values.size() && !
ok; j++) {
3722 if (
NStr::Equal (codon_values[
i], recognized_codon_values[j])) {
3724 }
else if (
NStr::Equal (codon_values[
i],
"ATG") && aa ==
'I') {
3734 "Codon recognized cannot be produced from anticodon ("
3742 if (orig_aa == 0 || orig_aa == 255) {
3752 if (idx == 0 || idx >= 28) {
3772 bool found_bad =
false;
3773 for (
auto it : scores) {
3787 "tRNA-rRNA overlap");
3794 "tRNA overlaps CDS");
3801 size_t mismatches = 0;
3815 "Unable to transcribe mRNA");
3821 "Unable to fetch mRNA transcript '" +
label +
"'");
3827 if ((*it)->IsOther()) {
3855 "] less than " + farstr +
"product length [" +
3862 +
"] less than " + farstr +
"product length ["
3868 "] less than " + farstr +
"product length [" +
3874 "greater than " + farstr +
"product length [" +
3881 " bases between the transcript and " + farstr +
"product sequence");
3885 "mRNA has exception but passes transcription test");
3890 "mRNA has unclassified exception but only difference is " +
NStr::SizetToString(mismatches)
3895 "mRNA has transcribed product replaced exception");
3946 "protein_id should not be a gbqual on an mRNA feature");
3950 "transcript_id should not be a gbqual on an mRNA feature");
3956 if (
rna.IsSetExt() &&
rna.GetExt().IsName()) {
3957 const string& rna_name =
rna.GetExt().GetName();
3962 "mRNA feature product indicates it should be a tRNA feature");
3967 "mRNA name " + rna_name +
" has SGML");
3985 "Product Bioseq of mRNA feature is not "
3986 "packaged in the record");
3995 "Identical transcript IDs found on multiple mRNAs");
4016 return locus && allele && desc && locus_tag;
4041 "Gene on mRNA bioseq does not match gene on genomic bioseq",
4062 "Focus must be on BioSource descriptor, not BioSource feature.");
4083 "BioSource descriptor must have focus or transgenic "
4084 "when BioSource feature with different taxname is "
4102 "PolyA_site should be a single point");
4134 "sig/mat/transit_peptide feature cannot be associated with a "
4135 "protein product of a coding region feature");
4138 "Peptide processing feature should be converted to the "
4139 "appropriate protein feature subtype");
4163 "Start and stop of " +
key +
" are out of frame with CDS codons");
4168 "Start and stop of " +
key +
" are out of frame with CDS codons");
4172 "Start of " +
key +
" is out of frame with CDS codons");
4176 "Stop of " +
key +
" is out of frame with CDS codons");
4188 bool pseudo = feat_pseudo;
4205 bool pseudo = feat_pseudo;
4215 "Introns should be at least 10 nt long");
4227 if (partial5 && partial3) {
4238 if (scores.size() > 0) {
4249 if (scores.size() > 0) {
4274 bool donor_in_gap =
false;
4275 bool acceptor_in_gap =
false;
4279 donor_in_gap =
true;
4284 acceptor_in_gap =
true;
4287 if (!partial5 && !partial3) {
4288 if (donor_in_gap && acceptor_in_gap) {
4295 bool donor_good =
false;
4296 bool acceptor_good =
false;
4299 if (!partial5 && !donor_in_gap) {
4302 donor[0] = vec[end5 - 1];
4303 donor[1] = vec[end5];
4309 donor[0] = vec[end5];
4310 donor[1] = vec[end5 + 1];
4317 if (!partial3 && !acceptor_in_gap) {
4320 acceptor[0] = vec[end3];
4321 acceptor[1] = vec[end3 + 1];
4322 acceptor_good =
true;
4327 acceptor[0] = vec[end3 - 1];
4328 acceptor[1] = vec[end3];
4329 acceptor_good =
true;
4335 if (!partial5 && !partial3) {
4336 if (donor_good && acceptor_good) {
4345 if (!donor_in_gap) {
4359 "Splice donor consensus (GT) not found at start of terminal intron, position "
4364 "Splice donor consensus (GT) not found at start of intron, position "
4373 if (!acceptor_in_gap) {
4376 if (acceptor_good) {
4386 "Splice acceptor consensus (AG) not found at end of terminal intron, position "
4391 "Splice acceptor consensus (AG) not found at end of intron, position "
4411 bool is_short =
false;
4424 }
else if (partial_right &&
4445 "A note or other qualifier is required for a misc_feature");
4451 string content_label;
4453 if (
NStr::Equal(content_label,
"cold-shock protein")) {
4455 "cspA misc_feature overlapped by cold-shock protein CDS");
4468 bool is_far_delta =
false;
4472 is_far_delta =
true;
4476 if ( !(*sg) )
continue;
4478 is_far_delta =
false;
4482 if (! is_far_delta) {
4484 "An assembly_gap feature should only be on a contig record");
4508 "Assembly_gap flanked by Ns on 5' and 3' sides");
4511 "Assembly_gap flanked by Ns on 5' side");
4514 "Assembly_gap flanked by Ns on 3' side");
4517 for (
size_t i = 0;
i < sequence.size();
i++) {
4518 if (sequence[
i] !=
'N') {
4535 if ((*it)->IsSetQual() &&
NStr::EqualNocase ((*it)->GetQual(),
"estimated_length")
4539 if (estimated_length != loc_len) {
4546 }
catch (
const std::exception& ) {
4553 if ( !vec.
empty() ) {
4558 unsigned int num_gap = 0;
4560 string::iterator it = vec_data.begin();
4561 while (it != vec_data.end()) {
4569 }
else if (*it !=
'-') {
4575 if (num_real > 0 && num_n > 0) {
4580 }
else if (num_real > 0) {
4584 }
else if (num_n > 0) {
4592 +
" gap characters");
4597 }
catch (
const std::exception& ) {
4610 "NULL feature key");
4654 "Feature key Import is no longer legal");
4659 switch ( subtype ) {