55 #define NCBI_USE_ERRCODE_X Objtools_Validator
69 bool Overlaps(
const CSeq_feat& cds)
const;
71 bool HasMatch()
const;
73 bool OkWithoutCds(
bool isGenbank =
false)
const;
151 for (
auto id : bsh.
GetId()) {
169 if ((*it)->IsSetSubtype() && (*it)->IsSetName() && !
NStr::IsBlank((*it)->GetName())) {
177 if (
source.IsSetLineage()) {
178 string lineage =
source.GetLineage();
189 if (
source.IsSetDivision()) {
190 string div =
source.GetDivision();
196 if (
source.IsSetGenome()) {
233 if (appropriate_parent) {
254 }
catch (
const exception& ) {
278 if (appropriate_parent) {
283 }
catch (
const exception& e ) {
285 string(
"Exception while validating bioseq. EXCEPTION: ") +
303 const string& db = dbt.
GetDb();
317 const char& ch = *itr;
318 if (ch ==
'|' || ch ==
',')
return ch;
326 for (
size_t i = 0;
i <
id.length();
i++) {
328 return id.c_str()[
i];
338 const char& ch = *itr;
339 if (ch ==
'|' || ch ==
',')
return ch;
353 "BioseqFind (" +
id.AsFastaString() +
354 ") unable to find itself - possible internal error",
ctx);
366 "BioseqFind (" +
id.AsFastaString() +
367 ") unable to find itself - possible internal error",
ctx);
371 "SeqID " +
id.AsFastaString() +
372 " is present on multiple Bioseqs in record",
ctx);
376 "BioseqFind (" +
id.AsFastaString() +
377 ") unable to find itself - possible internal error",
ctx);
383 switch (
id.Which()) {
389 "TPA record " +
ctx.GetId().front()->AsFastaString() +
390 " should have Seq-hist.assembly for PRIMARY block",
402 "Bad character '" +
string(1, badch) +
"' in accession '" + acc +
"'",
ctx);
409 "Bad accession " + acc,
ctx);
419 const string& name = tsid->
GetName();
421 if (
isspace((
unsigned char)(*s))) {
424 "Seq-id.name '" + name +
"' should be a single "
425 "word without any spaces",
ctx);
436 "Bad character '" +
string(1, badch) +
"' in accession '" + acc +
"'",
ctx);
438 size_t num_letters = 0;
439 size_t num_digits = 0;
440 size_t num_underscores = 0;
441 bool bad_id_chars =
false;
444 bool letter_after_digit =
false;
450 for ( ;
i < acc.length(); ++
i ) {
451 if (
isupper((
unsigned char) acc[
i]) ) {
453 }
else if (
isdigit((
unsigned char) acc[
i]) ) {
455 }
else if ( acc[
i] ==
'_' ) {
457 if ( num_digits > 0 || num_underscores > 1 ) {
458 letter_after_digit =
true;
465 if ( letter_after_digit || bad_id_chars ) {
467 "Bad accession " + acc,
ctx);
468 }
else if ( is_NZ && ( num_letters == 4 || num_letters == 6 ) &&
469 ( num_digits >= 8 && num_digits <= 11 ) && num_underscores == 0 ) {
473 }
else if ( num_letters == 2 &&
474 (num_digits == 6 || num_digits == 8 || num_digits == 9) &&
475 num_underscores == 1 ) {
477 }
else if (num_letters == 4 && num_digits == 10 &&
ctx.IsNa()) {
480 "Bad accession " + acc,
ctx);
495 string msg =
"Missing accession for " +
id.AsFastaString();
504 "Seq-id type not handled",
ctx);
510 "Invalid GI number",
ctx);
514 if (!
id.GetGeneral().IsSetDb() ||
NStr::IsBlank(
id.GetGeneral().GetDb())) {
517 if (
id.GetGeneral().IsSetDb()) {
518 const CDbtag& dbt =
id.GetGeneral();
519 size_t dblen = dbt.
GetDb().length();
531 if (dblen > max_dblen) {
538 if (longer_general) {
559 if (badch ==
'\0' && dbt.
IsSetDb()) {
565 "Bad character '" +
string(1, badch) +
"' in sequence ID '" +
id.AsFastaString() +
"'",
ctx);
580 if (
id.IsLocal() &&
id.GetLocal().IsStr()) {
581 const string& acc =
id.GetLocal().GetStr();
585 "Bad character '" +
string(1, badch) +
"' in local ID '" + acc +
"'",
ctx);
595 if (chain_id.size() == 1 && chain_id[0] == chain) {
597 }
else if (
islower(chain) && chain_id.size() == 2
598 && chain_id[0] == chain_id[1]
599 && chain_id[0] ==
toupper(chain)) {
601 }
else if (chain ==
'|' && chain_id ==
"VB") {
605 "PDB Seq-id contains mismatched \'chain\' and"
606 " \'chain-id\' slots",
ctx);
617 if (!IsNCBIFILESeqId(**
i)) {
619 (*i)->GetLabel(&
label);
620 if (
label.length() > 40) {
622 "Sequence ID is unusually long (" +
635 const list< string > *extra_acc =
nullptr;
637 switch (desc.
Which()) {
668 bool found_good =
false;
676 "The only ids on this Bioseq will be stripped during ID load", seq);
687 "No ids on a Bioseq", seq);
699 bool wgs_tech_needs_wgs_accession =
false;
700 bool is_segset_accession =
false;
701 bool has_wgs_general =
false;
702 bool is_eb_db =
false;
703 bool longer_general =
false;
706 if ((*i)->IsOther() || (*i)->IsEmbl() || (*i)->IsTpe()) {
707 longer_general =
true;
715 if ((*i)->IsGeneral() && (*i)->GetGeneral().IsSetDb()) {
720 has_wgs_general =
true;
722 }
else if ((*i)->IsOther() && (*i)->GetOther().IsSetAccession()) {
723 const string& acc = (*i)->GetOther().GetAccession();
726 wgs_tech_needs_wgs_accession =
true;
730 wgs_tech_needs_wgs_accession =
true;
732 }
else if ((*i)->IsEmbl() && (*i)->GetEmbl().IsSetAccession()) {
734 }
else if ((*i)->IsDdbj() && (*i)->GetDdbj().IsSetAccession()) {
739 CBioseq::TId::const_iterator j;
740 for (j =
i, ++j; j != seq.
GetId().end(); ++j) {
743 os <<
"Conflicting ids on a Bioseq: (";
744 (**i).WriteAsFasta(os);
746 (**j).WriteAsFasta(os);
753 if ( (*i)->IsGenbank() || (*i)->IsEmbl() || (*i)->IsDdbj() ) {
754 wgs_tech_needs_wgs_accession =
true;
757 if ( (*i)->IsGi() ) {
762 is_segset_accession =
true;
766 if (is_lrg && !has_ng) {
768 "LRG sequence needs NG_ accession", seq);
774 unsigned int gi_count = 0;
775 unsigned int accn_count = 0;
776 unsigned int lcl_count = 0;
779 switch ((**k).Which()) {
790 if ((*k)->IsGenbank() || (*k)->IsEmbl() || (*k)->IsDdbj()) {
798 "Accession " + acc +
" has 0 version", seq);
811 "Missing accession for " + tsid->
GetName(), seq);
825 string label = (*k)->AsFastaString();
827 "Missing identifier for " +
label, seq);
843 if ( !mi || !mi->IsSetTech() ||
848 "WGS accession should have Mol-info.tech of wgs", seq);
850 }
else if ( mi && mi->IsSetTech() &&
852 wgs_tech_needs_wgs_accession &&
853 !is_segset_accession &&
862 "Mol-info.tech of wgs should have WGS accession", seq);
867 && (!mi->IsSetBiomol()
871 "genomic RefSeq accession should use genomic or cRNA moltype",
876 if (mi && mi->IsSetBiomol()) {
877 switch (mi->GetBiomol()) {
890 "Molecule type (DNA) does not match biomol (RNA)", seq);
899 if ( gi_count > 0 && accn_count == 0 && !
m_Imp.
IsPDB() &&
902 "No accession on sequence with gi number", seq);
904 if (gi_count > 0 && accn_count > 1) {
906 "Multiple accessions on sequence with gi number", seq);
949 CEMBL_block::TKeywords::const_iterator keyword = embl_i->
GetEmbl().
GetKeywords().begin();
964 (
const string &primary_acc,
970 const list< string > *extra_acc =
nullptr;
985 primary_acc +
" used for both primary and"
986 " secondary accession", seq);
1011 bool has_barcode_tech =
false;
1015 has_barcode_tech =
true;
1018 bool has_barcode_keyword =
false;
1022 has_barcode_keyword =
true;
1026 if (has_barcode_keyword && !has_barcode_tech) {
1028 "BARCODE keyword without Molinfo.tech barcode",
1032 if (has_barcode_tech && !has_barcode_keyword && di) {
1034 "Molinfo.tech barcode without BARCODE keyword",
1039 "Sequence has both BARCODE and UNVERIFIED keywords",
1066 "Bioseq.mol is type nucleic acid", seq);
1074 "Non-linear topology set on protein", seq);
1080 "Protein not single stranded", seq);
1095 "Circular Bacteria or Archaea should be chromosome, or plasmid, or extrachromosomal", seq);
1108 "Bioseq.mol is type other", seq);
1179 bool is_wgs =
false;
1180 bool is_grc =
false;
1203 sequence::CDeflineGenerator defline_generator;
1204 string title = defline_generator.GenerateDefline(seq, *
m_Scope, sequence::CDeflineGenerator::fIgnoreExisting);
1212 is_wgs =
IsWGS(bsh);
1214 bool is_gb =
false, is_refseq =
false, is_ng =
false;
1217 const CSeq_id& sid = **sid_itr;
1218 switch (sid.
Which()) {
1243 if (is_ng)
return false;
1245 if (! is_wgs && ! is_grc)
return false;
1286 "No CdRegion in nuc-prot set points to this protein",
1292 bool is_complete =
false;
1398 "BioProject entries not present on CON record", seq);
1401 }
catch (
const exception& e ) {
1402 if (
NStr::Find(e.what(),
"Error: Cannot resolve") == string::npos) {
1404 string(
"Exception while validating BioseqContext. EXCEPTION: ") +
1425 "Orphaned stand-alone protein", seq);
1435 if (prot_feats.size() > 1) {
1438 "Protein sequence has multiple unprocessed protein features",
1439 feat->GetOriginalFeature());
1446 "Expected submission citation is missing for this Bioseq", seq);
1456 for (
CFeat_CI feat_ci(bsh, sel); feat_ci; ++feat_ci) {
1458 const CSeq_feat& matpeptide = feat_ci->GetOriginalFeature();
1465 if (matlen != prdlen) {
1467 "Mat_peptide does not match length of instantiated product",
1483 if (m_res != p_res) {
1485 "Mismatch in mat_peptide (" +
string(1, (
char)m_res) +
") and instantiated product (" + \
1525 bool has_cit_sub =
false;
1527 while (p && !has_cit_sub) {
1538 template <
class Iterator,
class Predicate>
1541 while (iter1 != iter1_stop && iter2 != iter2_stop) {
1542 if (!pred(*iter1, *iter2)) {
1548 if (iter1 != iter1_stop || iter2 != iter2_stop) {
1573 if (chs1 == chs2)
return true;
1583 return dbt1->
Compare(*dbt2) == 0;
1644 if (chs1 != chs2)
return false;
1684 printf (
"Orgname not set!\n");
1686 printf (
"Lineage not set!\n");
1702 if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1720 if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1738 if (start1 == stop2 + 1 || start2 == stop1 + 1) {
1760 CCacheImpl::TFeatValue::const_iterator feat = rnas.begin();
1761 if (feat != rnas.end()) {
1763 CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1765 for ( ; feat != rnas.end(); ++feat_prev, ++feat) {
1768 feat->GetLocation(),
m_Scope)) {
1772 const CRNA_ref& tm = feat_prev->GetData().GetRna();
1773 const CRNA_ref& tr = feat->GetData().GetRna();
1777 feat->GetLocation(),
m_Scope)) {
1779 "tRNA contained within tmRNA",
1780 feat->GetOriginalFeature());
1786 }
catch (
const exception& e ) {
1787 if (
NStr::Find(e.what(),
"Error: Cannot resolve") == string::npos) {
1789 string(
"Exception while validating RNA features. EXCEPTION: ") +
1808 CCacheImpl::TFeatValue::const_iterator feat = biosrcs.begin();
1809 if (feat != biosrcs.end()) {
1815 "Source feature is full length, should be descriptor",
1816 feat->GetOriginalFeature());
1821 CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1823 for ( ; feat != biosrcs.end(); ++feat_prev, ++feat) {
1826 "Multiple full-length source features, should only be one if descriptor is transgenic",
1827 feat->GetOriginalFeature());
1831 feat->GetLocation(),
m_Scope)) {
1837 bool are_identical =
true;
1838 if (feat_prev->IsSetComment() && feat->IsSetComment()
1840 are_identical =
false;
1842 const CBioSource& src_prev = feat_prev->GetData().GetBiosrc();
1843 const CBioSource& src = feat->GetData().GetBiosrc();
1846 are_identical =
false;
1853 are_identical =
false;
1858 are_identical =
false;
1863 "Multiple equivalent source features should be combined into one multi-interval feature",
1864 feat->GetOriginalFeature());
1868 }
catch (
const exception& e ) {
1869 if (
NStr::Find(e.what(),
"Error: Cannot resolve") == string::npos) {
1871 string(
"Exception while validating source features. EXCEPTION: ") +
1885 if ((*it)->IsGen() && (*it)->GetGen().IsSetCit()
1886 && !(*it)->GetGen().IsSetCit()
1887 && !(*it)->GetGen().IsSetJournal()
1888 && !(*it)->GetGen().IsSetDate()
1889 && (*it)->GetGen().IsSetSerial_number()) {
1911 CCacheImpl::TFeatValue::const_iterator feat = pubs.begin();
1912 if (feat != pubs.end()) {
1915 "Publication feature is full length, should be descriptor",
1916 feat->GetOriginalFeature());
1919 CCacheImpl::TFeatValue::const_iterator feat_prev = feat;
1921 if( feat_prev != pubs.end()) {
1925 for ( ; feat != pubs.end(); ++feat, ++feat_prev) {
1928 "Publication feature is full length, should be descriptor",
1929 feat->GetOriginalFeature());
1932 bool are_identical =
true;
1933 if (feat_prev->IsSetComment() && feat->IsSetComment()
1935 are_identical =
false;
1941 are_identical =
false;
1945 prev_label.swap(
label);
1950 if (are_identical) {
1952 "Multiple equivalent publication features should be combined into one multi-interval feature",
1953 feat->GetOriginalFeature());
1957 }
catch (
const exception& e ) {
1958 if (
NStr::Find(e.what(),
"Error: Cannot resolve") == string::npos) {
1960 string(
"Exception while validating pub features. EXCEPTION: ") +
1992 const CBioseq& seq,
const vector<CTempString>& labels)
1994 if (labels.size() <= 1) {
2002 static const string kWarningPrefix =
2003 "Multiple equivalent publications annotated on this sequence [";
2004 static const string::size_type kMaxSummaryLen = 100;
2009 TLabelCount label_count;
2011 ITERATE(vector<CTempString>, label_it, labels) {
2012 ++label_count[*label_it];
2016 vector<CTempString> sorted_dup_labels;
2017 ITERATE(TLabelCount, label_count_it, label_count) {
2018 int num_appearances = label_count_it->second;
2020 if( num_appearances > 1 ) {
2021 const CTempString & dup_label = label_count_it->first;
2022 sorted_dup_labels.push_back(dup_label);
2028 string err_msg = kWarningPrefix;
2029 ITERATE(vector<CTempString>, dup_label_it, sorted_dup_labels) {
2032 err_msg.resize(kWarningPrefix.length());
2033 if (summary.
length() > kMaxSummaryLen) {
2034 err_msg += summary.
substr(0, kMaxSummaryLen);
2054 vector<int> serials;
2055 vector<CTempString> published_labels;
2056 vector<CTempString> unpublished_labels;
2071 back_inserter(published_labels));
2073 back_inserter(unpublished_labels));
2077 bool otherpub =
false;
2079 switch ( (*pub_it)->Which() ) {
2081 muid = (*pub_it)->GetMuid();
2084 pmid = (*pub_it)->GetPmid();
2093 bool collision =
false;
2095 if ( muids_seen.
find(muid) != muids_seen.
end() ) {
2102 if ( pmids_seen.
find(pmid) != pmids_seen.
end() ) {
2110 "Multiple publications with identical PubMed ID", *
ctx, *it);
2129 if ( (*id)->IsGi() ) {
2130 gi = (*id)->GetGi();
2142 if ( (*id)->IsGi() ) {
2143 if ( gi == (*id)->GetGi() ) {
2145 "Replaced by gi (" +
2157 if ( (*id)->IsGi() ) {
2158 if ( gi == (*id)->GetGi() ) {
2182 if (
id.Match(**it)) {
2197 switch (seqdata.
Which()) {
2241 if (
prot[
prot.size() - 1] ==
'*' ) {
2251 if (mi && mi->IsSetCompleteness()) {
2258 }
catch (
const std::exception& ) {
2279 mix.
Set().push_back(*it);
2351 if (
prev.IsSetExcept() &&
prev.GetExcept() &&
prev.IsSetExcept_text()) {
2362 for (
auto it : currP.
GetName()) {
2367 for (
auto it : prevP.
GetName()) {
2414 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
2415 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
2430 if (entry.
IsSeq()) {
2452 if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech() && (*it)->GetMolinfo().GetTech() ==
CMolInfo::eTech_wgs) {
2473 if (entry.
IsSeq()) {
2565 if ((*id)->IsPdb()) {
2616 && (*it)->GetSet().IsSetClass()
2625 if ( (*loc)->IsNull() ) {
2629 if ( locs.size() - nulls < parts.size() ) {
2631 "Parts set contains too many Bioseqs", seq);
2633 }
else if ( locs.size() - nulls > parts.size() ) {
2635 "Parts set does not contain enough Bioseqs", seq);
2641 size_t size = locs.size();
2642 CSeg_ext::Tdata::const_iterator loc_it = locs.begin();
2643 CBioseq_set::TSeq_set::const_iterator part_it = parts.begin();
2644 for (
size_t i = 0;
i <
size; ++
i ) {
2646 if ( (*loc_it)->IsNull() ) {
2650 if ( !(*part_it)->IsSeq() ) {
2652 "Parts set component is not Bioseq", seq);
2656 if ( !
IsIdIn(loc_id, (*part_it)->GetSeq()) ) {
2658 "Segmented bioseq seq_ext does not correspond to parts "
2659 "packaging order", seq);
2667 ERR_POST_X(4,
"Seq-loc not for unique sequence");
2670 string err_msg =
"Unknown error:";
2671 err_msg += x1.
what();
2674 }
catch (std::exception &x2) {
2675 string err_msg =
"Unknown error:";
2676 err_msg += x2.what();
2691 if (! inst.
IsSetExt ())
return false;
2695 if (! (*iter)->IsLiteral() )
continue;
2707 bool has_gap =
false;
2710 if ((*iter)->IsLiteral() &&
2711 (!(*iter)->GetLiteral().IsSetSeq_data() || (*iter)->GetLiteral().GetSeq_data().IsGap())) {
2727 string title = sequence::CDeflineGenerator().GenerateDefline(bsh);
2743 "Complete genome in title without complete flag set",
2752 "Circular topology without complete flag set",
ctx, *desc);
2760 "Title contains 'complete genome' but sequence has gaps", seq);
2774 if ( !(*sg) )
continue;
2791 if ( !(*sg) )
continue;
2822 if (!(*sg))
continue;
2854 "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence.", seq);
2863 "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence.", seq);
2872 "Genome submission includes wrong gap type. Gaps for genomes should be Assembly Gaps with linkage evidence.", seq);
2912 bool has_biosample =
false;
2913 bool has_bioproject =
false;
2921 has_biosample =
true;
2925 has_bioproject =
true;
2938 if ( !it->GetLabel().IsStr() ) {
2941 const string&
label = it->GetLabel().GetStr();
2944 const string&
str = it->GetData().GetStr();
2945 auto fst =
str.find_first_of(
"0123456789");
2949 const string&
str = it->GetData().GetStr();
2950 auto lst =
str.find_first_of(
"0123456789");
2954 if ( (fr != 0) && (to != 0) ) {
2955 int df = to - fr + 1;
2968 if (!has_biosample && !has_bioproject) {
2970 "WGS master lacks both BioSample and BioProject",
2972 }
else if (!has_biosample) {
2974 "WGS master lacks BioSample",
2976 }
else if (!has_bioproject) {
2978 "WGS master lacks BioProject",
2981 if (!has_biosample || !has_bioproject) {
2989 bool only_local =
true;
2990 bool is_NCACNTNW =
false;
2991 bool is_patent =
false;
2993 if (!(*id_it)->IsLocal()) {
2995 if ((*id_it)->IsPatent()) {
3003 if (is_NCACNTNW || is_patent) {
3005 }
else if (is_circular) {
3007 }
else if (only_local) {
3030 if (vec[
i] ==
'N') {
3033 if (max_stretch < this_stretch) {
3034 max_stretch = this_stretch;
3039 if (this_stretch >= 10) {
3043 if (vec.
size() > 20 &&
i > vec.
size() - 10) {
3049 if (max_stretch < this_stretch) {
3050 max_stretch = this_stretch;
3055 if (max_stretch < this_stretch) {
3056 max_stretch = this_stretch;
3084 if (max_stretch >= 15) {
3091 "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3096 "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3108 bool at_least_one =
false;
3110 for (
CSeqVector_CI sv_iter(vec); (sv_iter) && rval; ++sv_iter) {
3111 if (*sv_iter !=
'N') {
3114 at_least_one =
true;
3119 return (rval && at_least_one);
3126 switch (seq_data.
Which()) {
3129 vector<char>::const_iterator it = seq_data.
GetNcbi4na().
Get().begin();
3130 unsigned char mask = 0xf0;
3131 unsigned char shift = 4;
3132 for (
size_t n = 0;
n <
len;
n++) {
3133 unsigned char c = ((*it) &
mask) >> shift;
3150 for (
size_t n = 0;
n <
len &&
n < s.length();
n++) {
3165 for (
size_t n = 0;
n <
len;
n++) {
3183 for (
CSeqMap_CI seq_iter(bsh, sel); seq_iter; ++seq_iter) {
3184 switch (seq_iter.GetType()) {
3186 count +=
CountNs(seq_iter.GetData(), seq_iter.GetLength());
3219 bool is_first =
true;
3223 if ( (*iter)->IsLoc() ) {
3226 if ( (*iter)->IsLiteral() ) {
3283 int max_stretch = 0;
3284 auto IsN = [](
char c) {
return c ==
'N'; };
3286 for (
auto begin_it = find_if_not(begin(vec), end(vec), IsN);
3287 begin_it != end(vec);) {
3288 auto distanceToEnd = distance(begin_it, end(vec));
3290 auto interval = (distanceToEnd > threshold) ? threshold : distanceToEnd;
3291 auto end_it = find_if(begin_it,
next(begin_it, interval), IsN);
3292 const auto current_stretch = distance(begin_it, end_it);
3293 if (current_stretch >= threshold) {
3297 if (current_stretch > max_stretch) {
3298 max_stretch = current_stretch;
3300 begin_it = find_if_not(end_it, end(vec), IsN);
3352 bool begin_ambig =
false, end_ambig =
false;
3360 bool is_circular =
false;
3383 "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases",
3388 "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases",
3403 TSeqPos num_ns = 0, max_stretch = 0;
3406 int pct_n = (num_ns * 100) / seq.
GetLength();
3412 if (max_stretch >= 15) {
3418 "Sequence has a stretch of at least 10 Ns within the first 20 bases", seq);
3422 "Sequence has a stretch of at least 10 Ns within the last 20 bases", seq);
3428 int pct_n =
PctNs(bsh);
3444 }
catch ( exception& ) {
3481 vector<TSeqPos> gapPositions;
3487 for (
CSeqMap_CI gap_it(bsh, sel); gap_it; ++gap_it) {
3489 TSeqPos gp_start = gap_it.GetPosition();
3490 TSeqPos gp_end = gap_it.GetEndPosition() - 1;
3492 gapPositions.push_back(gp_start);
3493 gapPositions.push_back(gp_end);
3500 vector<TSeqPos> featPositions;
3502 for (
CFeat_CI feat_it(bsh); feat_it; ++feat_it) {
3513 featPositions.push_back(ft_start);
3514 featPositions.push_back(ft_end);
3521 int remaininig_gaps = (
int) gapPositions.size() / 2;
3522 int remaining_feats = (
int) featPositions.size() / 2;
3524 if (remaininig_gaps < 1 || remaining_feats < 1) {
3531 TSeqPos gap_start = gapPositions[gap_idx];
3533 TSeqPos gap_end = gapPositions[gap_idx];
3537 TSeqPos feat_start = featPositions[feat_idx];
3539 TSeqPos feat_end = featPositions[feat_idx];
3545 while (remaininig_gaps >= 0 && remaining_feats >= 0) {
3546 if (gap_end < feat_start) {
3547 if (remaininig_gaps <= 0) {
3550 gap_start = gapPositions[gap_idx];
3552 gap_end = gapPositions[gap_idx];
3555 }
else if (feat_end < gap_start) {
3556 if (remaining_feats <= 0) {
3559 feat_start = featPositions[feat_idx];
3561 feat_end = featPositions[feat_idx];
3566 if (feat_start != gap_start || feat_end != gap_end) {
3569 if (remaininig_gaps <= 0) {
3572 gap_start = gapPositions[gap_idx];
3574 gap_end = gapPositions[gap_idx];
3577 if (remaining_feats <= 0) {
3580 feat_start = featPositions[feat_idx];
3582 feat_end = featPositions[feat_idx];
3590 }
catch (
const exception& ) {
3606 "Fuzzy length on " + rpr +
" Bioseq", seq);
3613 "Invalid Bioseq length [" +
len +
"]", seq);
3627 "HTGS 2 raw seq has no gaps and no graphs", seq);
3644 "Using a nucleic acid alphabet on a protein sequence",
3656 "Using a protein alphabet on a nucleic acid",
3665 "Sequence alphabet not set",
3670 bool check_alphabet =
false;
3671 unsigned int factor = 1;
3677 check_alphabet =
true;
3697 "Sequence alphabet not set",
3702 if (calc_len % factor) {
3711 if (calc_len > data_len) {
3713 "Bioseq.seq_data too short [" + data_len_str +
3714 "] for given length [" + s_len +
"]", seq);
3716 }
else if (calc_len < data_len) {
3718 "Bioseq.seq_data is larger [" + data_len_str +
3719 "] than given length [" + s_len +
"]", seq);
3722 if (check_alphabet) {
3723 unsigned int trailingX = 0;
3725 bool leading_x =
false, found_lower =
false, cds_5_prime =
false;
3732 for (
CSeqVector_CI sv_iter(*sv), sv_res_iter(sv_res); (sv_iter) && (sv_res_iter); ++sv_iter, ++sv_res_iter ) {
3739 else if (res ==
'*' && bsh.
IsAa()) {
3741 }
else if (res ==
'-' && bsh.
IsAa()) {
3749 if ( ++bad_cnt > 10 ) {
3751 "More than 10 invalid residues. Checking stopped",
3763 string msg =
"Invalid";
3764 if (seq.
IsNa() && strchr (
"EFIJLOPQXZ", res) !=
NULL) {
3765 msg +=
" nucleotide";
3766 }
else if (seq.
IsNa() && res ==
'U') {
3767 msg +=
" nucleotide";
3783 }
else if ( res ==
'-' || sv->IsInGap(pos - 1) ) {
3785 }
else if ( res ==
'*') {
3787 }
else if ( res ==
'X' ) {
3793 string msg =
"Invalid residue [";
3808 if (seq.
IsAa() && (leading_x || trailingX > 0)) {
3818 cds_seq = cds_seq.substr(1);
3820 cds_seq = cds_seq.substr(2);
3827 if (cds_seq.length() >= 3) {
3828 string lastcodon = cds_seq.substr(cds_seq.length() - 3);
3849 "Sequence starts with leading X", seq);
3854 string msg =
"Sequence ends in " +
3856 if ( trailingX > 1 ) {
3864 "Sequence contains lower-case characters", seq);
3867 if (terminations > 0 || dashes > 0) {
3884 string protein_label;
3890 if( ! prots.empty() ) {
3892 prots[0].GetData().GetProt();
3894 protein_label = first_prot.
GetName().front();
3898 }
catch (
const std::exception& ) {
3902 gene_label =
"gene?";
3905 protein_label =
"prot?";
3909 if (gap_at_start && dashes == 1) {
3911 "gap symbol at start of protein sequence (" + gene_label +
" - " + protein_label +
")",
3913 }
else if (gap_at_start) {
3915 "gap symbol at start of protein sequence (" + gene_label +
" - " + protein_label +
")",
3918 "[" +
NStr::SizetToString (dashes - 1) +
"] internal gap symbols in protein sequence (" + gene_label +
" - " + protein_label +
")",
3922 "[" +
NStr::SizetToString (dashes) +
"] internal gap symbols in protein sequence (" + gene_label +
" - " + protein_label +
")",
3927 if (terminations > 0) {
3928 string msg =
"[" +
NStr::SizetToString(terminations) +
"] termination symbols in protein sequence";
3929 msg +=
" (" + gene_label +
" - " + protein_label +
")";
3940 bool is_wgs =
IsWGS(bsh);
3944 bool has_gap_char =
false;
3949 const size_t run_len_cutoff = ( is_wgs ? 20 : 100 );
3950 for (
CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter, ++pos ) {
3960 has_gap_char =
true;
3965 if (run_len >= run_len_cutoff && start_pos > 1)
3978 "Raw nucleotide should not contain gap characters", seq);
3990 string id_test_label;
4007 if (seqlen > loclen) {
4012 }
else if (seqlen < loclen) {
4031 list< CRef<CSeq_loc> >::const_iterator i2 = i1;
4032 for (++i2; i2 != locs.end(); ++i2) {
4040 if ((**i1).IsWhole() && (**i2).IsWhole()) {
4043 "Segmented sequence has multiple references to " +
4048 "Segmented sequence has multiple references to " +
4049 sid +
" that are not SEQLOC_WHOLE", seq);
4060 bool got_partial =
false;
4062 if (!(*sd)->IsMolinfo() || !(*sd)->GetMolinfo().IsSetCompleteness()) {
4066 switch ((*sd)->GetMolinfo().GetCompleteness()) {
4071 "Complete segmented sequence with MolInfo partial", seq);
4077 "No-left inconsistent with segmented SeqLoc",
4085 "No-right inconsistent with segmented SeqLoc",
4093 "No-ends inconsistent with segmented SeqLoc",
4104 "Partial segmented sequence without MolInfo partial", seq);
4135 if ((*it)->IsSwissprot()) {
4147 }
else if (
cmp > 0) {
4153 if (start1 < start2) {
4155 }
else if (start2 < start1) {
4162 if (stop1 < stop2) {
4182 }
else if ((*sg)->IsLoc()) {
4183 const CSeq_id *
id = (*sg)->GetLoc().GetId();
4210 far_loc->
SetInt().SetFrom(start - 2);
4211 far_loc->
SetInt().SetTo(start - 1);
4220 far_loc->
SetInt().SetFrom(stop + 1);
4221 far_loc->
SetInt().SetTo(stop + 2);
4239 "Delta seq component should not be of type whole", seq);
4246 "Delta component is gi|0", seq);
4261 if (seq_len <= stop) {
4262 string id_label =
id->AsFastaString();
4265 +
") greater than length of " + id_label
4270 string id_label =
id->AsFastaString();
4272 "Scaffold points to some but not all of " +
4273 id_label +
", excluded portion contains features", seq);
4277 "Unable to find far delta sequence component", seq);
4280 }
catch (
const std::exception& ) {
4291 "-1 length on seq-loc of delta seq_ext", seq);
4294 if ( loc_str.empty() ) {
4299 "Short length (-1) on seq-loc (" + loc_str +
") of delta seq_ext", seq);
4304 if ( loc_len <= 10 ) {
4307 if ( loc_str.empty() ) {
4313 ") on seq-loc (" + loc_str +
") of delta seq_ext", seq);
4321 if ( loc_str.empty() ) {
4325 "No length for Seq-loc (" + loc_str +
") of delta seq-ext",
4335 }
else if (seg.
IsLoc()) {
4354 "proximity ligation",
4372 bool is_unspec =
false;
4376 int linktype = evidence.
GetType();
4377 if (linktype == 8) {
4427 "No CDelta_ext data for delta Bioseq", seq);
4430 bool any_tech_ok =
false;
4431 bool has_gi =
false;
4436 }
else if ((*id_it)->IsGi()) {
4441 if (!any_tech_ok && seq.
IsNa()
4455 bool last_is_gap =
false;
4456 int prev_gap_linkage = -1;
4458 int gap_linkage = -1;
4460 size_t num_gaps = 0;
4461 size_t num_adjacent_gaps = 0;
4462 bool non_interspersed_gaps =
false;
4464 int num_gap_known_or_spec = 0;
4465 int num_gap_unknown_unspec = 0;
4467 vector<CConstRef<CSeq_loc> > delta_locs;
4473 "NULL pointer in delta seq_ext valnode (segment " +
4477 switch ( (**sg).Which() ) {
4480 const CSeq_loc& loc = (**sg).GetLoc();
4482 delta_locs.push_back (
tmp);
4486 if ( !last_is_gap && !
first) {
4487 non_interspersed_gaps =
true;
4489 last_is_gap =
false;
4490 prev_gap_linkage = -1;
4507 "Seq-lit of length 0 in delta chain", seq);
4512 if ( !last_is_gap && !
first) {
4513 non_interspersed_gaps =
true;
4515 last_is_gap =
false;
4516 prev_gap_linkage = -1;
4519 vector<TSeqPos> badIdx;
4521 const string* ss =
nullptr;
4522 switch (data.
Which()) {
4535 ITERATE (vector<TSeqPos>, ci, badIdx) {
4537 "Invalid residue [" +
4548 ITERATE (vector<TSeqPos>, it, badIdx) {
4550 "Invalid residue [" +
4551 ss->substr(*it, 1) +
"] at position [" +
4560 if (max_ns > -1 && adjacent_ns > max_ns) {
4579 num_gap_unknown_unspec++;
4582 num_gap_known_or_spec++;
4596 "First delta seq component is a gap", seq);
4600 (prev_gap_type == gap_type ||
4601 prev_gap_linkage != gap_linkage ||
4604 ++num_adjacent_gaps;
4613 "Gap of length 0 in delta chain", seq);
4616 "Gap of length 0 with unknown fuzz in delta chain", seq);
4621 "Gap of unknown length should have length 100", seq);
4625 prev_gap_type = gap_type;
4626 prev_gap_linkage = gap_linkage;
4634 "CDelta_seq::Which() is e_not_set", seq);
4638 if (num_gap_unknown_unspec > 0 && num_gap_known_or_spec == 0) {
4639 if (num_gap_unknown_unspec > 1) {
4642 " Seq-gaps have unknown type and unspecified linkage", seq);
4645 "Single Seq-gap has unknown type and unspecified linkage", seq);
4660 if ( non_interspersed_gaps && !has_gi && mi &&
4674 "HTGS delta seq should have gaps between all sequence runs", seq);
4676 if ( num_adjacent_gaps >= 1 ) {
4677 string msg = (num_adjacent_gaps == 1) ?
4678 "There is 1 adjacent gap in delta seq" :
4680 " adjacent gaps in delta seq";
4690 "Last delta seq component is a gap", seq);
4694 if (num_gaps == 0 && mi) {
4699 "HTGS 2 delta seq has no gaps and no graphs", seq);
4704 if (delta_locs.size() > 1) {
4706 vector<CConstRef<CSeq_loc> >::iterator it1 = delta_locs.begin();
4707 vector<CConstRef<CSeq_loc> >::iterator it2 = it1;
4709 while (it2 != delta_locs.end()) {
4712 string seq_label = (*it1)->GetId()->AsFastaString();
4718 +
" on a Bioseq " + seq_label,
4728 "Self-referential delta sequence", seq);
4737 if (delta_i->Empty()) {
4745 if (res ==
'N' && !sv.
IsInGap(pos - 1)) {
4747 "Ambiguous residue N is adjacent to a gap around position " +
NStr::SizetToString (pos + 1),
4752 if (delta_len > 0 && pos + delta_len <
len) {
4753 if (sv.
IsInGap(pos + delta_len - 1)) {
4755 if (res ==
'N' && !sv.
IsInGap(pos + delta_len)) {
4757 "Ambiguous residue N is adjacent to a gap around position " +
NStr::SizetToString(pos + delta_len + 1),
4765 }
catch (
const std::exception& ) {
4774 bool has_gi =
false;
4776 if ((*id_it)->IsGi()) {
4789 int linkevarray[13];
4790 for (
int i = 0;
i < 13;
i++) {
4793 bool is_unspec =
false;
4797 int linktype = evidence.
GetType();
4798 if (linktype == 8) {
4802 if (linktype == 255) {
4803 (linkevarray[11])++;
4805 else if (linktype < 0 || linktype > 10) {
4806 (linkevarray[12])++;
4809 (linkevarray[linktype])++;
4812 if (linkevarray[8] > 0 && linkcount > linkevarray[8]) {
4814 "Seq-gap type has unspecified and additional linkage evidence", seq);
4816 for (
int i = 0;
i < 13;
i++) {
4817 if (linkevarray[
i] > 1) {
4825 "Seq-gap with linkage evidence must have linkage field set to linked", seq);
4836 if (linkevarray[8] > 0 && linkcount == linkevarray[8]) {
4840 "Contamination gaps must have linkage evidence 'unspecified'", seq);
4845 " should not have linkage evidence", seq);
4855 "Seq-gap type == scaffold is missing required linkage evidence", seq);
4859 bool suppress_SEQ_INST_SeqGapProblem =
false;
4864 if ((**it).IsCreate_date())
4868 suppress_SEQ_INST_SeqGapProblem =
true;
4873 if (!suppress_SEQ_INST_SeqGapProblem)
4875 "Seq-gap type == repeat and linkage == linked is missing required linkage evidence", seq);
4880 "Contamination gap-types must be linked and have linkage-evidence of type 'unspecified'", seq);
4897 rpr =
"constructed";
4899 const string err0 =
"Bioseq-ext not allowed on " + rpr +
" Bioseq";
4900 const string err1 =
"Missing or incorrect Bioseq-ext on " + rpr +
" Bioseq";
4901 const string err2 =
"Missing Seq-data on " + rpr +
" Bioseq";
4902 const string err3 =
"Seq-data not allowed on " + rpr +
" Bioseq";