99 using namespace sequence;
120 if (!
m_TSE || m_Imp.ShouldSubdivide()) {
149 }
catch (
const exception& e) {
151 string(
"Exception while validating feature. EXCEPTION: ") +
184 "Coding region on TSA transcribed RNA should not be on the minus strand", feat);
205 "Invalid feature for a protein Bioseq.", feat);
214 "Invalid feature for a nucleotide Bioseq.", feat);
224 "Peptide processing feature should be remapped to the appropriate protein bioseq",
266 size_t pos1 =
NStr::Find (inf_accession,
":");
267 size_t pos2 =
NStr::Find (inf_accession,
"|");
268 size_t pos = string::npos;
275 if (pos == string::npos) {
278 prefix = inf_accession.substr(0, pos);
280 accession = inf_accession.substr(pos + 1);
290 if (
str.length() < 3) {
293 char ch =
str.c_str()[0];
294 if (ch !=
'S' && ch !=
'E' && ch !=
'D')
return false;
296 if (ch !=
'R')
return false;
298 if (ch !=
'A' && ch !=
'P' && ch !=
'X' && ch !=
'R' && ch !=
'S' && ch !=
'Z')
return false;
311 if (!
isdigit(*it) && *it !=
'.') {
328 string prefix, remainder;
330 bool is_insd =
false, is_refseq =
false, is_blast =
false;
339 if (is_insd || is_refseq) {
340 if (remainder.length() > 3) {
341 if (remainder.c_str()[2] ==
'_') {
361 if (dot_pos == string::npos ||
NStr::IsBlank(remainder.substr(dot_pos + 1))) {
364 const string& cps = remainder.substr(dot_pos + 1);
365 const char *cp = cps.c_str();
366 while (*cp != 0 &&
isdigit (*cp)) {
375 if (acc_code == -5 || acc_code == -6) {
377 }
else if (acc_code != 0) {
379 }
else if (fetch_accession) {
386 }
else if (is_blast && is_similar_to) {
388 }
else if (is_similar_to) {
395 if (
NStr::Find (remainder,
" ") != string::npos) {
408 vector<string> accessions;
416 same_species =
false;
420 remainder = remainder.substr(14);
425 remainder = remainder.substr (1);
441 ||
NStr::Equal(prefix,
"similar to RNA sequence, mRNA")
442 ||
NStr::Equal(prefix,
"similar to RNA sequence, EST")
443 ||
NStr::Equal(prefix,
"similar to RNA sequence, other RNA")) {
458 string prefix, remainder;
459 bool same_species =
false;
473 if (same_species && !is_similar_to) {
478 for (
size_t i = 0;
i < accessions.size();
i++) {
489 const char& ch = *str_itr;
494 if (num_spaces > 3) {
496 }
else if (num_spaces > 0){
514 bool found_short =
false;
518 TSeqPos last_start = li.GetRange().GetFrom();
519 TSeqPos last_stop = li.GetRange().GetTo();
521 last_id->
Assign(li.GetSeq_id());
524 while (li && !found_short) {
525 TSeqPos this_start = li.GetRange().GetFrom();
526 TSeqPos this_stop = li.GetRange().GetTo();
527 if (
abs ((
int)this_start - (
int)last_stop) < 11 ||
abs ((
int)this_stop - (
int)last_start) < 11) {
528 if (li.GetSeq_id().Equals(*last_id)) {
535 for (
auto id_it : last_bsh.
GetId()) {
536 if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
544 last_start = this_start;
545 last_stop = this_stop;
546 last_id->
Assign(li.GetSeq_id());
582 bool is_short =
false;
584 if (! m_Imp.IsIndexerVersion()) {
596 }
else if (partial_right &&
633 if ((*it)->IsSetData() && (*it)->GetData().IsGene()
652 if ((*it)->IsSetId()) {
653 if ((*it)->GetId().Equals(
id)) {
655 }
else if ((*it)->GetId().IsLocal()) {
658 if (!far_feats.empty()) {
672 "Cross-referenced feature does not link reciprocally",
689 "CDS not contained within cross-referenced mRNA", feat);
696 "Feature gene xref does not match Feature ID cross-referenced gene feature",
704 "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA ("
705 + label1 +
"," + label2 +
")",
711 "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA ("
712 + label1 +
"," + label2 +
")",
717 "Cross-referenced feature does not link reciprocally",
725 "CDS not contained within cross-referenced mRNA", feat);
730 "Cross-referenced feature does not link reciprocally",
734 "Cross-referenced feature does not have its own cross-reference", feat);
754 for (
auto it = feat.
GetXref().begin(); it != feat.
GetXref().end(); it++) {
762 if (!m_Imp.IsStandaloneAnnot() && !
m_TSE) {
767 "SeqFeatXref with no id or data field", feat);
770 vector<CConstRef<CSeq_feat> > far_feats;
771 if (m_Imp.IsStandaloneAnnot()) {
772 for (
auto it = m_Imp.GetSeqAnnot()->GetData().GetFtable().begin(); it != m_Imp.GetSeqAnnot()->GetData().GetFtable().end(); it++) {
774 far_feats.push_back(*it);
779 for (
auto it = far_handles.begin(); it != far_handles.end(); it++) {
780 far_feats.push_back(it->GetSeq_feat());
783 if (far_feats.empty()) {
785 "Cross-referenced feature cannot be found",
788 for (
auto ff = far_feats.begin(); ff != far_feats.end(); ff++) {
792 if (xref.
GetData().
Which() != (*ff)->GetData().Which()) {
794 "SeqFeatXref contains both id and data, data type conflicts with data on feature with id",
802 "Cross-referenced feature cannot be found",
808 "Gene feature has gene cross-reference",
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
User-defined methods of the data storage class.
@ eErr_SEQ_FEAT_InvalidFeatureForNucleotide
@ eErr_SEQ_FEAT_InvalidFeatureForProtein
@ eErr_SEQ_FEAT_SeqFeatXrefFeatureMissing
@ eErr_SEQ_FEAT_InvalidForType
@ eErr_SEQ_FEAT_UnnecessaryGeneXref
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_CDSonMinusStrandTranscribedRNA
@ eErr_SEQ_FEAT_SeqFeatXrefProblem
@ eErr_SEQ_FEAT_SeqFeatXrefNotReciprocal
@ eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem
static bool IsLegalInferenceDatabase(const string &db)
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
static void GetPrefixAndRemainder(const string &inference, string &prefix, string &remainder)
static bool ProhibitXref(CSeqFeatData::ESubtype subtype1, CSeqFeatData::ESubtype subtype2)
ESubtype GetSubtype(void) const
string GetKey(EVocabulary vocab=eVocabulary_full) const
static bool AllowXref(CSeqFeatData::ESubtype subtype1, CSeqFeatData::ESubtype subtype2)
namespace ncbi::objects::
bool HasSeqFeatXref(const CSeqFeatXref::TId &id) const
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
static bool s_IsPseudo(const CSeq_feat &feat)
static bool s_GeneRefsAreEquivalent(const CGene_ref &g1, const CGene_ref &g2, string &label)
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
vector< CSeq_feat_Handle > TSeq_feat_Handles
void ValidateSeqFeatXref(const CSeq_feat &feat)
~CValidError_feat() override
bool x_HasNonReciprocalXref(const CSeq_feat &feat, const CFeat_id &id, CSeqFeatData::ESubtype subtype)
CBioseq_Handle x_GetCachedBsh(const CSeq_loc &loc)
void ValidateSeqFeatContext(const CSeq_feat &feat, const CBioseq &seq)
static EInferenceValidCode ValidateInferenceAccession(string accession, bool fetch_accession, bool is_similar_to, CScope *scope=nullptr)
CValidError_feat(CValidError_imp &imp)
void ValidateOneFeatXrefPair(const CSeq_feat &feat, const CSeq_feat &far_feat)
bool IsIntronShort(const CSeq_feat &feat)
void SetTSE(CSeq_entry_Handle seh)
bool GetTSACDSOnMinusStrandErrors(const CSeq_feat &feat, const CBioseq &seq)
bool IsOverlappingGenePseudo(const CSeq_feat &feat)
static bool GetPrefixAndAccessionFromInferenceAccession(string inf_accession, string &prefix, string &accession)
bool DoesCDSHaveShortIntrons(const CSeq_feat &feat)
@ eInferenceValidCode_bad_prefix
@ eInferenceValidCode_spaces
@ eInferenceValidCode_bad_accession_version
@ eInferenceValidCode_comment
@ eInferenceValidCode_same_species_misused
@ eInferenceValidCode_empty
@ eInferenceValidCode_valid
@ eInferenceValidCode_bad_body
@ eInferenceValidCode_single_field
@ eInferenceValidCode_bad_accession
@ eInferenceValidCode_bad_accession_type
@ eInferenceValidCode_accession_version_not_public
@ eInferenceValidCode_unrecognized_database
void ValidateSeqFeat(const CSeq_feat &feat)
void x_ValidateSeqFeatExceptXref(const CSeq_feat &feat)
static EInferenceValidCode ValidateInference(string inference, bool fetch_accession, CScope *scope=nullptr)
static vector< string > GetAccessionsFromInferenceString(string inference, string &prefix, string &remainder, bool &same_species)
CBioseq_Handle GetBioseqHandleFromLocation(CScope *scope, const CSeq_loc &loc, const CTSE_Handle &tse)
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static const char * str(char *buf, int n)
Public API for finding the gene(s) on a given feature using the same criteria as the flatfile generator.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
@ eDiag_Error
Error message.
@ eDiag_Warning
Warning message.
@ eDiag_Fatal
Fatal error – guarantees exit (or abort).
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
ENa_strand GetStrand(void) const
Get the location's strand.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TSeqPos GetBioseqLength(void) const
const CTSE_Handle & GetTSE_Handle(void) const
const TId & GetId(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
@ eNocase
Case insensitive compare.
static const char label[]
TId GetId(void) const
Get the variant data.
EProcessed
processing status
TProcessed GetProcessed(void) const
Get the Processed member data.
bool IsSetProcessed(void) const
Check if a value has been assigned to Processed data member.
@ eProcessed_signal_peptide
@ eProcessed_transit_peptide
const TData & GetData(void) const
Get the Data member data.
bool IsSetData(void) const
The specific data. Check if a value has been assigned to the Data data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsProt(void) const
Check if variant Prot is selected.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TId & GetId(void) const
Get the Id member data.
const TLocal & GetLocal(void) const
Get the variant data.
bool IsSetXref(void) const
Cite other relevant features. Check if a value has been assigned to the Xref data member.
const TLocation & GetLocation(void) const
Get the Location member data.
bool IsLocal(void) const
Check if variant Local is selected.
const TId & GetId(void) const
Get the Id member data.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
bool IsSetData(void) const
The specific data. Check if a value has been assigned to the Data data member.
bool IsSetId(void) const
Check if a value has been assigned to Id data member.
bool IsSetPseudo(void) const
Annotated on pseudogene? Check if a value has been assigned to the Pseudo data member.
const TGene & GetGene(void) const
Get the variant data.
bool IsSetId(void) const
The feature copied. Check if a value has been assigned to the Id data member.
const TProt & GetProt(void) const
Get the variant data.
const TXref & GetXref(void) const
Get the Xref member data.
vector< CRef< CSeqFeatXref > > TXref
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to the Location data member.
@ e_Txinit
transcription initiation
@ e_Rsite
restriction site (for maps really)
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
TTech GetTech(void) const
Get the Tech member data.
TBiomol GetBiomol(void) const
Get the Biomol member data.
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
const TMolinfo & GetMolinfo(void) const
Get the variant data.
@ eTech_tsa
transcriptome shotgun assembly
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
@ e_Molinfo
info on the molecule and techniques
constexpr bool empty(list< Ts... >) noexcept
#define FOR_EACH_SEQFEATXREF_ON_SEQFEAT(Itr, Var)
FOR_EACH_SEQFEATXREF_ON_SEQFEAT / EDIT_EACH_SEQFEATXREF_ON_SEQFEAT.
#define FOR_EACH_CHAR_IN_STRING(Itr, Var)
FOR_EACH_CHAR_IN_STRING / EDIT_EACH_CHAR_IN_STRING.
CSingleFeatValidator * FeatValidatorFactory(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
bool s_IsSraPrefix(string str)
static int ValidateAccessionFormat(string accession)
bool GeneXrefConflicts(const CSeq_feat &feat, const CSeq_feat &gene)
bool s_IsAllDigitsOrPeriods(string str)
bool s_HasId(const CSeq_feat &feat, const CSeqFeatXref::TId::TLocal &id)
bool FeaturePairIsTwoTypes(const CSeq_feat &feat1, const CSeq_feat &feat2, CSeqFeatData::ESubtype subtype1, CSeqFeatData::ESubtype subtype2)