63 auto& node = m_Objs[
"[n] nucleotide Bioseq[s] [is] present"].Info();
66 node.Add(*
context.BioseqObjRef());
73 m_Objs[
"[n] nucleotide Bioseq[s] [is] present"];
84 auto& node = m_Objs[
"[n] protein sequence[s] [is] present"].Info();
87 node.Add(*
context.BioseqObjRef());
94 for (
auto& id_it : seq.
GetId()) {
109 m_Objs[
"[n] protein[s] [has] invalid ID[s]."].Add(*
context.BioseqObjRef()).Fatal();
123 if (protein_id_prefix.
empty()) {
126 string protein_id_prefix_lowercase = protein_id_prefix;
132 CReportNode& canonical_forms_node = m_Objs[
"canonical forms"][protein_id_prefix_lowercase];
133 string canonical_protein_id_prefix;
134 if (canonical_forms_node.
empty()) {
137 canonical_protein_id_prefix = protein_id_prefix;
138 canonical_forms_node[protein_id_prefix];
142 canonical_protein_id_prefix = canonical_forms_node.
GetMap().
begin()->first;
145 m_Objs[
kEmptyStr][
"[n] sequence[s] [has] protein ID prefix [(]" + canonical_protein_id_prefix].Fatal().Add(*
context.BioseqObjRef());
155 if( reports_collected.
GetMap().
size() <= 1 ) {
174 for (
const auto& p: sum.
NRuns) {
175 details += (details.empty() ?
" " :
", ") + to_string(p.first) +
"-" + to_string(p.second);
177 m_Objs[
"[n] sequence[s] [has] runs of 10 or more Ns"][sum.
Label +
" has runs of Ns at the following locations: " + details].Ext().Fatal().Add(*
context.BioseqObjRef());
190 if (!sum.
HasRef && sum.
N * 100. / sum.
Len > 5) {
191 m_Objs[
"[n] sequence[s] [has] more than 5% Ns"].Add(*
context.BioseqObjRef());
// Table of three keywords: "internal", "transcribed" and "spacer".
// The same three words are named in the description of the
// INTERNAL_TRANSCRIBED_SPACER_RRNA discrepancy case declared below.
// NOTE(review): the code that scans this table is not visible in this view;
// presumably it is matched against rRNA product names -- confirm.
199 static const char*
kRRNASpacer[] = {
"internal",
"transcribed",
"spacer" };
202 DISCREPANCY_CASE(INTERNAL_TRANSCRIBED_SPACER_RRNA, FEAT,
eOncaller,
"Look for rRNAs that contain either 'internal', 'transcribed' or 'spacer'")
206 const string rna_name = feat.GetData().GetRna().GetRnaProductName();
209 m_Objs[
"[n] rRNA feature products contain 'internal', 'transcribed', or 'spacer'"].Add(*
context.SeqFeatObjRef(feat));
234 bool str1_has_similarity_word =
false, str2_has_similarity_word =
false;
241 str1_has_similarity_word =
true;
245 str2_has_similarity_word =
true;
248 if (str1_has_similarity_word && str2_has_similarity_word) {
267 if (
NStr::Find(product,
"transposon") != string::npos ||
NStr::Find(product,
"transposase") != string::npos) {
274 return constraint.
Match(product);
// Report message texts for coding regions that overlap another coding region
// with a similar or identical name.
// NOTE(review): the bracketed tokens ([n], [s], [S], [has], [does]) look like
// placeholders expanded by the report machinery -- confirm; the strings must
// be kept byte-for-byte because they are emitted at runtime.

// General message: overlap with a similarly/identically named coding region.
297 static const char*
kOverlap0 =
"[n] coding region[s] overlap[S] another coding region with a similar or identical name.";
// Variant for features that do carry the appropriate note text.
298 static const char*
kOverlap1 =
"[n] coding region[s] overlap[S] another coding region with a similar or identical name, but [has] the appropriate note text.";
// Variant for features that lack the appropriate note text.
299 static const char*
kOverlap2 =
"[n] coding region[s] overlap[S] another coding region with a similar or identical name and [does] not have the appropriate note text.";
303 if (products.
find(feat) == products.
end()) {
304 string name =
context.GetProdForFeature(*feat);
307 return products[feat];
315 const auto& cds =
context.FeatCDS();
317 for (
size_t i = 0;
i < cds.size();
i++) {
318 const CSeq_loc& loc_i = cds[
i]->GetLocation();
320 for (
size_t j =
i + 1; j < cds.size(); j++) {
321 const CSeq_loc& loc_j = cds[j]->GetLocation();
326 if (prod_i.empty()) {
358 context.ReplaceSeq_feat(*obj, *sf, *new_feat);
375 auto mol_info =
context.GetMolinfo();
380 m_Objs[
"[n] partial CDS[s] in complete sequence[s]"].Add(*
context.SeqFeatObjRef(feat));
390 if (feat.GetData().IsRna() && !
context.IsPseudo(feat)) {
392 switch (feat.GetData().GetSubtype()) {
405 const CRNA_ref & rna_ref = feat.GetData().GetRna();
423 const CRNA_ref & rna_ref = feat.GetData().GetRna();
429 switch (rna_ext.
Which()) {
451 if (!feat.GetNamedQual(
"product").empty()) {
456 m_Objs[
"[n] RNA feature[s] [has] no product and [is] not pseudo"].Add(*
context.SeqFeatObjRef(feat),
false);
// Report message texts for coding regions that are completely contained in
// another coding region. kContained is also used as the m_Objs key whose
// exported subitems become m_ReportItems further down in this file.
// NOTE(review): the bracketed tokens ([n], [s], [is]) look like placeholders
// expanded by the report machinery -- confirm; the strings are runtime text
// and must not be altered.

// General message: completely contained in another coding region.
471 static const char*
kContained =
"[n] coding region[s] [is] completely contained in another coding region.";
// Variant for contained coding regions that have a note.
472 static const char*
kContainedNote =
"[n] coding region[s] [is] completely contained in another coding region, but have note.";
// Variant for containment on the same strand.
473 static const char*
kContainedSame =
"[n] coding region[s] [is] completely contained in another coding region on the same strand.";
// Variant for containment on the opposite strand.
474 static const char*
kContainedOpps =
"[n] coding region[s] [is] completely contained in another coding region, but on the opposite strand.";
483 const auto& cds =
context.FeatCDS();
484 for (
size_t i = 0;
i < cds.size();
i++) {
485 const CSeq_loc& loc_i = cds[
i]->GetLocation();
487 for (
size_t j =
i + 1; j < cds.size(); j++) {
488 const CSeq_loc& loc_j = cds[j]->GetLocation();
510 m_ReportItems = m_Objs[
kContained].Export(*this)->GetSubitems();
523 new_feat->
SetData().SetImp().SetKey(
"misc_feature");
525 if (stringobj && !stringobj->
Value.empty()) {
528 context.ReplaceSeq_feat(*obj, *sf, *new_feat);
536 static const char* kMsg =
"[n] sequence[s] [has] a zero basecount for a nucleotide";
542 m_Objs[kMsg][
"[n] sequence[s] [has] no As"].Ext().Add(*
context.BioseqObjRef());
545 m_Objs[kMsg][
"[n] sequence[s] [has] no Cs"].Ext().Add(*
context.BioseqObjRef());
548 m_Objs[kMsg][
"[n] sequence[s] [has] no Gs"].Ext().Add(*
context.BioseqObjRef());
551 m_Objs[kMsg][
"[n] sequence[s] [has] no Ts"].Ext().Add(*
context.BioseqObjRef());
563 if (
set.IsSetClass()) {
564 switch (
set.GetClass()) {
570 m_Objs[
"[n] set[s] [is] of type eco, mut, phy or pop"].Add(*
context.BioseqSetObjRef(
true));
594 auto all_feat =
context.GetAllFeat();
595 if (all_feat.begin() == all_feat.end()) {
596 m_Objs[
"[n] bioseq[s] [has] no features"].Add(*
context.BioseqObjRef());
603 const int kSeqLength = 5000;
606 auto all_feat =
context.GetAllFeat();
607 if (all_feat.begin() == all_feat.end()) {
608 m_Objs[
"[n] bioseq[s] [is] longer than 5000nt and [has] no features"].Add(*
context.BioseqObjRef());
616 const int kSeqLength = 50000;
619 auto all_feat =
context.GetAllFeat();
620 if (all_feat.begin() == all_feat.end()) {
621 m_Objs[
"[n] bioseq[s] [is] longer than 50000nt and [has] no features"].Add(*
context.BioseqObjRef());
636 static const size_t TAIL = 30;
641 for (
size_t i = 0;
i < seq_data.length();
i++) {
642 if (seq_data[
i] ==
'A' || seq_data[
i] ==
'a') {
670 size_t stop = besh.GetInst_Length() - cut_from_end;
673 if (start < stop && start + len > stop) {
676 string seq_out = seq_in.substr(0, stop - start);
680 else if (start >= stop) {
681 seqmap_i = seqmap_i.
Remove();
698 if (feat.IsSetLocation()) {
700 for (; loc_ci; ++loc_ci) {
702 m_Objs[
"[n] feature[s] [has] ordered location[s]"].Add(*
context.SeqFeatObjRef(feat, &feat));
715 while (new_loc_creator) {
729 context.ReplaceSeq_feat(*obj, *sf, *new_feat);
742 const CGene_ref& gene_ref = feat->GetData().GetGene();
745 m_Objs[
"[n] gene[s] [has] no locus tag[s]."].Fatal().Add(*
context.SeqFeatObjRef(*feat));
770 if (feat.IsSetData() && feat.GetData().IsGene()) {
771 const CGene_ref& gene_ref = feat.GetData().GetGene();
776 m_Objs[
"None of [n] gene[s] has locus tag."].Fatal().Add(*
context.SeqFeatObjRef(feat));
800 const CGene_ref& gene_ref = feat.GetData().GetGene();
811 if (!locus_tag.empty() && !
context.IsBadLocusTagFormat(locus_tag)) {
818 ss <<
"[n] feature[s] [has] locus tag prefix [(]" << prefix <<
".";
819 m_Objs[ss.str()].Add(*
context.SeqFeatObjRef(feat));
829 if (m_Objs.GetMap().size() > 1) {
873 for (
const auto&
id : bioseq.
GetId()) {
874 switch (id->Which()) {
888 const auto& genes =
context.FeatGenes();
890 const CGene_ref& gene_ref = gene->GetData().GetGene();
898 if (!locus_tag.empty() &&
context.IsBadLocusTagFormat(locus_tag)) {
899 m_Objs[
"[n] locus tag[s] [is] incorrectly formatted."].Fatal().Add(*
context.SeqFeatObjRef(*gene));
911 m_Objs[
"[n] segset[s] [is] present"].Add(*
context.BioseqSetObjRef());
924 for (
const auto& ann : bioseq.
GetAnnot()) {
925 if (ann->IsGraph()) {
937 size_t q = m_Objs[
"q"].GetCount();
938 size_t n = m_Objs[
"t"].GetCount() - q;
941 ret[
"Quality scores are missing on some(" + to_string(
n) +
") sequences"];
955 m_Objs[
"[n] bacterial sequence[s] [has] mRNA features"].Add(*
context.SeqFeatObjRef(feat));
// Report message text for features containing invalid BGPIPE qualifiers.
// NOTE(review): the place where this message is recorded is not visible in
// this view; the bracketed tokens look like report placeholders -- confirm.
964 static const string kDiscMessage =
"[n] feature[s] contain[S] invalid BGPIPE qualifiers";
971 if (feat.GetExcept_text() ==
"ribosomal slippage" && feat.IsSetComment() && feat.GetComment().find(
"programmed frameshift") != string::npos) {
979 const CCdregion & cdregion = feat.GetData().GetCdregion();
1024 if (feat.IsSetData() && feat.GetData().IsCdregion()) {
1030 const string& locus = gene.
GetLocus();
1031 string product =
context.GetProdForFeature(feat);
1032 genes[locus].push_back(make_pair(
context.SeqFeatObjRef(feat), product));
1043 for (
auto& gene : genes) {
1044 if (gene.second.size() > 1) {
1045 TGenesList::const_iterator cur_gene = gene.second.cbegin();
1046 const string& product = cur_gene->second;
1048 for (++cur_gene; cur_gene != gene.second.cend(); ++cur_gene) {
1049 const string& cur_product = cur_gene->second;
1050 if (product != cur_product) {
1056 string sub =
"[n] coding regions have the same gene name (" + gene.first +
") as another coding region but a different product";
1057 for (
auto& rec : gene.second) {
1058 m_Objs[
"[n] coding region[s] [has] the same gene name as another coding region but a different product"][sub].Ext().Add(*rec.first,
false);
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_set_EditHandle –.
TSeqPos GetLength(void) const
bool IsSetLength(void) const
bool IsSkippable(void) const
TGeneLocusMap m_GeneLocusMap
static string GetBiomolName(CMolInfo::TBiomol biomol)
@RNA_ref.hpp User-defined methods of the data storage class.
string GetRnaProductName(void) const
virtual vector< CRef< CReportItem > > GetSubitems() const =0
CRef< CReportItem > Export(CDiscrepancyCore &test, bool unique=true) const
Non-const iterator over CSeqMap (allows to edit the sequence).
namespace ncbi::objects::
static bool IsAa(EMol mol)
static string GetMoleculeClass(EMol mol)
static bool IsNa(EMol mol)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
bool Match(const CMatchString &str) const
void SetMatch_text(const TMatch_text &value)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
static const char * kRRNASpacer[]
static const char * kContainedOpps
static bool HasOverlapNote(const CSeq_feat &feat)
static bool HasContainedNote(const CSeq_feat &feat)
static const char * kIgnoreSimilarProductWords[]
static const string kDiscMessage
static bool SetOverlapNote(CSeq_feat &feat)
static const char * kContainedNote
static const char * kOverlap2
static const char * kOverlap1
static const char * kSimilarProductWords[]
static const size_t kNumIgnoreSimilarProductWords
static const string kInconsistent_Moltype
static bool ShouldIgnore(const string &product)
static const char * kOverlap0
static bool StrandsMatch(ENa_strand strand1, ENa_strand strand2)
static const char * kContainedSame
static const char * kContained
static const CSeq_id * GetProteinId(const CBioseq &seq)
static const size_t kRRNASpacer_len
static const string kOverlappingCDSNoteText
static bool ProductNamesAreSimilar(const string &product1, const string &product2)
static string GetProdName(const CSeq_feat *feat, map< const CSeq_feat *, string > &products, CDiscrepancyContext &context)
static const size_t kNumSimilarProductWords
#define DISCREPANCY_AUTOFIX(name)
#define DISCREPANCY_CASE(name, type, group, descr)
#define DISCREPANCY_SUMMARIZE(name)
unsigned int TSeqPos
Type for sequence locations and lengths.
constexpr size_t ArraySize(const Element(&)[Size])
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
CRef< CSeq_loc > MakeSeq_loc(EMakeType make_type=eMake_CompactType) const
return constructed CSeq_loc with all changes
void Delete(void)
Delete current element, and make iterator to point to the next element.
ENa_strand GetStrand(void) const
Get the location's strand.
bool HasChanges(void) const
return true if any part was changed since initialization
const CSeq_loc & GetEmbeddingSeq_loc(void) const
Get the nearest seq-loc containing the current range.
@ eEmpty_Allow
ignore empty locations
@ eMake_PreserveType
use most compact Seq-loc type (default)
@ eContains
First CSeq_loc contains second.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eNoOverlap
CSeq_locs do not overlap or abut.
void SetClass(TClass v) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
CSeqMap_I & Remove(void)
Remove current segment.
void GetSequence(string &buffer, CSeqUtil::ECoding buffer_coding) const
Get current sequence as a string with the selected encoding.
const CSeq_data & GetData(void) const
will allow only regular data segments (whole, plus strand)
SSeqMapSelector & SetFlags(TFlags flags)
Select segment type(s)
void SetSequence(const string &buffer, CSeqUtil::ECoding buffer_coding, CSeq_data::E_Choice seq_data_coding)
Set sequence data.
TSeqPos GetLength(void) const
return length of current segment
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eNocase
Case insensitive compare.
bool IsSetPseudo(void) const
pseudogene. Check if a value has been assigned to Pseudo data member.
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
bool IsSetLocus(void) const
Official gene symbol. Check if a value has been assigned to Locus data member.
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
const TLocus & GetLocus(void) const
Get the Locus member data.
TPseudo GetPseudo(void) const
Get the Pseudo member data.
void SetCase_sensitive(TCase_sensitive value)
Assign a value to Case_sensitive data member.
void SetWhole_word(TWhole_word value)
Assign a value to Whole_word data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSetExt(void) const
generic fields for ncRNA, tmRNA, miscRNA. Check if a value has been assigned to Ext data member.
const TGen & GetGen(void) const
Get the variant data.
const TName & GetName(void) const
Get the variant data.
const TExt & GetExt(void) const
Get the Ext member data.
@ e_Name
for naming "other" type
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
bool IsSetData(void) const
the specific data. Check if a value has been assigned to Data data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
const TLocation & GetLocation(void) const
Get the Location member data.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
const TAa & GetAa(void) const
Get the Aa member data.
TNcbieaa GetNcbieaa(void) const
Get the variant data.
const TComment & GetComment(void) const
Get the Comment member data.
const TGene & GetGene(void) const
Get the variant data.
ENa_strand
strand of nucleic acid
bool IsGeneral(void) const
Check if variant General is selected.
const TGeneral & GetGeneral(void) const
Get the variant data.
bool IsNull(void) const
Check if variant Null is selected.
@ e_Other
for historical reasons, 'other' = 'refseq'
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_genbank
converted genbank
@ eClass_segset
segmented sequence + parts
const TInst & GetInst(void) const
Get the Inst member data.
bool CanGetAnnot(void) const
Check if it is safe to call GetAnnot method.
bool IsSetMol(void) const
Check if a value has been assigned to Mol data member.
const TSource & GetSource(void) const
Get the variant data.
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
const TAnnot & GetAnnot(void) const
Get the Annot member data.
const TId & GetId(void) const
Get the Id member data.
bool IsSetInst(void) const
the sequence data. Check if a value has been assigned to Inst data member.
TLength GetLength(void) const
Get the Length member data.
TMol GetMol(void) const
Get the Mol member data.
TBiomol GetBiomol(void) const
Get the Biomol member data.
bool IsSetId(void) const
equivalent identifiers. Check if a value has been assigned to Id data member.
const TMolinfo & GetMolinfo(void) const
Get the variant data.
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
E_Choice Which(void) const
Which variant is currently selected.
@ eCompleteness_complete
complete biological entity
void AddComment(CSeq_feat &feat, const string &comment)
const struct ncbi::grid::netcache::search::fields::SIZE size
GenericValue< UTF8<> > Value
GenericValue with UTF8 encoding.
#define FOR_EACH_CODEBREAK_ON_CDREGION(Itr, Var)
FOR_EACH_CODEBREAK_ON_CDREGION EDIT_EACH_CODEBREAK_ON_CDREGION.
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define RAW_FIELD_IS_EMPTY_OR_UNSET(Var, Fld)
RAW_FIELD_IS_EMPTY_OR_UNSET macro.
#define GET_FIELD_OR_DEFAULT(Var, Fld, Dflt)
GET_FIELD_OR_DEFAULT base macro.
#define FIELD_EQUALS(Var, Fld, Value)
FIELD_EQUALS base macro.
#define STRING_FIELD_NOT_EMPTY(Var, Fld)
STRING_FIELD_NOT_EMPTY base macro.
#define GET_STRING_FLD_OR_BLANK(Var, Fld)
GET_STRING_FLD_OR_BLANK base macro.
vector< pair< size_t, size_t > > NRuns
Selector used in CSeqMap methods returning iterators.
static CS_CONTEXT * context