50 using namespace sequence;
65 m_HasDashXStart =
false;
67 m_InternalStopCodons = 0;
68 m_TranslTerminalX = 0;
70 m_UnableToTranslate =
false;
72 m_UnparsedTranslExcept =
false;
73 m_NumNonsenseIntrons = 0;
74 m_HasException =
false;
82 bool ignore_exceptions,
84 bool standalone_annot,
89 bool is_nt_or_ng_or_nw,
102 for (
auto it = feat.
GetQual().begin(); it != feat.
GetQual().end(); it++) {
109 bool has_errors =
false, unclassified_except =
false,
110 mismatch_except =
false, frameshift_except =
false,
111 rearrange_except =
false, product_replaced =
false,
112 mixed_population =
false, low_quality =
false,
113 report_errors =
true, other_than_mismatch =
false,
114 rna_editing =
false, transcript_or_proteomic =
false;
117 if (!ignore_exceptions &&
132 transcript_or_proteomic);
135 m_HasException = !report_errors;
138 m_ProblemFlags |= x_CheckCDSFrame(feat, scope);
142 for (
auto it = feat.
GetQual().begin(); it != feat.
GetQual().end(); it++) {
143 if ((*it)->IsSetQual() &&
NStr::Equal((*it)->GetQual(),
"transl_except")) {
144 m_UnparsedTranslExcept =
true;
150 bool got_stop =
false;
155 m_UnableToTranslate =
true;
163 m_TranslStart = transl_prot.c_str()[0];
164 m_ProblemFlags |= eCDSTranslationProblem_BadStart;
167 bool no_product =
true;
169 if (!m_UnableToTranslate) {
172 m_TranslExceptProblems = x_GetTranslExceptProblems(feat, loc_handle, scope, is_refseq);
174 m_NumNonsenseIntrons = x_CountNonsenseIntrons(feat, scope);
175 if (x_ProteinHasTooManyXs(transl_prot)) {
176 m_ProblemFlags |= eCDSTranslationProblem_TooManyX;
180 if (m_InternalStopCodons > 5) {
188 const CSeq_id* protid =
nullptr;
193 m_ProblemFlags |= eCDSTranslationProblem_UnableToFetch;
194 }
else if (protid && (!far_fetch_cds || feat.
IsSetProduct())) {
196 }
else if (!standalone_annot && transl_prot.length() > 6) {
197 if (!is_nt_or_ng_or_nw && (!is_nc || !single_seq)) {
198 m_ProblemFlags |= eCDSTranslationProblem_NoProtein;
203 bool show_stop =
true;
205 if (prot_handle && prot_handle.
IsAa()) {
214 if (m_TransLen == m_ProtLen || has_accession) {
215 if (prot_vec.
size() > 0 && transl_prot.length() > 0 &&
216 prot_vec[0] != transl_prot[0]) {
220 m_ProblemFlags |= eCDSTranslationProblem_ShouldStartPartial;
221 }
else if (transl_prot[0] ==
'-' || transl_prot[0] ==
'X') {
222 m_HasDashXStart =
true;
225 m_Mismatches = x_GetTranslationMismatches(feat, prot_vec, transl_prot, has_accession);
229 m_Mismatches.size() == 0) {
232 if (!no_beg && !no_end) {
234 if (is_gpipe && is_genomic) {
238 m_ProblemFlags |= eCDSTranslationProblem_ShouldBePartialButIsnt;
240 m_ProblemFlags |= eCDSTranslationProblem_ShouldNotBePartialButIs;
248 if (!transl_prot.empty()) {
250 m_TranslTerminalX = x_CountTerminalXs(transl_prot, (got_stop && (transl_prot.length() == prot_vec.
size() + 1)));
251 m_ProdTerminalX = x_CountTerminalXs(prot_vec);
256 x_GetCdTransErrors(feat, prot_handle, show_stop, got_stop, scope);
259 if (x_JustifiesException()) {
261 other_than_mismatch =
true;
262 }
else if (m_Mismatches.size() > 0) {
266 if (!report_errors && !no_product) {
268 if (!frameshift_except && !rearrange_except && !mixed_population && !low_quality) {
269 m_ProblemFlags |= eCDSTranslationProblem_UnnecessaryException;
271 }
else if (unclassified_except && !other_than_mismatch) {
272 if (m_Mismatches.size() * 50 <= m_ProtLen) {
273 m_ProblemFlags |= eCDSTranslationProblem_ErroneousException;
275 }
else if (!product_replaced && !transcript_or_proteomic && !rna_editing) {
276 m_ProblemFlags |= eCDSTranslationProblem_UnqualifiedException;
288 if (!got_stop && !no_end) {
289 m_ProblemFlags |= eCDSTranslationProblem_NoStop;
290 }
else if (got_stop && no_end) {
291 m_ProblemFlags |= eCDSTranslationProblem_StopPartial;
292 }
else if (got_stop && !no_end) {
293 m_RaggedLength = x_CheckForRaggedEnd(feat, scope);
309 bool nonsense_intron =
false;
315 tmp_cds->
SetLocation().SetInt().SetId().Assign(
id);
319 tmp_cds->
SetData().SetCdregion();
324 bool alt_start =
false;
329 nonsense_intron =
true;
333 return nonsense_intron;
339 TSeqPos last_start = 0, last_stop = 0, start, stop;
352 for (CSeq_loc_CI curr(loc); curr; ++curr) {
353 start = curr.GetRange().GetFrom();
354 stop = curr.GetRange().GetTo();
358 if (last_start - stop == 4) {
359 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, scope)) {
364 if (start - last_stop == 4) {
365 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, scope)) {
383 vector<CRef<CSeq_loc> > intron_locs;
385 TSeqPos last_start = 0, last_stop = 0, start, stop;
397 for (CSeq_loc_CI curr(loc); curr; ++curr) {
398 start = curr.GetRange().GetFrom();
399 stop = curr.GetRange().GetTo();
403 if (last_start - stop == 4) {
404 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, stop + 1, last_start - 1, strand, &scope)) {
406 id->Assign(curr.GetSeq_id());
407 CRef<CSeq_loc> intron_loc(
new CSeq_loc(*
id, stop + 1, last_start - 1, strand));
408 intron_locs.push_back(intron_loc);
412 if (start - last_stop == 4) {
413 if (x_IsThreeBaseNonsense(feat, curr.GetSeq_id(), cdr, last_stop + 1, start - 1, strand, &scope)) {
415 id->Assign(curr.GetSeq_id());
416 CRef<CSeq_loc> intron_loc(
new CSeq_loc(*
id, last_stop + 1, start - 1, strand));
417 intron_locs.push_back(intron_loc);
437 size_t num_x = 0, num_nonx = 0;
439 ITERATE(
string, it, transl_prot) {
448 if (num_x > num_nonx) {
457 const string& except_text,
458 bool& unclassified_except,
459 bool& mismatch_except,
460 bool& frameshift_except,
461 bool& rearrange_except,
462 bool& product_replaced,
463 bool& mixed_population,
466 bool& transcript_or_proteomic)
469 unclassified_except =
true;
472 mismatch_except =
true;
475 frameshift_except =
true;
478 rearrange_except =
true;
481 product_replaced =
true;
484 mixed_population =
true;
493 transcript_or_proteomic =
true;
515 rval |= eCDSTranslationProblem_FrameNotPartial;
518 ||
NStr::Find(comment_text,
"coding region disrupted by sequencing gap") != string::npos) {
521 rval |= eCDSTranslationProblem_FrameNotConsensus;
531 if (m_ProblemFlags & eCDSTranslationProblem_FrameNotPartial ||
532 m_ProblemFlags & eCDSTranslationProblem_FrameNotConsensus ||
533 m_ProblemFlags & eCDSTranslationProblem_NoStop ||
534 m_ProblemFlags & eCDSTranslationProblem_StopPartial ||
535 m_ProblemFlags & eCDSTranslationProblem_PastStop ||
536 m_ProblemFlags & eCDSTranslationProblem_ShouldStartPartial ||
537 m_ProblemFlags & eCDSTranslationProblem_BadStart ||
538 m_ProblemFlags & eCDSTranslationProblem_NoProtein ||
539 m_ProtLen != m_TransLen ||
540 m_InternalStopCodons > 0 ||
541 m_RaggedLength > 0 || m_HasDashXStart ||
542 m_UnableToTranslate) {
544 }
else if (x_JustifiesException(m_TranslExceptProblems)) {
555 size_t transl_terminal_x = 0;
556 size_t i = transl_prot.length() - 1;
557 if (
i > 0 && transl_prot[
i] ==
'*' && skip_stop) {
561 if (transl_prot[
i] ==
'X') {
568 if (
i == 0 && transl_prot[0] ==
'X') {
571 return transl_terminal_x;
576 size_t prod_terminal_x = 0;
578 while (prod_len > 0 && prot_vec[prod_len] ==
'X') {
582 if (prod_len == 0 && prot_vec[prod_len] ==
'X') {
585 return prod_terminal_x;
598 if (
len == prot_len || has_accession) {
599 if (
len > prot_len) {
606 if (t_res != p_res) {
611 }
else if (t_res ==
'-') {
613 mismatches.push_back({ p_res, t_res,
i });
616 mismatches.push_back({ p_res, t_res,
i });
632 if (
NStr::FindNoCase(except_text,
"translation initiation by tRNA-Leu at CUG codon") ==
NPOS)
return false;
634 for (
auto it = feat.
GetQual().begin(); it != feat.
GetQual().end(); it++) {
654 bool alt_start =
false;
661 tmp_cds->
SetData().SetCdregion();
666 tmp_cds->
SetData().SetCdregion().SetCode().Assign(cdregion.
GetCode());
670 tmp_cds->
SetData().SetCdregion().ResetFrame();
675 if (!(*cbr)->IsSetLoc()) {
692 TSeqPos to = from + codon_length - 1;
695 if (codon_length == 3 ||
696 ((codon_length == 1 || codon_length == 2) && to ==
len - 1)) {
709 if ((from % 3) != start_pos) {
710 problems.push_back({ eTranslExceptPhase, 0, 0 });
713 if ((*cbr)->IsSetAa() && (*cbr)->IsSetLoc()) {
720 *tmp_cds, *scope, cb_trans,
726 size_t prot_pos = from / 3;
728 unsigned char ex = 0;
729 vector<char> seqData;
731 bool not_set =
false;
733 switch ((*cbr)->GetAa().Which()) {
735 str = (*cbr)->GetAa().GetNcbi8aa();
740 str = (*cbr)->GetAa().GetNcbi8aa();
745 seqData.push_back((*cbr)->GetAa().GetNcbieaa());
759 if (prot_pos == 0 && ex !=
'M') {
760 if (prot_pos == 0 && ex ==
'L' &&
x_LeuCUGstart(feat) && is_refseq) {
763 problems.push_back({ eTranslExceptSuspicious, ex, prot_pos });
769 if (from_end < 2 &&
NStr::Equal(except_char,
"*")) {
772 if (prot_pos == 0 && ex ==
'L' &&
x_LeuCUGstart(feat) && is_refseq) {
775 problems.push_back({ eTranslExceptUnnecessary, ex, prot_pos });
806 problems.push_back({ eTranslExceptUnexpected, ex, prot_pos });
824 for (
auto it = problems.begin(); it != problems.end(); it++) {
825 if (it->problem == eTranslExceptPhase) {
835 CSeq_loc_CI loc_it(loc);
853 if (end < seq_len - 1) {
928 int ragged =
len % 3;
936 SRelLoc rl(loc, (*cbr)->GetLoc(), scope);
938 if ((*rit)->GetTo() > last_pos) {
939 last_pos = (*rit)->GetTo();
947 if ((codon_length == 0 || codon_length == 1) &&
948 last_pos ==
len - 1) {
965 "adjusted for low-quality genome",
966 "annotated by transcript or proteomic data",
967 "artificial frameshift",
968 "reasons given in citation",
969 "transcribed product replaced",
970 "unclassified transcription discrepancy",
1003 bool ignore_exceptions,
1020 size_t exception_flags = 0;
1022 if (!ignore_exceptions &&
1027 bool has_errors =
false, other_than_mismatch =
false;
1068 if (nuc_len != rna_len) {
1070 other_than_mismatch =
true;
1071 if (nuc_len < rna_len) {
1072 size_t count_a = 0, count_no_a = 0;
1075 if ((*iter ==
'A') || (*iter ==
'a')) {
1081 if (count_a < (19 * count_no_a)) {
1085 }
else if (count_a > 0 && count_no_a == 0) {
1087 other_than_mismatch =
true;
1089 if (is_gpipe && is_genomic) {
1096 if (report_errors) {
1101 rna_len = nuc_len =
min(nuc_len, rna_len);
1104 if (report_errors) {
1110 if (rna_len == nuc_len && nuc_len > 0) {
1115 while ((nuc_ci && rna_ci) && (nuc_ci.
GetPos() < nuc_len)) {
1116 if (*nuc_ci != *rna_ci) {
1123 if (mismatches > 0) {
1133 }
catch (
const std::exception&) {
1136 if (!report_errors) {
1140 if (mismatches * 50 <= total) {
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
vector< STranslExceptProblem > TTranslExceptProblems
static size_t x_CountTerminalXs(const string &transl_prot, bool skip_stop)
void CalculateTranslationProblems(const CSeq_feat &feat, CBioseq_Handle loc_handle, CBioseq_Handle prot_handle, bool ignore_exceptions, bool far_fetch_cds, bool standalone_annot, bool single_seq, bool is_gpipe, bool is_genomic, bool is_refseq, bool is_nt_or_ng_or_nw, bool is_nc, bool has_accession, CScope *scope)
static vector< CRef< CSeq_loc > > GetNonsenseIntrons(const CSeq_feat &feat, CScope &scope)
CCDSTranslationProblems()
static bool x_ProteinHasTooManyXs(const string &transl_prot)
vector< STranslationMismatch > TTranslationMismatches
static TTranslationMismatches x_GetTranslationMismatches(const CSeq_feat &feat, const CSeqVector &prot_vec, const string &transl_prot, bool has_accession)
static int x_CheckForRaggedEnd(const CSeq_feat &feat, CScope *scope)
void x_GetCdTransErrors(const CSeq_feat &feat, CBioseq_Handle product, bool show_stop, bool got_stop, CScope *scope)
bool x_JustifiesException() const
static bool x_IsThreeBaseNonsense(const CSeq_feat &feat, const CSeq_id &id, const CCdregion &cdr, TSeqPos start, TSeqPos stop, ENa_strand strand, CScope *scope)
static size_t x_CheckCDSFrame(const CSeq_feat &feat, CScope *scope)
static bool x_Is5AtEndSpliceSiteOrGap(const CSeq_loc &loc, CScope &scope)
TTranslExceptProblems x_GetTranslExceptProblems(const CSeq_feat &feat, CBioseq_Handle loc_handle, CScope *scope, bool is_refseq)
static void x_GetExceptionFlags(const string &except_text, bool &unclassified_except, bool &mismatch_except, bool &frameshift_except, bool &rearrange_except, bool &product_replaced, bool &mixed_population, bool &low_quality, bool &rna_editing, bool &transcript_or_proteomic)
static size_t x_CountNonsenseIntrons(const CSeq_feat &feat, CScope *scope)
@Gb_qual.hpp User-defined methods of the data storage class.
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
namespace ncbi::objects::
static const char location[]
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
ENa_strand GetStrand(void) const
Get the location's strand.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
TSeqPos LocationOffset(const CSeq_loc &outer, const CSeq_loc &inner, EOffsetType how=eOffset_FromStart, CScope *scope=0)
returns (TSeqPos)-1 if the locations don't overlap
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
@ eOffset_FromEnd
relative to end of location
@ eOffset_FromStart
For positive-orientation strands, start = left and end = right; for reverse-orientation strands,...
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
vector< CRef< TRange > > TRanges
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
@ fIs5PrimePartial
= 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
TSeqPos GetBioseqLength(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetPos(void) const
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
void SetCoding(TCoding coding)
CScope & GetScope(void) const
void Reset(void)
Reset reference object.
static TThisType GetEmpty(void)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static int Compare(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Compare of a substring with another string.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
bool IsSetQual(void) const
qualifiers Check if a value has been assigned to Qual data member.
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TQual & GetQual(void) const
Get the Qual member data.
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
const TLocation & GetLocation(void) const
Get the Location member data.
TFrame GetFrame(void) const
Get the Frame member data.
const TData & GetData(void) const
Get the Data member data.
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
bool CanGetPartial(void) const
Check if it is safe to call GetPartial method.
const TCode & GetCode(void) const
Get the Code member data.
void SetData(TData &value)
Assign a value to Data data member.
const TCdregion & GetCdregion(void) const
Get the variant data.
bool CanGetExcept_text(void) const
Check if it is safe to call GetExcept_text method.
TPseudo GetPseudo(void) const
Get the Pseudo member data.
const TProduct & GetProduct(void) const
Get the Product member data.
bool IsSetQual(void) const
Check if a value has been assigned to Qual data member.
bool CanGetExcept(void) const
Check if it is safe to call GetExcept method.
const TComment & GetComment(void) const
Get the Comment member data.
TPartial GetPartial(void) const
Get the Partial member data.
bool CanGetCode(void) const
Check if it is safe to call GetCode method.
TExcept GetExcept(void) const
Get the Except member data.
bool CanGetProduct(void) const
Check if it is safe to call GetProduct method.
const TQual & GetQual(void) const
Get the Qual member data.
const TCode_break & GetCode_break(void) const
Get the Code_break member data.
bool IsSetProduct(void) const
product of process Check if a value has been assigned to Product data member.
bool IsSetCode_break(void) const
individual exceptions Check if a value has been assigned to Code_break data member.
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
bool CanGetPseudo(void) const
Check if it is safe to call GetPseudo method.
bool IsSetLocation(void) const
feature made from Check if a value has been assigned to Location data member.
@ eFrame_three
reading frame
@ e_Ncbieaa
ASCII value of NCBIeaa code.
ENa_strand
strand of nucleic acid
@ e_Ncbieaa
extended ASCII 1 letter aa codes
range(_Ty, _Ty) -> range< _Ty >
static const char * str(char *buf, int n)
Location relative to a base Seq-loc: one (usually) or more ranges of offsets.
@ eMRNAExcept_Unclassified
@ eMRNAExcept_ProductReplaced
CStaticArraySet< const char *, PCase_CStr > TBypassMrnaTransCheckSet
static bool x_LeuCUGstart(const CSeq_feat &feat)
DEFINE_STATIC_ARRAY_MAP(TBypassMrnaTransCheckSet, sc_BypassMrnaTransCheck, sc_BypassMrnaTransCheckText)
static const char *const sc_BypassMrnaTransCheckText[]
size_t InterpretMrnaException(const string &except_text)
size_t GetMRNATranslationProblems(const CSeq_feat &feat, size_t &mismatches, bool ignore_exceptions, CBioseq_Handle nuc, CBioseq_Handle rna, bool far_fetch, bool is_gpipe, bool is_genomic, CScope *scope)
@ eMRNAProblem_UnnecessaryException
@ eMRNAProblem_UnableToFetch
@ eMRNAProblem_TranscriptLenLess
@ eMRNAProblem_PolyATail95
@ eMRNAProblem_TranscriptLenMore
@ eMRNAProblem_ProductReplaced
@ eMRNAProblem_ErroneousException
@ eMRNAProblem_PolyATail100