NCBI C++ ToolKit
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilities.hpp 101299 2023-11-28 18:18:38Z stakhovv $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  * Definition for utility classes and functions.
30  */
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistr.hpp>
41 //#include <objects/taxon3/T3Data.hpp>
42 //#include <objects/taxon3/Taxon3_reply.hpp>
46 #include <objmgr/seq_vector.hpp>
48 #include <serial/iterator.hpp>
50 #include <vector>
51 #include <list>
57 class CGb_qual;
58 class CScope;
59 class CSeq_entry;
60 class CT3Data;
61 class CT3Reply;
63 BEGIN_SCOPE(validator)
66 // =============================================================================
67 // Functions
68 // =============================================================================
70 bool IsClassInEntry(const CSeq_entry& se, CBioseq_set::EClass clss);
71 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope);
72 bool IsBlankStringList(const list< string >& str_list);
73 TGi GetGIForSeqId(const CSeq_id& id);
77 CSeqVector GetSequenceFromLoc(const CSeq_loc& loc, CScope& scope,
82  bool product = false);
84 string GetSequenceStringFromLoc(const CSeq_loc& loc, CScope& scope);
/// Tests whether a residue code lies in the accepted range.
/// @param residue Residue code to test [in]
/// @return Boolean: true when residue is at most 250, false for any larger value
// NOTE(review): the meaning of the codes above 250 is not visible in this
// header -- presumably reserved/special values; confirm against the callers.
87 inline
88 bool IsResidue(unsigned char residue) { return residue <= 250; }
91 string GetAccessionFromBioseq(const CBioseq& bioseq, int* version);
92 string GetAccessionFromBioseqSet(const CBioseq_set& bsst, int* version);
105 typedef enum {
114 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version);
116 bool s_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna);
117 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2);
118 bool s_StringHasPMID (const string& str);
119 bool HasBadCharacter (const string& str);
120 bool EndsWithBadCharacter (const string& str);
122 typedef enum {
132 int CheckDate (const CDate& date, bool require_full_date = false);
133 bool NCBI_VALIDATOR_EXPORT IsDateInPast(const CDate& date);
134 string GetDateErrorDescription (int flags);
136 bool IsBioseqTSA (const CBioseq& seq, CScope* scope);
138 #if 0
139 // disabled for now
140 bool IsNCBIFILESeqId (const CSeq_id& id);
141 #endif
143 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope);
144 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context);
145 string GetBioseqIdLabel(const CBioseq& sq);
147 bool NCBI_VALIDATOR_EXPORT HasECnumberPattern (const string& str);
149 bool SeqIsPatent (const CBioseq& seq);
150 bool SeqIsPatent (const CBioseq_Handle& seq);
152 bool s_PartialAtGapOrNs(CScope* scope, const CSeq_loc& loc, unsigned int tag, bool only_gap = false);
156 typedef enum {
164 (const CBioseq_Handle& bsh,
165  EBioseqEndIsType& begin_n,
166  EBioseqEndIsType& begin_gap,
167  EBioseqEndIsType& end_n,
168  EBioseqEndIsType& end_gap,
169  bool &begin_ambig,
170  bool &end_ambig);
172 bool ShouldCheckForNsAndGap(const CBioseq_Handle& bsh);
175 (const CSeqVector& vec,
176 EBioseqEndIsType& begin_n,
177 EBioseqEndIsType& begin_gap,
178 EBioseqEndIsType& end_n,
179 EBioseqEndIsType& end_gap,
180 bool& begin_ambig,
181 bool& end_ambig);
185 /// Indicates whether feature is a dicistronic gene
186 /// @param f Seq-feat-Handle [in]
187 /// @return Boolean
191 typedef enum {
201 /// Reports how two features duplicate each other
202 /// @param f1 Seq-feat-Handle [in]
203 /// @param f2 Seq-feat-Handle [in]
204 /// @return EDuplicateFeatureType return value indicates how features are duplicates
206  (const CSeq_feat_Handle& f1,
207  const CSeq_feat_Handle& f2,
208  bool check_partials = false,
209  bool case_sensitive = false);
211 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh);
212 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2);
214 // specific-host functions
215 /// returns true and error_msg will be empty, if specific host is valid
216 /// returns true and error_msg will be "Host is empty", if specific host is empty
217 /// returns false if specific host is invalid
218 bool NCBI_VALIDATOR_EXPORT IsSpecificHostValid(const string& host, string& error_msg);
219 /// returns the corrected specific host, if the specific host is invalid and can be corrected
220 /// returns an empty string, if the specific host is invalid and cannot be corrected
221 /// returns the original value with leading/trailing spaces removed, if the specific host is valid
222 string NCBI_VALIDATOR_EXPORT FixSpecificHost(const string& host);
224 bool NCBI_VALIDATOR_EXPORT IsCommonName (const CT3Data& data);
226 bool NCBI_VALIDATOR_EXPORT FindMatchInOrgRef (const string& str, const COrg_ref& org);
228 bool NCBI_VALIDATOR_EXPORT IsLikelyTaxname(const string& val);
229 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host = kEmptyStr);
232 // converts a pub title into the 'term' parameter of the CEutilsClient::Search method
233 void NCBI_VALIDATOR_EXPORT ConvertToEntrezTerm(string& title);
235 string NCBI_VALIDATOR_EXPORT TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start);
237 // if special text is found in a feature exception, translation errors will not be reported
238 bool NCBI_VALIDATOR_EXPORT ReportTranslationErrors(const string& except_text);
240 // checks to see if this feature would be reported as having a bad start codon
241 bool NCBI_VALIDATOR_EXPORT HasBadStartCodon(const CSeq_feat& feat, CScope &scope, bool ignore_exceptions);
243 // checks to see if this location and translation have a bad start codon
244 // note that this might not be reported if the feature is pseudo, or has an appropriate exception
245 bool NCBI_VALIDATOR_EXPORT HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot);
247 size_t CountInternalStopCodons(const string& transl_prot);
248 bool NCBI_VALIDATOR_EXPORT HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions);
251 bool HasBadProteinStart(const CSeqVector& sv);
254 size_t CountProteinStops(const CSeqVector& sv);
255 bool NCBI_VALIDATOR_EXPORT HasStopInProtein(const CSeq_feat& feat, CScope& scope);
257 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end);
258 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far);
259 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot);
260 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot);
261 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len);
262 bool NCBI_VALIDATOR_EXPORT HasNoStop(const CSeq_feat& feat, CScope* scope);
264 bool NCBI_VALIDATOR_EXPORT IsSequenceFetchable(const CSeq_id& id, CScope* scope = nullptr);
265 bool NCBI_VALIDATOR_EXPORT IsSequenceFetchable(const string& seq_id, CScope* scope = nullptr);
269 bool IsNTNCNWACAccession(const string& acc);
270 bool IsNTNCNWACAccession(const CSeq_id& id);
271 bool IsNTNCNWACAccession(const CBioseq& seq);
272 bool IsNG(const CSeq_id& id);
273 bool IsNG(const CBioseq& seq);
275 bool IsTemporary(const CSeq_id& id);
277 bool IsOrganelle(int genome);
283 bool ConsistentWithA(Char ch);
284 bool ConsistentWithC(Char ch);
285 bool ConsistentWithG(Char ch);
286 bool ConsistentWithT(Char ch);
290 END_SCOPE(validator)
294 #endif /* VALIDATOR___UTILITIES__HPP */
static CRef< CScope > m_Scope
CBioseq_Handle –.
CBioseq_set_Handle –.
Definition: Date.hpp:53
CFeat_id –.
Definition: Feat_id.hpp:66
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Base class for all serializable objects.
Definition: serialbase.hpp:150
CT3Reply –.
Definition: T3Reply.hpp:66
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static uch flags
Definition: t0006.c:12
vector< CSeq_id_Handle > TIds
Definition: scope.hpp:143
CSeqVector constructor flags.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
char Char
Alias for char.
Definition: ncbitype.h:93
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
Definition: ncbi_export.h:913
vector< CRef< CDbtag > > TDbxref
Definition: Seq_feat_.hpp:123
int len
static int version
Definition: mdb_load.c:29
const char * tag
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
bool IsBlankStringList(const list< string > &str_list)
Definition: utilities.cpp:114
CBioseq_Handle BioseqHandleFromLocation(CScope *m_Scope, const CSeq_loc &loc)
Definition: utilities.cpp:1277
bool IsDateInPast(const CDate &date)
Definition: utilities.cpp:826
CSeqVector GetSequenceFromLoc(const CSeq_loc &loc, CScope &scope, CBioseq_Handle::EVectorCoding coding=CBioseq_Handle::eCoding_Iupac)
Definition: utilities.cpp:203
bool ConsistentWithT(Char ch)
Definition: utilities.cpp:2886
string GetDateErrorDescription(int flags)
Definition: utilities.cpp:855
bool IsResidue(unsigned char residue)
Definition: utilities.hpp:88
void CalculateEffectiveTranslationLengths(const string &transl_prot, const CSeqVector &prot_vec, size_t &len, size_t &prot_len)
Definition: utilities.cpp:2597
CConstRef< CSeq_id > GetReportableSeqIdForAlignment(const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:399
CBioseq_set_Handle GetNucProtSetParent(const CBioseq_Handle &bioseq)
Definition: utilities.cpp:581
bool HasNoStop(const CSeq_feat &feat, CScope *scope)
Definition: utilities.cpp:2673
bool IsCommonName(const CT3Data &data)
Definition: utilities.cpp:1944
CScope::TIds GetSeqIdsForGI(TGi gi)
Definition: utilities.cpp:142
bool ShouldCheckForNsAndGap(const CBioseq_Handle &bsh)
Definition: utilities.cpp:1307
Definition: utilities.hpp:156
@ eBioseqEndIsType_Last
Definition: utilities.hpp:158
@ eBioseqEndIsType_None
Definition: utilities.hpp:157
@ eBioseqEndIsType_All
Definition: utilities.hpp:159
bool IsDicistronic(const CSeq_feat_Handle &f)
Definition: utilities.cpp:1825
bool HasECnumberPattern(const string &str)
Definition: utilities.cpp:1088
bool IsNTNCNWACAccession(const string &acc)
Definition: utilities.cpp:2755
CBioseq_Handle GetNucBioseq(const CBioseq_set_Handle &bioseq_set)
Definition: utilities.cpp:587
string GetBioseqIdLabel(const CBioseq &sq)
Definition: utilities.cpp:981
void AppendBioseqLabel(string &str, const CBioseq &sq, bool supress_context)
Definition: utilities.cpp:1064
CBioseq_Handle GetCDSProductSequence(const CSeq_feat &feat, CScope *scope, const CTSE_Handle &tse, bool far_fetch, bool &is_far)
Definition: utilities.cpp:2574
bool HasBadCharacter(const string &str)
Definition: utilities.cpp:755
CBioseq_set_Handle GetGenProdSetParent(const CBioseq_set_Handle &set)
Definition: utilities.cpp:570
bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
Definition: utilities.cpp:3050
bool HasMisSpellFlag(const CT3Data &data)
Definition: utilities.cpp:1966
string SpecificHostValueToCheck(const string &val)
Definition: utilities.cpp:2042
string GetAccessionFromBioseqSet(const CBioseq_set &bsst, int *version)
Definition: utilities.cpp:433
bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:3080
bool IsClassInEntry(const CSeq_entry &se, CBioseq_set::EClass clss)
Definition: utilities.cpp:79
bool EndsWithBadCharacter(const string &str)
Definition: utilities.cpp:768
bool s_FeatureIdsMatch(const CFeat_id &f1, const CFeat_id &f2)
Definition: utilities.cpp:717
bool PartialsSame(const CSeq_loc &loc1, const CSeq_loc &loc2)
Definition: utilities.cpp:1463
bool IsLocFullLength(const CSeq_loc &loc, const CBioseq_Handle &bsh)
Definition: utilities.cpp:1451
string GetSequenceStringFromLoc(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:175
string GetValidatorLocationLabel(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:958
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
EDuplicateFeatureType IsDuplicate(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool check_partials=false, bool case_sensitive=false)
Reports how two features duplicate each other.
Definition: utilities.cpp:1838
bool IsDicistronicGene(const CSeq_feat_Handle &f)
Indicates whether feature is a dicistronic gene.
Definition: utilities.cpp:1818
bool HasBadProteinStart(const CSeqVector &sv)
Definition: utilities.cpp:2467
bool ConsistentWithA(Char ch)
Definition: utilities.cpp:2871
void CheckBioseqEndsForNAndGap(const CBioseq_Handle &bsh, EBioseqEndIsType &begin_n, EBioseqEndIsType &begin_gap, EBioseqEndIsType &end_n, EBioseqEndIsType &end_gap, bool &begin_ambig, bool &end_ambig)
Definition: utilities.cpp:1422
bool s_StringHasPMID(const string &str)
Definition: utilities.cpp:727
CSeqVector GetSequenceFromFeature(const CSeq_feat &feat, CScope &scope, CBioseq_Handle::EVectorCoding coding=CBioseq_Handle::eCoding_Iupac, bool product=false)
Definition: utilities.cpp:214
bool DoesFeatureHaveUnnecessaryException(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2939
bool IsDeltaOrFarSeg(const CSeq_loc &loc, CScope *scope)
Definition: utilities.cpp:90
bool IsNG(const CSeq_id &id)
Definition: utilities.cpp:2790
bool ReportTranslationErrors(const string &except_text)
Definition: utilities.cpp:2341
bool HasInternalStop(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2416
string InterpretSpecificHostResult(const string &host, const CT3Reply &reply, const string &orig_host=kEmptyStr)
Definition: utilities.cpp:2085
Definition: utilities.hpp:122
@ eDateValid_bad_str
Definition: utilities.hpp:124
@ eDateValid_empty_date
Definition: utilities.hpp:130
@ eDateValid_valid
Definition: utilities.hpp:123
@ eDateValid_bad_year
Definition: utilities.hpp:125
@ eDateValid_bad_day
Definition: utilities.hpp:127
@ eDateValid_bad_other
Definition: utilities.hpp:129
@ eDateValid_bad_month
Definition: utilities.hpp:126
@ eDateValid_bad_season
Definition: utilities.hpp:128
bool ConsistentWithC(Char ch)
Definition: utilities.cpp:2876
bool IsSequenceFetchable(const CSeq_id &id, CScope *scope=nullptr)
Definition: utilities.cpp:2714
bool IsLikelyTaxname(const string &val)
Definition: utilities.cpp:2150
string GetAccessionFromBioseq(const CBioseq &bioseq, int *version)
Definition: utilities.cpp:427
bool s_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna)
bool SeqIsPatent(const CBioseq &seq)
Definition: utilities.cpp:1155
bool IsFarLocation(const CSeq_loc &loc, const CSeq_entry_Handle &seh)
Definition: utilities.cpp:159
string FixSpecificHost(const string &host)
returns the corrected specific host, if the specific host is invalid and can be corrected returns an ...
Definition: utilities.cpp:2189
TGi GetGIForSeqId(const CSeq_id &id)
Definition: utilities.cpp:125
size_t CountInternalStopCodons(const string &transl_prot)
Definition: utilities.cpp:2393
bool IsSpecificHostValid(const string &host, string &error_msg)
returns true and error_msg will be empty, if specific host is valid returns true and error_msg will b...
Definition: utilities.cpp:2182
CRef< CSeqVector > MakeSeqVectorForResidueCounting(const CBioseq_Handle &bsh)
Definition: utilities.cpp:2455
EAccessionFormatError ValidateAccessionString(const string &accession, bool require_version)
Definition: utilities.cpp:624
CBioseq_set_Handle GetSetParent(const CBioseq_set_Handle &set, CBioseq_set::TClass set_class)
Definition: utilities.cpp:532
void ConvertToEntrezTerm(string &title)
Definition: utilities.cpp:2220
Definition: utilities.hpp:191
@ eDuplicate_Duplicate
Definition: utilities.hpp:193
@ eDuplicate_DuplicateDifferentTable
Definition: utilities.hpp:195
@ eDuplicate_SameIntervalDifferentLabel
Definition: utilities.hpp:194
@ eDuplicate_Not
Definition: utilities.hpp:192
@ eDuplicate_SameIntervalDifferentLabelDifferentTable
Definition: utilities.hpp:196
const CSeq_feat::TDbxref TDbtags
Definition: utilities.hpp:199
bool IsTemporary(const CSeq_id &id)
Definition: utilities.cpp:2817
bool ConsistentWithG(Char ch)
Definition: utilities.cpp:2881
bool IsBioseqTSA(const CBioseq &seq, CScope *scope)
Definition: utilities.cpp:884
Definition: utilities.hpp:105
@ eAccessionFormat_too_long
Definition: utilities.hpp:110
@ eAccessionFormat_missing_version
Definition: utilities.hpp:111
@ eAccessionFormat_valid
Definition: utilities.hpp:106
@ eAccessionFormat_bad_version
Definition: utilities.hpp:112
@ eAccessionFormat_no_start_letters
Definition: utilities.hpp:107
@ eAccessionFormat_wrong_number_of_digits
Definition: utilities.hpp:108
@ eAccessionFormat_null
Definition: utilities.hpp:109
bool FindMatchInOrgRef(const string &str, const COrg_ref &org)
Definition: utilities.cpp:1985
int CheckDate(const CDate &date, bool require_full_date=false)
Definition: utilities.cpp:780
vector< TSeqPos > GetMismatches(const CSeq_feat &feat, const CBioseq_Handle &prot_handle, const string &transl_prot)
Definition: utilities.cpp:2658
bool g_IsMasterAccession(const CSeq_id &id)
Definition: utilities.cpp:2999
void FeatureHasEnds(const CSeq_feat &feat, CScope *scope, bool &no_beg, bool &no_end)
Definition: utilities.cpp:2546
string GetAccessionFromObjects(const CSerialObject *obj, const CSeq_entry *ctx, CScope &scope, int *version)
Definition: utilities.cpp:443
bool HasStopInProtein(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2519
size_t CountProteinStops(const CSeqVector &sv)
Definition: utilities.cpp:2504
bool s_PartialAtGapOrNs(CScope *scope, const CSeq_loc &loc, unsigned int tag, bool only_gap=false)
Definition: utilities.cpp:1176
void AdjustSpecificHostForTaxServer(string &spec_host)
Definition: utilities.cpp:2032
bool HasBadStartCodon(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2362
string TranslateCodingRegionForValidation(const CSeq_feat &feat, CScope &scope, bool &alt_start)
Definition: utilities.cpp:2266
static const char * str(char *buf, int n)
Definition: stats.c:84
Modified on Thu Dec 07 10:08:25 2023 by rev. 669887