1 /* $Id: utilities.hpp 101299 2023-11-28 18:18:38Z stakhovv $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  * Definition for utility classes and functions.
30  */
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistr.hpp>
41 //#include <objects/taxon3/T3Data.hpp>
42 //#include <objects/taxon3/Taxon3_reply.hpp>
46 #include <objmgr/seq_vector.hpp>
48 #include <serial/iterator.hpp>
50 #include <vector>
51 #include <list>
57 class CGb_qual;
58 class CScope;
59 class CSeq_entry;
60 class CT3Data;
61 class CT3Reply;
63 BEGIN_SCOPE(validator)
66 // =============================================================================
67 // Functions
68 // =============================================================================
70 bool IsClassInEntry(const CSeq_entry& se, CBioseq_set::EClass clss);
71 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope);
72 bool IsBlankStringList(const list< string >& str_list);
73 TGi GetGIForSeqId(const CSeq_id& id);
77 CSeqVector GetSequenceFromLoc(const CSeq_loc& loc, CScope& scope,
82  bool product = false);
84 string GetSequenceStringFromLoc(const CSeq_loc& loc, CScope& scope);
/// Classifies a raw sequence byte as a residue or a non-residue value.
///
/// @param residue  Byte value to classify.
/// @return true when the value lies in 0..250 inclusive, false for 251..255.
// NOTE(review): values above 250 are presumably reserved for non-residue
// markers (gaps, invalid codes) -- confirm against the callers.
inline
bool IsResidue(unsigned char residue)
{
    const unsigned char kHighestResidueCode = 250;
    return !(residue > kHighestResidueCode);
}
91 string GetAccessionFromBioseq(const CBioseq& bioseq, int* version);
92 string GetAccessionFromBioseqSet(const CBioseq_set& bsst, int* version);
105 typedef enum {
114 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version);
116 bool s_IdXrefsAreReciprocal(const CSeq_feat &cds, const CSeq_feat &mrna);
117 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2);
118 bool s_StringHasPMID (const string& str);
119 bool HasBadCharacter (const string& str);
120 bool EndsWithBadCharacter (const string& str);
122 typedef enum {
132 int CheckDate (const CDate& date, bool require_full_date = false);
133 bool NCBI_VALIDATOR_EXPORT IsDateInPast(const CDate& date);
134 string GetDateErrorDescription (int flags);
136 bool IsBioseqTSA (const CBioseq& seq, CScope* scope);
138 #if 0
139 // disabled for now
140 bool IsNCBIFILESeqId (const CSeq_id& id);
141 #endif
143 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope);
144 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context);
145 string GetBioseqIdLabel(const CBioseq& sq);
147 bool NCBI_VALIDATOR_EXPORT HasECnumberPattern (const string& str);
149 bool SeqIsPatent (const CBioseq& seq);
150 bool SeqIsPatent (const CBioseq_Handle& seq);
152 bool s_PartialAtGapOrNs(CScope* scope, const CSeq_loc& loc, unsigned int tag, bool only_gap = false);
156 typedef enum {
164 (const CBioseq_Handle& bsh,
165  EBioseqEndIsType& begin_n,
166  EBioseqEndIsType& begin_gap,
167  EBioseqEndIsType& end_n,
168  EBioseqEndIsType& end_gap,
169  bool &begin_ambig,
170  bool &end_ambig);
172 bool ShouldCheckForNsAndGap(const CBioseq_Handle& bsh);
175 (const CSeqVector& vec,
176 EBioseqEndIsType& begin_n,
177 EBioseqEndIsType& begin_gap,
178 EBioseqEndIsType& end_n,
179 EBioseqEndIsType& end_gap,
180 bool& begin_ambig,
181 bool& end_ambig);
185 /// Indicates whether feature is a dicistronic gene
186 /// @param f Seq-feat-Handle [in]
187 /// @return Boolean
191 typedef enum {
201 /// Reports how two features duplicate each other
202 /// @param f1 Seq-feat-Handle [in]
203 /// @param f2 Seq-feat-Handle [in]
204 /// @return EDuplicateFeatureType return value indicates how features are duplicates
206  (const CSeq_feat_Handle& f1,
207  const CSeq_feat_Handle& f2,
208  bool check_partials = false,
209  bool case_sensitive = false);
211 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh);
212 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2);
214 // specific-host functions
215 /// returns true and error_msg will be empty, if specific host is valid
216 /// returns true and error_msg will be "Host is empty", if specific host is empty
217 /// returns false if specific host is invalid
218 bool NCBI_VALIDATOR_EXPORT IsSpecificHostValid(const string& host, string& error_msg);
219 /// returns the corrected specific host, if the specific host is invalid and can be corrected
220 /// returns an empty string, if the specific host is invalid and cannot be corrected
221 /// returns the original value with leading/trailing spaces removed, if the specific host is valid
222 string NCBI_VALIDATOR_EXPORT FixSpecificHost(const string& host);
224 bool NCBI_VALIDATOR_EXPORT IsCommonName (const CT3Data& data);
226 bool NCBI_VALIDATOR_EXPORT FindMatchInOrgRef (const string& str, const COrg_ref& org);
228 bool NCBI_VALIDATOR_EXPORT IsLikelyTaxname(const string& val);
229 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host = kEmptyStr);
232 // Converts a publication title into the 'term' parameter of the CEutilsClient::Search method
233 void NCBI_VALIDATOR_EXPORT ConvertToEntrezTerm(string& title);
235 string NCBI_VALIDATOR_EXPORT TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start);
237 // if special text is found in a feature exception, translation errors will not be reported
238 bool NCBI_VALIDATOR_EXPORT ReportTranslationErrors(const string& except_text);
240 // checks to see if this feature would be reported as having a bad start codon
241 bool NCBI_VALIDATOR_EXPORT HasBadStartCodon(const CSeq_feat& feat, CScope &scope, bool ignore_exceptions);
243 // checks to see if this location and translation has a bad start codon
244 // note that this might not be reported if the feature is pseudo, or has an appropriate exception
245 bool NCBI_VALIDATOR_EXPORT HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot);
247 size_t CountInternalStopCodons(const string& transl_prot);
248 bool NCBI_VALIDATOR_EXPORT HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions);
251 bool HasBadProteinStart(const CSeqVector& sv);
254 size_t CountProteinStops(const CSeqVector& sv);
255 bool NCBI_VALIDATOR_EXPORT HasStopInProtein(const CSeq_feat& feat, CScope& scope);
257 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end);
258 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far);
259 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot);
260 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot);
261 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len);
262 bool NCBI_VALIDATOR_EXPORT HasNoStop(const CSeq_feat& feat, CScope* scope);
264 bool NCBI_VALIDATOR_EXPORT IsSequenceFetchable(const CSeq_id& id, CScope* scope = nullptr);
265 bool NCBI_VALIDATOR_EXPORT IsSequenceFetchable(const string& seq_id, CScope* scope = nullptr);
269 bool IsNTNCNWACAccession(const string& acc);
270 bool IsNTNCNWACAccession(const CSeq_id& id);
271 bool IsNTNCNWACAccession(const CBioseq& seq);
272 bool IsNG(const CSeq_id& id);
273 bool IsNG(const CBioseq& seq);
275 bool IsTemporary(const CSeq_id& id);
277 bool IsOrganelle(int genome);
283 bool ConsistentWithA(Char ch);
284 bool ConsistentWithC(Char ch);
285 bool ConsistentWithG(Char ch);
286 bool ConsistentWithT(Char ch);
290 END_SCOPE(validator)
294 #endif /* VALIDATOR___UTILITIES__HPP */
