NCBI C++ ToolKit
cleanup.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef CLEANUP___CLEANUP__HPP
2 #define CLEANUP___CLEANUP__HPP
3 
4 /* $Id: cleanup.hpp 102112 2024-04-02 18:07:29Z stakhovv $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Robert Smith, Michael Kornbluh
30  *
31  * File Description:
32  * Basic Cleanup of CSeq_entries.
33  * .......
34  *
35  */
36 #include <objmgr/scope.hpp>
38 #include <objects/seq/MolInfo.hpp>
39 #include <objects/pub/Pub.hpp>
42 
43 
46 
47 class CSeq_entry;
48 class CBioseq;
49 class CBioseq_set;
50 class CSeq_annot;
51 class CSeq_feat;
52 class CSeq_submit;
53 class COrgName;
54 class CSubmit_block;
55 class CAuthor;
56 class CAuth_list;
57 class CName_std;
58 
59 class CSeq_entry_Handle;
60 class CBioseq_Handle;
61 class CBioseq_set_Handle;
62 class CSeq_annot_Handle;
63 class CSeq_feat_Handle;
64 
65 class CCleanupChange;
66 class IObjtoolsListener;
67 
69 {
70 public:
71 
73  eClean_NoReporting = 0x1,
74  eClean_GpipeMode = 0x2,
75  eClean_NoNcbiUserObjects = 0x4,
76  eClean_SyncGenCodes = 0x8,
77  eClean_NoProteinTitles = 0x10,
78  eClean_KeepTopSet = 0x20,
79  eClean_KeepSingleSeqSet = 0x40,
80  eClean_InHugeSeqSet = 0x80,
81  };
82 
85  eScope_UseInPlace
86  };
87 
88  // Constructor / Destructor
89  CCleanup(CScope* scope = nullptr, EScopeOptions scope_handling = eScope_Copy);
90  CCleanup(const CCleanup&) = delete;
91  CCleanup& operator=(const CCleanup&) = delete;
92 
93  ~CCleanup();
94 
95  void SetScope(CScope* scope);
96 
98 
99  // BASIC CLEANUP
100 
101  TChanges BasicCleanup(CSeq_entry& se, Uint4 options = 0);
102  /// Cleanup a Seq-submit.
103  TChanges BasicCleanup(CSeq_submit& ss, Uint4 options = 0);
104  /// Cleanup a Bioseq_set.
105  TChanges BasicCleanup(CBioseq_set& bss, Uint4 options = 0);
106  /// Cleanup a Seq-Annot.
107  TChanges BasicCleanup(CSeq_annot& sa, Uint4 options = 0);
108  /// Cleanup a Seq-feat.
109  TChanges BasicCleanup(CSeq_feat& sf, Uint4 options = 0);
110  /// Cleanup a BioSource.
111  TChanges BasicCleanup(CBioSource& src, Uint4 options = 0);
112  // Cleanup a Submit-block
113  TChanges BasicCleanup(CSubmit_block& block, Uint4 options = 0);
114  // Cleanup descriptors
115  TChanges BasicCleanup(CSeqdesc& desc, Uint4 options = 0);
116  TChanges BasicCleanup(CSeq_descr & desc, Uint4 options = 0);
117 
118  // Handle versions.
119  TChanges BasicCleanup(CSeq_entry_Handle& seh, Uint4 options = 0);
120  TChanges BasicCleanup(CBioseq_Handle& bsh, Uint4 options = 0);
121  TChanges BasicCleanup(CBioseq_set_Handle& bssh, Uint4 options = 0);
122  TChanges BasicCleanup(CSeq_annot_Handle& sak, Uint4 options = 0);
123  TChanges BasicCleanup(CSeq_feat_Handle& sfh, Uint4 options = 0);
124 
125  // Extended Cleanup
126  /// Cleanup a Seq-entry.
127  TChanges ExtendedCleanup(CSeq_entry& se, Uint4 options = 0);
128  /// Cleanup a Seq-submit.
129  TChanges ExtendedCleanup(CSeq_submit& ss, Uint4 options = 0);
130  /// Cleanup a Seq-Annot.
131  TChanges ExtendedCleanup(CSeq_annot& sa, Uint4 options = 0);
132 
133  // Handle versions
134  static TChanges ExtendedCleanup(CSeq_entry_Handle& seh, Uint4 options = 0);
135 
136  // Useful cleanup functions
137 
138  static bool ShouldStripPubSerial(const CBioseq& bs);
139 
140 
141 /// Moves protein-specific features from nucleotide sequences in the Seq-entry to
142 /// the appropriate protein sequence.
143 /// @param seh Seq-entry Handle to edit [in]
144 /// @return Boolean return value indicates whether any changes were made
145  static bool MoveProteinSpecificFeats(CSeq_entry_Handle seh);
146 
147  /// Moves one feature from nucleotide bioseq to
148  /// the appropriate protein sequence.
149  /// @param fh Feature to edit
150  /// @return Boolean return value indicates whether any changes were made
151  static bool MoveFeatToProtein(CSeq_feat_Handle fh);
152 
153 /// Calculates whether a Gene-xref is unnecessary (because it refers to the
154 /// same gene as would be calculated using overlap)
155 /// @param sf Seq-feat with the xref [in]
156 /// @param scope Scope in which to search for location [in]
157 /// @param gene_xref Gene-ref of gene-xref [in]
158 /// @return Boolean return value indicates whether gene-xref is unnecessary
159  static bool IsGeneXrefUnnecessary(const CSeq_feat& sf, CScope& scope, const CGene_ref& gene_xref);
160 
161 /// Removes unnecessary Gene-xrefs
162 /// @param f Seq-feat to edit [in]
163 /// @param scope Scope in which to search for locations [in]
164 /// @return Boolean return value indicates whether gene-xrefs were removed
165  static bool RemoveUnnecessaryGeneXrefs(CSeq_feat& f, CScope& scope);
166 
167 /// Removes unnecessary Gene-xrefs on features in Seq-entry
168 /// @param seh Seq-entry-Handle to edit [in]
169 /// @return Boolean return value indicates whether gene-xrefs were removed
170  static bool RemoveUnnecessaryGeneXrefs(CSeq_entry_Handle seh);
171 
172 /// Removes non-suppressing Gene-xrefs
173 /// @param f Seq-feat to edit [in]
174 /// @return Boolean return value indicates whether gene-xrefs were removed
175  static bool RemoveNonsuppressingGeneXrefs(CSeq_feat& f);
176 
177 
178 /// Repairs non-reciprocal xref pairs for specified feature if xrefs between
179 /// subtypes are permitted and feature with missing xref does not have an
180 /// xref to a different feature of the same subtype
181 /// @param f Seq-feat to edit [in]
182 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair
183 /// @return Boolean return value indicates whether xrefs were created
184  static bool RepairXrefs(const CSeq_feat& f, const CTSE_Handle& tse);
185 
186 /// Repairs non-reciprocal xref pairs for specified feature pair if xrefs between
187 /// subtypes are permitted and feature with missing xref does not have an
188 /// xref to a different feature of the same subtype
189 /// @param src,dst the feature pair to check: src is a const Seq-feat, dst a Seq-feat handle [in]
190 /// @param tse top-level Seq-entry in which to search for the other half of the xref pair
191 /// @return Boolean return value indicates whether xrefs were created
192  static bool RepairXrefs(const CSeq_feat& src, CSeq_feat_Handle& dst, const CTSE_Handle& tse);
193 
194 /// Repairs non-reciprocal xref pairs in specified seq-entry
195 /// @param seh Seq-entry to edit [in]
196 /// @return Boolean return value indicates whether xrefs were created
197  static bool RepairXrefs(CSeq_entry_Handle seh);
198 
199 /// Detects gene features with matching locus
200 /// @param f Seq-feat parent feature of gene_xref [in]
201 /// @param gene_xref Gene-ref of gene-xref [in]
202 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in]
203 /// @return Boolean return value indicates whether a gene feature with matching locus has been found
204  static bool FindMatchingLocusGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh);
205 
206 /// Removes orphaned locus Gene-xrefs
207 /// @param f Seq-feat to edit [in]
208 /// @param bsh CBioseq_Handle in which to search for gene features [in]
209 /// @return Boolean return value indicates whether gene-xrefs were removed
210  static bool RemoveOrphanLocusGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh);
211 
212 /// Detects gene features with matching locus_tag
213 /// @param f Seq-feat parent feature of gene_xref [in]
214 /// @param gene_xref Gene-ref of gene-xref [in]
215 /// @param bsh CBioseq_Handle parent bioseq in which to search for genes [in]
216 /// @return Boolean return value indicates whether a gene feature with matching locus_tag has been found
217  static bool FindMatchingLocus_tagGene(CSeq_feat& f, const CGene_ref& gene_xref, CBioseq_Handle bsh);
218 
219 /// Removes orphaned locus_tag Gene-xrefs
220 /// @param f Seq-feat to edit [in]
221 /// @param bsh CBioseq_Handle in which to search for gene features [in]
222 /// @return Boolean return value indicates whether gene-xrefs were removed
223  static bool RemoveOrphanLocus_tagGeneXrefs(CSeq_feat& f, CBioseq_Handle bsh);
224 
225 /// Extends a location to the specified position.
226 /// @param loc Seq-loc to extend
227 /// @param pos position of new end of location
228 /// @param scope Scope in which to look for sequences
229 /// @return Boolean return value indicates whether the location was extended
230  static bool SeqLocExtend(CSeq_loc& loc, size_t pos, CScope& scope);
231 
232 
233 /// Extends a coding region up to 50 nt. if the coding region:
234 /// 1. does not end with a stop codon
235 /// 2. is adjacent to a stop codon
236 /// 3. is not pseudo
237 /// @param f Seq-feat to edit
238 /// @param bsh CBioseq_Handle on which the feature is located
239 /// @return Boolean return value indicates whether the feature was extended
240  static bool ExtendToStopIfShortAndNotPartial(CSeq_feat& f, CBioseq_Handle bsh, bool check_for_stop = true);
241 
242 /// Checks whether it is possible to extend the original location up to the improved one. It is possible only if
243 /// the original location is less than the improved one
244 /// @param orig Seq-loc to check
245 /// @param improved Seq-loc original location may be extended to
246 /// @return Boolean return value indicates whether the extension is possible
247  static bool LocationMayBeExtendedToMatch(const CSeq_loc& orig, const CSeq_loc& improved);
248 
249 /// Extends a feature up to limit nt to a stop codon, or to the end of the sequence
250 /// if limit == 0 (partial will be set if location extends to end of sequence but
251 /// no stop codon is found)
252 /// @param f Seq-feat to edit
253 /// @param bsh CBioseq_Handle on which the feature is located
254 /// @param limit maximum number of nt to extend, or 0 if unlimited
255 /// @return Boolean return value indicates whether the feature was extended
256  static bool ExtendToStopCodon(CSeq_feat& f, CBioseq_Handle bsh, size_t limit);
257  static bool ExtendStopPosition(CSeq_feat& f, const CSeq_feat* cdregion, size_t extension = 0);
258 
259 /// Translates coding region and selects best frame (without stops, or longest)
260 /// @param cds Coding region Seq-feat to edit
261 /// @param scope Scope in which to find coding region
262 /// @return Boolean return value indicates whether the coding region was changed
263  static bool SetBestFrame(CSeq_feat& cds, CScope& scope);
264 
265 /// Chooses best frame based on location
266 /// 1. If the location is 5' complete, then the frame must be one.
267 /// 2. If the location is 5' partial and 3' complete, select a frame using the
268 /// value of the location length modulo 3.
269 /// @param cdregion Coding Region in which to set frame
270 /// @param loc Location to use for setting frame
271 /// @param scope Scope in which to find location sequence(s)
272 /// @return Boolean return value indicates whether the frame was changed
273  static bool SetFrameFromLoc(CCdregion &cdregion, const CSeq_loc& loc, CScope& scope);
274  static bool SetFrameFromLoc(CCdregion::EFrame &frame, const CSeq_loc& loc, CScope& scope);
275 
276 /// 1. Set the partial flags when the CDS is partial and codon_start is 2 or 3
277 /// 2. Make the CDS partial at the 5' end if there is no start codon
278 /// 3. Make the CDS partial at the 3' end if there is no stop codon
279 /// @param cds Coding region Seq-feat to edit
280 /// @param scope Scope in which to find coding region and coding region's protein
281 /// product sequence
282 /// @return Boolean return value indicates whether the coding region changed
283  static bool SetCDSPartialsByFrameAndTranslation(CSeq_feat& cds, CScope& scope);
284 
285 
286 /// Clear internal partials
287  static bool ClearInternalPartials(CSeq_loc& loc, bool is_first = true, bool is_last = true);
288  static bool ClearInternalPartials(CSeq_loc_mix& mix, bool is_first = true, bool is_last = true);
289  static bool ClearInternalPartials(CPacked_seqint& pint, bool is_first = true, bool is_last = true);
290  static bool ClearInternalPartials(CSeq_entry_Handle seh);
291 
292 /// Set feature partial based on feature location
293  static bool SetFeaturePartial(CSeq_feat& f);
294 
295 /// Update EC numbers
296 /// @param ec_num_list Prot-ref ec number list to clean
297 /// @return Boolean value indicates whether any changes were made
298  static bool UpdateECNumbers(CProt_ref::TEc & ec_num_list);
299 
300 /// Delete EC numbers
301 /// @param ec_num_list Prot-ref ec number list to clean
302 /// @return Boolean value indicates whether any changes were made
303  static bool RemoveBadECNumbers(CProt_ref::TEc & ec_num_list);
304 
305 /// Fix EC numbers
306 /// @param entry Seq-entry-handle to clean
307 /// @return Boolean value indicates whether any changes were made
308  static bool FixECNumbers(CSeq_entry_Handle entry);
309 
310 /// Set partialness of gene to match longest feature contained in gene
311 /// @param gene Seq-feat to edit
312 /// @param scope Scope in which to find gene
313 /// @return Boolean return value indicates whether the gene changed
314  static bool SetGenePartialByLongestContainedFeature(CSeq_feat& gene, CScope& scope);
315 
316  static void SetProteinName(CProt_ref& prot, const string& protein_name, bool append);
317  static void SetProteinName(CSeq_feat& cds, const string& protein_name, bool append, CScope& scope);
318  static void SetMrnaName(CSeq_feat& mrna, const string& protein_name);
319  static const string& GetProteinName(const CProt_ref& prot);
320  static const string& GetProteinName(const CSeq_feat& cds, CSeq_entry_Handle seh);
321 
322 /// Sets MolInfo::tech for a sequence
323 /// @param seq Bioseq to edit
324 /// @param tech tech value to set
325 /// @return Boolean return value indicates whether tech was changed
326  static bool SetMolinfoTech(CBioseq_Handle seq, CMolInfo::ETech tech);
327 
328 /// Sets MolInfo::biomol for a sequence
329 /// @param seq Bioseq to edit
330 /// @param biomol biomol value to set
331 /// @return Boolean return value indicates whether biomol was changed
332  static bool SetMolinfoBiomol(CBioseq_Handle seq, CMolInfo::EBiomol biomol);
333 
334 
335 /// Adds missing MolInfo descriptor to sequence
336 /// @param seq Bioseq to edit
337 /// @return Boolean return value indicates whether descriptor was added
338  static bool AddMissingMolInfo(CBioseq& seq, bool is_product);
339 
340 /// Creates missing protein title descriptor
341 /// @param bsh Bioseq-Handle to edit
342 /// @return Boolean return value indicates whether title was added
343  static bool AddProteinTitle(CBioseq_Handle bsh);
344 
345 /// Removes NcbiCleanup User Objects in the Seq-entry
346 /// @param seq_entry Seq-entry to edit
347 /// @return Boolean return value indicates whether object was removed
348  static bool RemoveNcbiCleanupObject(CSeq_entry &seq_entry);
349 
350 /// Adds NcbiCleanup User Object to Seq-descr
351  static void AddNcbiCleanupObject(int ncbi_cleanup_version, CSeq_descr& descr);
352 
353 /// Looks up Org-refs in the Seq-entry
354 /// @param seh Seq-entry to edit
355 /// @return Boolean return value indicates whether object was updated
356  static bool TaxonomyLookup(CSeq_entry_Handle seh);
357 
358 
359 /// Sets genetic codes for coding regions on Bioseq-Handle
360 /// @param bsh Bioseq-Handle to examine
361 /// @return Boolean indicates whether any coding regions were updated
362  static bool SetGeneticCodes(CBioseq_Handle bsh);
363 
364 /// Adjusts protein title to reflect partialness
365 /// @param bioseq Bioseq to adjust
366 /// @return Boolean indicates whether title was updated
367  static bool AddPartialToProteinTitle(CBioseq &bioseq);
368 
369 /// Removes protein product from pseudo coding region
370 /// @param cds Seq-feat to adjust
371 /// @param scope Scope in which to find protein sequence and remove it
372 /// @return Boolean indicates whether anything changed
373  static bool RemovePseudoProduct(CSeq_feat& cds, CScope& scope);
374 
375  static CRef<CSeq_entry> AddProtein(const CSeq_feat& cds, CScope& scope);
376 
377 /// Expands gene to include features it cross-references
378 /// @param gene Seq-feat to adjust
379 /// @param tse Top-level Seq-entry in which to find other features
380 /// @return Boolean indicates whether anything changed
381  static bool ExpandGeneToIncludeChildren(CSeq_feat& gene, CTSE_Handle& tse);
382 
383 /// Performs WGS specific cleanup
384 /// @param entry Seq-entry to edit
385 /// @return Boolean return value indicates whether object was updated
386  static bool WGSCleanup(CSeq_entry_Handle entry, bool instantiate_missing_proteins = true, Uint4 options = 0,
387  bool run_extended_cleanup = true);
388 
389 /// For table2asn -c s
390 /// Adds an exception of "low-quality sequence region" to coding regions
391 /// and mRNAs that are not pseudo and have an intron <11bp in length
392 /// @param entry Seq-entry to edit
393 /// @return Boolean return value indicates whether object was updated
394  static bool AddLowQualityException(CSeq_entry_Handle entry);
395 
396 /// Normalize Descriptor Order on a specific Seq-descr
397 /// @param descr Seq-descr to edit
398 /// @return Boolean return value indicates whether object was updated
399  static bool NormalizeDescriptorOrder(CSeq_descr& descr);
400 
401 /// Normalize Descriptor Order on a specific Seq-entry
402 /// @param seh Seq-entry-Handle to edit
403 /// @return Boolean return value indicates whether object was updated
404  static bool NormalizeDescriptorOrder(CSeq_entry_Handle seh);
405 
406 /// Remove all titles in Seqdescr except the last, because it is the
407 /// only one that would be displayed in the flatfile
408 /// @param seq Bioseq-Handle to edit
409 /// @return Boolean return value indicates whether any titles were removed
410  static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSeq seq);
411 
412 /// Remove all titles in Seqdescr except the last, because it is the
413 /// only one that would be displayed in the flatfile
414 /// @param set Bioseq-set-Handle to edit
415 /// @return Boolean return value indicates whether any titles were removed
416  static bool RemoveUnseenTitles(CSeq_entry_EditHandle::TSet set);
417 
418 /// Add GenBank Wrapper Set
419 /// @param seh Seq-entry to edit
420 /// @return Boolean return value indicates whether object changed
421  static bool AddGenBankWrapper(CSeq_entry_Handle seh);
422 
423 
424 /// For Publication Citations
425 /// Get labels for a pubdesc. To be used in citations.
426  static void GetPubdescLabels
427  (const CPubdesc& pd,
428  vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
429  vector<string>& published_labels, vector<string>& unpublished_labels);
430 
431 /// Get list of pubs that can be used for citations for Seq-feat on a Bioseq-handle
432 /// @param bsh Bioseq-handle to search
433 /// @return vector<CConstRef<CPub> > ordered list of pubs
434 /// Note that Seq-feat.cit appear in the flatfile using the position
435 /// in the list
436  static vector<CConstRef<CPub> > GetCitationList(CBioseq_Handle bsh);
437 
438 /// Remove duplicate publications
439  static bool RemoveDuplicatePubs(CSeq_descr& descr);
440 
441  /// Some pubs should not be promoted to nuc-prot set from sequence
442  static bool OkToPromoteNpPub(const CPubdesc& pd);
443 
444  /// For some sequences, pubs should not be promoted to nuc-prot set from sequence
445  static bool OkToPromoteNpPub(const CBioseq& b);
446 
447  static bool PubAlreadyInSet(const CPubdesc& pd, const CSeq_descr& descr);
448 
449 /// Convert full-length publication features to publication descriptors.
450 /// @param seh Seq-entry to edit
451 /// @return bool indicates whether any changes were made
452  static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh);
453 
454 /// Rescue pubs from Site-ref features
455 /// @param seh Seq-entry to edit
456 /// @return bool indicates whether any changes were made
457  static bool RescueSiteRefPubs(CSeq_entry_Handle seh);
458 
459 /// Is this a "minimal" pub? (If yes, do not rescue from a Seq-feat.cit)
460  static bool IsMinPub(const CPubdesc& pd, bool is_refseq_prot);
461 
462  //helper function for moving feature to pubdesc descriptor
463  static void MoveOneFeatToPubdesc(CSeq_feat_Handle feat, CRef<CSeqdesc> d, CBioseq_Handle b, bool remove_feat = true);
464 
465 /// Remove duplicate biosource descriptors
466  static bool RemoveDupBioSource(CSeq_descr& descr);
467 
468 /// Get BioSource from feature to use for source descriptor
469  static CRef<CBioSource> BioSrcFromFeat(const CSeq_feat& f);
470 
471  static bool AreBioSourcesMergeable(const CBioSource& src1, const CBioSource& src2);
472  static bool MergeDupBioSources(CSeq_descr& descr);
473  static bool MergeDupBioSources(CBioSource& src1, const CBioSource& add);
474 
475 
476 /// Convert full-length source features to source descriptors
477 /// @param seh Seq-entry to edit
478 /// @return bool indicates whether any changes were made
479  static bool ConvertSrcFeatsToSrcDescs(CSeq_entry_Handle seh);
480 
481 /// Examine all genes and gene xrefs in the Seq-entry.
482 /// If no genes have locus and some have locus tag AND no gene xrefs have locus-tag
483 /// and some gene xrefs have locus, change all gene xrefs to use locus tag.
484 /// If no genes have locus tag and some have locus AND no gene xrefs have locus
485 /// and some gene xrefs have locus tag, change all gene xrefs to use locus.
486 /// @param seh Seq-entry to edit
487 /// @return bool indicates whether any changes were made
488  static bool FixGeneXrefSkew(CSeq_entry_Handle seh);
489 
490 /// Convert nuc-prot sets with just one sequence to just the sequence
491 /// can't be done during the explore phase because it changes a seq to a set
492 /// @param seh Seq-entry to edit
493 /// @return bool indicates whether any changes were made
494  static bool RenormalizeNucProtSets(CSeq_entry_Handle seh);
495 
496 /// decodes various tags, including carriage-return-line-feed constructs
497  static bool DecodeXMLMarkChanged(std::string & str);
498 
499  static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, CScope& scope);
500  static CRef<CSeq_loc> GetProteinLocationFromNucleotideLocation(const CSeq_loc& nuc_loc, const CSeq_feat& cds, CScope& scope, bool require_inframe = false);
501 
502 /// Find proteins that are not packaged in the same nuc-prot set as the
503 /// coding region for which they are a product, and move them to that
504 /// nuc-prot set. Ignore coding regions that are in gen-prod-sets.
505 /// @param seh Seq-entry to edit
506 /// @return bool indicates whether any changes were made
507  static bool RepackageProteins(CSeq_entry_Handle seh);
508  static bool RepackageProteins(const CSeq_feat& cds, CBioseq_set_Handle np);
509 
510  static bool ConvertDeltaSeqToRaw(CSeq_entry_Handle seh, CSeq_inst::EMol filter = CSeq_inst::eMol_not_set);
511 
512 /// Parse string into code break and add to coding region.
513 /// @param feat feature that contains coding region - necessary to determine codon boundaries
514 /// @param cds coding region to which code breaks will be added
515 /// @param str string from which to parse code break
516 /// @param scope scope in which to find sequences referenced (used for location comparisons)
517 /// @return bool indicates string was successfully parsed and code break was added
518  static bool ParseCodeBreak(const CSeq_feat& feat,
519  CCdregion& cds,
520  const CTempString& str,
521  CScope& scope,
522  IObjtoolsListener* pMessageListener=nullptr);
523 
524 /// Parses all valid transl_except Gb-quals into code-breaks for cdregion,
525 /// then removes the transl_except Gb-quals that were successfully parsed
526 /// @param feat feature that contains coding region
527 /// @param scope scope in which to find sequences referenced (used for location comparisons)
528 /// @return bool indicates changes were made
529  static bool ParseCodeBreaks(CSeq_feat& feat, CScope& scope);
530 
531  static size_t MakeSmallGenomeSet(CSeq_entry_Handle entry);
532 
533 /// From SQD-4329
534 /// For each sequence with a source that has an IRD db_xref, create a misc_feature
535 /// across the entire span and move the IRD db_xref from the source to the misc_feature.
536 /// Create a suppressing gene xref for the misc_feature.
537 /// @param entry Seq-entry on which to search for sources and create features
538 /// @return bool indicates changes were made
539  static bool MakeIRDFeatsFromSourceXrefs(CSeq_entry_Handle entry);
540 
541 /// From GB-7563
542 /// An action has been requested that will do the following:
543 /// 1. This action should be limited to protein sequences where the product
544 /// is an exact match to a specified text (the usual string constraint
545 /// is not needed).
546 /// 2. Protein sequences for which the coding region is 5' partial should
547 /// not be affected.
548 /// 3. When the protein name matches, the following actions should be taken
549 /// if and only if the first amino acid of the protein sequence is not
550 /// M (methionine):
551 /// a. The first amino acid of the protein sequence should be changed to
552 /// methionine.
553 /// b. The coding region should have the text "RNA editing" added to
554 /// Seq-feat.except_text (separated from any existing text by a semicolon).
555 /// If Seq-feat.except is not already true, it should be set to true.
556 /// c. A code-break should be added to Cdregion.code-break where the
557 /// Code-break.loc is the location of the first codon of the coding region
558 /// and Code-break.aa is ncbieaa 'M' (Indexers will refer to "code-breaks"
559 /// as "translation exceptions" because these appear in the flatfile as a
560 /// /transl_except qualifier).
561 ///
562 /// It will be the responsibility of the caller to only invoke this function
563 /// for coding regions where the product name is a match, and the protein sequence
564 /// does not already start with an M.
565 
566  static bool FixRNAEditingCodingRegion(CSeq_feat& cds);
567 
568  /// utility function for setting code break location given offset
569  /// pos is the position of the amino acid where the translation exception
570  /// occurs (starts with 1)
571  static void SetCodeBreakLocation(CCode_break& cb, size_t pos, const CSeq_feat& cds);
572 
573  static bool IsMethionine(const CCode_break& cb);
574 
575  /// utility function for finding the code break for a given amino acid position
576  /// pos is the position of the amino acid where the translation exception
577  /// occurs (starts with 1)
578  static CConstRef<CCode_break> GetCodeBreakForLocation(size_t pos, const CSeq_feat& cds);
579 
580  // From the request in GB-7166, we want to be able to move /gene
581  // qualifiers that have been added to the coding region but not the
582  // parent gene to the parent gene.
583  // If the coding region also has /locus_tag qualifier which is different
584  // from the one on the parent gene features, do not move the qualifier.
585  // If there are two coding regions that are mapped to the same gene,
586  // do not move the qualifier.
587  static bool NormalizeGeneQuals(CSeq_feat& cds, CSeq_feat& gene);
588  static bool NormalizeGeneQuals(CBioseq_Handle bsh);
589  static bool NormalizeGeneQuals(CSeq_entry_Handle seh);
590  typedef pair<CSeq_feat_Handle, CSeq_feat_Handle> TFeatGenePair; // by convention, cds first, gene second
591  static vector<TFeatGenePair> GetNormalizableGeneQualPairs(CBioseq_Handle bsh);
592 
593  // This function is used to do generic string cleanup on User-object string fields
594  // and apply specific cleanups to known types of User-object
595  static bool CleanupUserObject(CUser_object& object);
596 
597  // for cleaning up authors, lists of authors, and affiliation
598  static bool CleanupAuthor(CAuthor& author, bool fix_initials = true);
599  static bool CleanupAuthList(CAuth_list& al, bool fix_initials = true);
600  static void ResetAuthorNames(CAuth_list::TNames& names);
601  static bool CleanupAffil(CAffil& af);
602  static bool IsEmpty(const CAuth_list::TAffil& affil);
603 
604  // for cleaning up collection-date subsource qualifiers
605  static bool CleanupCollectionDates(CSeq_entry_Handle seh, bool month_first);
606 
607  static void AutodefId(CSeq_entry_Handle seh);
608 
609  // for finding the correct amino acid letter given an abbreviation
610  static char ValidAminoAcid(string_view abbrev);
611 
612 private:
614 
615  static bool x_CleanupUserField(CUser_field& field);
616 
617  static bool x_MergeDupOrgNames(COrgName& on1, const COrgName& add);
618  static bool x_MergeDupOrgRefs(COrg_ref& org1, const COrg_ref& add);
619 
620  static bool x_HasShortIntron(const CSeq_loc& loc, size_t min_len = 11);
621  static bool x_AddLowQualityException(CSeq_feat& feat);
622  static bool x_AddLowQualityException(CSeq_entry_Handle entry, CSeqFeatData::ESubtype subtype);
623 
624  static bool s_IsProductOnFeat(const CSeq_feat& cds);
625  static void s_SetProductOnFeat(CSeq_feat& feat, const string& protein_name, bool append);
626 
627  static bool s_CleanupGeneOntology(CUser_object& obj);
628  static bool s_CleanupStructuredComment(CUser_object& obj);
629  static bool s_RemoveEmptyFields(CUser_object& obj);
630  static bool s_CleanupGenomeAssembly(CUser_object& obj);
631  static bool s_CleanupDBLink(CUser_object& obj);
632  static bool s_AddNumToUserField(CUser_field &field);
633 
634  static bool s_CleanupNameStdBC(CName_std& name, bool fix_initials);
635  static void s_ExtractSuffixFromInitials(CName_std& name);
636  static void s_FixEtAl(CName_std& name);
637 
638  // for cleaning pubdesc
639  static bool s_Flatten(CPub_equiv& pub_equiv);
640 };
641 
642 
643 
646 
647 #endif /* CLEANUP___CLEANUP__HPP */
User-defined methods of the data storage class.
static CRef< CSeq_loc > ExtendToStopCodon(CRef< CSeq_feat > feat, CScope *scope)
@Affil.hpp User-defined methods of the data storage class.
Definition: Affil.hpp:56
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CAuthor –.
Definition: Author.hpp:59
CBioseq_EditHandle –.
CBioseq_Handle –.
CBioseq_set_EditHandle –.
CBioseq_set_Handle –.
CCdregion –.
Definition: Cdregion.hpp:66
EScopeOptions
Definition: cleanup.hpp:83
@ eScope_Copy
Definition: cleanup.hpp:84
EValidOptions
Definition: cleanup.hpp:72
CCleanup(const CCleanup &)=delete
static bool s_Flatten(CPub_equiv &pub_equiv)
pair< CSeq_feat_Handle, CSeq_feat_Handle > TFeatGenePair
Definition: cleanup.hpp:590
CRef< CScope > m_Scope
Definition: cleanup.hpp:613
CCleanup & operator=(const CCleanup &)=delete
CCode_break –.
Definition: Code_break.hpp:66
CConstRef –.
Definition: ncbiobj.hpp:1266
@Name_std.hpp User-defined methods of the data storage class.
Definition: Name_std.hpp:56
CObject –.
Definition: ncbiobj.hpp:180
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CScope –.
Definition: scope.hpp:92
CSeq_annot_Handle –.
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSubmit_block –.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Definition: set.hpp:45
static const struct name_t names[]
static void DLIST_NAME() append(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:78
static const char * str(char *buf, int n)
Definition: stats.c:84
static FILE * f
Definition: readconf.c:23
string
Definition: cgiapp.hpp:690
string GetProteinName(const CBioseq_Handle &seq)
Return protein name from corresponding Prot-ref feature.
Definition: sequence.cpp:356
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_CLEANUP_EXPORT
Definition: ncbi_export.h:953
list< string > TEc
Definition: Prot_ref_.hpp:110
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
void GetPubdescLabels(const objects::CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
CRef< CSeq_loc > SeqLocExtend(const CSeq_loc &loc, size_t pos, CScope *scope)
Definition: loc_edit.cpp:546
Modified on Fri Sep 20 14:57:56 2024 by modify_doxy.py rev. 669887