NCBI C++ ToolKit
sequence.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef SEQUENCE__HPP
2 #define SEQUENCE__HPP
3 
4 /* $Id: sequence.hpp 99963 2023-05-24 19:16:13Z foleyjp $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Clifford Clausen & Aaron Ucko
30 *
31 * File Description:
32 * Sequence utilities requiring CScope
33 * Obtains or constructs a sequence's title. (Corresponds to
34 * CreateDefLine in the C toolkit.)
35 */
36 
37 #include <corelib/ncbistd.hpp>
38 #include <serial/serial.hpp>
39 #include <serial/objistr.hpp>
40 #include <serial/objostr.hpp>
41 
47 #include <util/strsearch.hpp>
49 #include <util/range_coll.hpp>
50 #include <objmgr/feat_ci.hpp>
51 #include <objmgr/bioseq_ci.hpp>
52 #include <objmgr/scope.hpp>
55 
58 
// Forward declarations
class CSeq_id;
class CSeq_loc_mix;
class CSeq_point;
class CPacked_seqpnt;
class CBioseq_Handle;
class CSeq_loc_Mapper;
class CSeqVector;
class CCdregion;
class CSeq_feat;
class CSeq_entry;
// NOTE(review): spelled with a lowercase 'h', whereas CFastaOstream::Write in
// this header takes a CSeq_entry_Handle. The name is kept unchanged so that
// nothing referring to it breaks -- confirm whether the spelling is intended.
class CSeq_entry_handle;
class CGenetic_code;
class CMolInfo;
class CSeq_gap;
class COrgMod;
75 
76 BEGIN_SCOPE(sequence)
77 
78 /** @addtogroup ObjUtilSequence
79  *
80  * @{
81  */
82 
83 
84 /** @name SeqIdConv
85  * Conversions between seq-id types
86  * @{
87  */
88 
89 
91  eWithAccessionVersion, ///< accession.version (when possible)
92  eWithoutAccessionVersion ///< accession only, even if version is available
93 };
94 
95 
96 /// Retrieve a particular seq-id from a given bioseq handle. This uses
97 /// CSynonymsSet internally to decide which seq-id should be used.
99  eGetId_ForceGi = 0x0000, ///< return only a gi-based seq-id
100  eGetId_ForceAcc = 0x0001, ///< return only an accession based seq-id
101  eGetId_Best = 0x0002, ///< return the "best" gi (uses FindBestScore(),
102  ///< with CSeq_id::CalculateScore() as the score
103  ///< function
104  eGetId_HandleDefault = 0x0003, ///< returns the ID associated with a bioseq-handle
105 
106  eGetId_Seq_id_Score = 0x0004, ///< use CSeq_id::Score() as the scoring function
107  eGetId_Seq_id_BestRank = 0x0005, ///< use CSeq_id::BestRank() as the scoring function
108  eGetId_Seq_id_WorstRank = 0x0006, ///< use CSeq_id::WorstRank() as the scoring function
109  eGetId_Seq_id_FastaAARank = 0x0007, ///< use CSeq_id::FastaAARank() as the scoring function
110  eGetId_Seq_id_FastaNARank = 0x0008, ///< use CSeq_id::FastaNARank() as the scoring function
111 
112  ///< "canonical" here means "most specific"; this differs from "best" in
113  ///< that "best" is intended for display purposes
115 
116  eGetId_TypeMask = 0x00FF, ///< Mask for requested id type
117 
118  /// Check if the seq-id is present in the scope
119  eGetId_VerifyId = 0x0100,
120 
121  /// Throw exception on errors. If not set, an empty value is returned.
123 
125 };
126 typedef int EGetIdType;
127 
128 
129 /// Given an accession string, retrieve the corresponding GI.
130 /// If no GI is found, returns 0 or throws CSeqIdFromHandleException,
131 /// depending on the flags.
132 /// The id-type part of the flags is ignored; only the VerifyId and
133 /// ThrowOnError flags are checked.
/// @param acc   Accession string to look up.
/// @param scope Scope supplied by the caller
///              (NOTE(review): presumably used to resolve the accession -- confirm).
/// @param flags EGetIdType flag value; defaults to 0.
/// @return The GI, or 0 when none was found and no exception was requested.
135 TGi GetGiForAccession(const string& acc,
136  CScope& scope,
137  EGetIdType flags = 0);
138 
139 /// Retrieve the accession for a given GI.
140 /// If no accession was found returns an empty string or throws
141 /// CSeqIdFromHandleException depending on the flags.
142 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
143 /// flags are checked.
145 string GetAccessionForGi(TGi gi,
146  CScope& scope,
148  EGetIdType flags = 0);
149 
150 /// Given a Seq-id, retrieve the corresponding GI.
151 /// If no GI is found, returns 0 or throws CSeqIdFromHandleException,
152 /// depending on the flags.
153 /// The id-type part of the flags is ignored; only the VerifyId and
154 /// ThrowOnError flags are checked.
/// @param id    Seq-id to look up.
/// @param scope Scope supplied by the caller
///              (NOTE(review): presumably used to resolve the id -- confirm).
/// @param flags EGetIdType flag value; defaults to 0.
/// @return The GI, or 0 when none was found and no exception was requested.
156 TGi GetGiForId(const objects::CSeq_id& id,
157  CScope& scope,
158  EGetIdType flags = 0);
159 
160 /// Retrieve the accession string for a Seq-id.
161 /// If no accession was found returns an empty string or throws
162 /// CSeqIdFromHandleException depending on the flags.
163 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
164 /// flags are checked.
166 string GetAccessionForId(const objects::CSeq_id& id,
167  CScope& scope,
169  EGetIdType flags = 0);
170 
171 /// Return a selected ID type for a given bioseq handle. This function
172 /// will try to use the most efficient method possible to determine which
173 /// ID fulfills the requested parameter. This version will call
174 /// sequence::GetId() with the bioseq handle's seq-id.
175 ///
176 /// @param id Source id to evaluate
177 /// @param scope Scope for seq-id resolution.
178 /// @param type Type of ID to return
179 /// @return A requested seq-id.
180 /// Depending on the flags set in 'type' this function can verify
181 /// if the requested ID exists in the scope and throw
182 /// CSeqIdFromHandleException if the request cannot be satisfied.
184 CSeq_id_Handle GetId(const CBioseq_Handle& handle,
186 
187 /// Return a selected ID type for a seq-id. This function
188 /// will try to use the most efficient method possible to determine which
189 /// ID fulfills the requested parameter. The following logic is used:
190 ///
191 /// - For seq-id type eGetId_HandleDefault, the original seq-id is returned.
192 /// This satisfies the condition of returning a bioseq-handle's seq-id if
193 /// sequence::GetId() is applied to a CBioseq_Handle.
194 ///
195 /// - For seq-id type eGetId_ForceAcc, the returned set of seq-ids will first
196 /// be evaluated for a "best" id (which, given seq-id scoring, will be
197 /// a textseq-id if one exists). If the returned best ID is a textseq-id,
198 /// this id will be returned. Otherwise, an exception is thrown or an
199 /// empty handle returned.
200 ///
201 /// - For seq-id type eGetId_ForceGi, the returned set of IDs is scanned for
202 /// an ID of type gi. If this is found, it is returned; otherwise, an
203 /// exception is thrown or an empty handle returned. If the supplied ID is
204 /// already a gi and eGetId_VerifyId flag is not set, no work is done.
205 ///
206 /// @param id Source id to evaluate
207 /// @param scope Scope for seq-id resolution.
208 /// @param type Type of ID to return
209 /// @return A requested seq-id.
210 /// Depending on the flags set in 'type' this function can verify
211 /// if the requested ID exists in the scope and throw
212 /// CSeqIdFromHandleException if the request cannot be satisfied.
214 CSeq_id_Handle GetId(const CSeq_id& id, CScope& scope,
216 
217 /// Return a selected ID type for a seq-id handle.
218 /// Arguments (except 'id') and behavior is the same as of
219 /// GetId(const CSeq_id& id, ...).
221 CSeq_id_Handle GetId(const CSeq_id_Handle& id, CScope& scope,
223 
224 /// Return a selected ID type from a set of Seq-ids
225 /// Arguments (except 'id') and behavior is the same as of
226 /// GetId(const CSeq_id& id, ...).
230 
231 /// Return a selected ID type from a Bioseq
232 /// Arguments (except 'seq') and behavior is the same as of
233 /// GetId(const CBioseq_Handle& seq, ...).
235 CSeq_id_Handle GetId(const CBioseq& seq,
237 
238 /* @} */
239 
240 
241 /** @name FindLatestSequence
242  * Walk the replace history to find the latest revision of a sequence
243  * @{
244  */
245 
246 /// Given a seq-id check its replace history and try to find the latest
247 /// revision. The function stops and returns NULL if it detects some
248 /// strange conditions like an infinite recursion. If the bioseq
249 /// contains no history information, the original id is returned.
252 
255 
256 /// Check replace history up to the specified date. Returns the latest
257 /// bioseq up to the date or the original id if the bioseq contains no
258 /// history or is already newer than the specified date.
261  CScope& scope,
262  const CTime& tlim);
263 
266  CScope& scope,
267  const CTime& tlim);
268 
269 /* @} */
270 
271 
272 /** @name GetTitle
273  * Get sequence's title (used in various flat-file formats.)
274  * Deprecated in favor of CDeflineGenerator.
275  * @{
276  */
277 
278 /// This function is here rather than in CBioseq because it may need
279 /// to inspect other sequences. The reconstruct flag indicates that it
280 /// should ignore any existing title Seqdesc.
282  fGetTitle_Reconstruct = 0x1, ///< ignore existing title Seqdesc.
283  fGetTitle_Organism = 0x2, ///< append [organism]
284  fGetTitle_AllProteins = 0x4, ///< normally just names the first
285  fGetTitle_NoExpensive = 0x8 ///< skip potential expensive operations
286 };
287 typedef int TGetTitleFlags;
288 
/// Get the title of the sequence behind @a hnd; @a flags is a TGetTitleFlags
/// value (default 0).
/// NOTE(review): the enclosing GetTitle group is documented as deprecated in
/// favor of CDeflineGenerator.
290 string GetTitle(const CBioseq_Handle& hnd, TGetTitleFlags flags = 0);
/// Overload taking a CBioseq plus an output pointer.
/// NOTE(review): presumably stores the title in *title_ptr and reports success
/// through the bool result -- confirm against the implementation.
292 bool GetTitle(const CBioseq& seq, string* title_ptr, TGetTitleFlags flags = 0);
293 
294 /* @} */
295 
296 
297 /** @name Source and Product
298  * Mapping locations through features
299  * @{
300  */
301 
/// Bit flags that are combined into a TS2PFlags value.
enum ES2PFlags {
    fS2P_NoMerge  = 0x1, ///< don't merge adjacent intervals on the product
    fS2P_AllowTer = 0x2  ///< map the termination codon as a legal location
};
typedef int TS2PFlags; ///< binary OR of ES2PFlags
307 
310  const CSeq_loc& source_loc, TS2PFlags flags = 0,
311  CScope* scope = 0, int* frame = 0);
312 
/// Bit flags that are combined into a TP2SFlags value.
enum EP2SFlags {
    fP2S_Extend = 0x1 ///< if hitting ends, extend to include partial codons
};
typedef int TP2SFlags; ///< binary OR of EP2SFlags
317 
/// Counterpart of the source-to-product mapping in this group, taking a
/// location on the product (@a prod_loc) and the feature (@a feat).
/// NOTE(review): presumably returns the corresponding location on the
/// feature's source sequence -- confirm against the implementation.
/// @param feat     Feature to map through.
/// @param prod_loc Location given on the product.
/// @param flags    Binary OR of EP2SFlags; defaults to 0.
/// @param scope    Optional scope; defaults to a null pointer.
319 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc,
320  TP2SFlags flags = 0, CScope* scope = 0);
321 
322 /* @} */
323 
324 
325 /** @name Overlapping
326  * Searching for features
327  * @{
328  */
329 
331  /// requires explicit association, rather than analysis based on overlaps
333 
334  /// don't perform any expensive tests, such as ones that require fetching
335  /// additional sequences
337 
338  /// favor longer features over shorter features
340 
341  /// Pay no attention to strands when finding the best feat. This may be
342  /// useful for, e.g., trans-spliced genes.
344 
345  /// default options: do everything
347 };
348 typedef int TBestFeatOpts;
349 
350 
351 /// Storage for features and scores.
/// A single entry: an Int8 score paired with a const reference to the feature.
352 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore;
/// A vector of scored features.
353 typedef vector<TFeatScore> TFeatScores;
354 
355 // To avoid putting custom logic into the GetOverlappingFeatures
356 // function, we allow plugins
358 public:
360  virtual void processSAnnotSelector(
361  SAnnotSelector &sel ) = 0;
362 
363  virtual void setUpFeatureIterator (
364  CBioseq_Handle &bioseq_handle,
365  unique_ptr<CFeat_CI> &feat_ci,
366  TSeqPos circular_length ,
368  const CSeq_loc& loc,
369  SAnnotSelector &sel,
370  CScope &scope,
371  ENa_strand &strand ) = 0;
372 
373  virtual void processLoc(
374  CBioseq_Handle &bioseq_handle,
375  CRef<CSeq_loc> &loc,
376  TSeqPos circular_length ) = 0;
377 
378  virtual void processMainLoop(
379  bool &shouldContinueToNextIteration,
380  CRef<CSeq_loc> &cleaned_loc_this_iteration,
381  CRef<CSeq_loc> &candidate_feat_loc,
382  EOverlapType &overlap_type_this_iteration,
383  bool &revert_locations_this_iteration,
384  CBioseq_Handle &bioseq_handle,
385  const CMappedFeat &feat,
386  TSeqPos circular_length,
387  SAnnotSelector::EOverlapType annot_overlap_type ) = 0;
388 
389  virtual void postProcessDiffAmount(
390  Int8 &cur_diff,
391  CRef<CSeq_loc> &cleaned_loc_this_iteration,
392  CRef<CSeq_loc> &candidate_feat_loc,
393  CScope &scope,
394  SAnnotSelector &sel,
395  TSeqPos circular_length ) = 0;
396 };
397 
398 /// Find all features overlapping the location. Features and corresponding
399 /// scores are stored in the 'feats' vector. The scores are calculated as
400 /// difference between the input location and each feature's location.
401 /// NOTE: 'overlap_type' defines how the location must be related to the feature.
402 /// For eOverlap_Subset, eOverlap_SubsetRev, eOverlap_CheckIntervals,
403 /// eOverlap_CheckIntRev and eOverlap_Interval the relationship is
404 /// reversed. E.g. with eOverlap_Contains, the location will contain
405 /// the feature, but with eOverlap_Subset the feature will be defined
406 /// on a subset of the location.
408 void GetOverlappingFeatures(const CSeq_loc& loc,
409  CSeqFeatData::E_Choice feat_type,
410  CSeqFeatData::ESubtype feat_subtype,
411  EOverlapType overlap_type,
412  TFeatScores& feats,
413  CScope& scope,
414  const TBestFeatOpts opts = 0,
416 
417 
418 /// See the note above on 'overlap_type' meaning.
421  CSeqFeatData::E_Choice feat_type,
422  EOverlapType overlap_type,
423  CScope& scope,
426 /// See the note above on 'overlap_type' meaning.
429  CSeqFeatData::ESubtype feat_type,
430  EOverlapType overlap_type,
431  CScope& scope,
434 
437 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
438  CScope& scope,
441 
444 GetBestGeneForCds(const CSeq_feat& cds_feat,
445  CScope& scope,
448 
451 GetBestMrnaForCds(const CSeq_feat& cds_feat,
452  CScope& scope,
455 
458 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
459  CScope& scope,
462 
464 void GetMrnasForGene(const CSeq_feat& gene_feat,
465  CScope& scope,
466  list< CConstRef<CSeq_feat> >& mrna_feats,
469 
471 void GetCdssForGene(const CSeq_feat& gene_feat,
472  CScope& scope,
473  list< CConstRef<CSeq_feat> >& cds_feats,
476 
477 /////////////////////////////////////////////////////////////////////////////
478 // Versions of functions with lookup by feature id
481 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
482  const CTSE_Handle& tse,
485 
488 GetBestGeneForCds(const CSeq_feat& cds_feat,
489  const CTSE_Handle& tse,
492 
495 GetBestMrnaForCds(const CSeq_feat& cds_feat,
496  const CTSE_Handle& tse,
499 
502 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
503  const CTSE_Handle& tse,
506 
508 void GetMrnasForGene(const CSeq_feat& gene_feat,
509  const CTSE_Handle& tse,
510  list< CConstRef<CSeq_feat> >& mrna_feats,
513 
515 void GetCdssForGene(const CSeq_feat& gene_feat,
516  const CTSE_Handle& tse,
517  list< CConstRef<CSeq_feat> >& cds_feats,
520 
524  CSeqFeatData::E_Choice feat_type,
525  sequence::EOverlapType overlap_type,
526  CScope& scope,
529 
533  CSeqFeatData::ESubtype feat_type,
534  sequence::EOverlapType overlap_type,
535  CScope& scope,
538 
541 
542 
543 /// Get the best overlapping feature for a SNP (variation) feature
544 /// @param snp_feat
545 /// SNP feature object
546 /// @param type
547 /// type of overlapping feature
548 /// @param scope
549 /// @param search_both_strands
550 /// search is performed on both strands, starting with the one specified
551 /// by the feature's location.
552 /// @return
553 /// the overlapping feature or NULL if not found
557  CScope& scope,
558  bool search_both_strands = true);
559 
560 /// Get the best overlapping feature for a SNP (variation)
561 /// @param snp_feat
562 /// SNP feature object
563 /// @param subtype
564 /// subtype of overlapping feature
565 /// @param scope
566 /// @param search_both_strands
567 /// search is performed on both strands, starting with the one specified
568 /// by the feature's location.
569 /// @return
570 /// the overlapping feature or NULL if not found
573  CSeqFeatData::ESubtype subtype,
574  CScope& scope,
575  bool search_both_strands = true);
576 
577 
578 /// Convenience functions for popular overlapping types
582  eTransSplicing_Auto ///< Ignore overlap strand if the source location
583  ///< has mixed/both strand.
584 };
587  const CSeq_loc& loc, CScope& scope,
588  ETransSplicing eTransSplicing = eTransSplicing_Auto);
589 
590 
591 /// Finds gene for feature, but obeys SeqFeatXref directives
594 
595 /// Determines whether the given feature is pseudo, consulting the gene
596 /// associated with the feature if necessary.
597 /// Looks for the pseudo flag set on the feature and for a pseudogene
598 /// qualifier on the feature, then performs the same checks for the gene
599 /// associated with the feature.
600 /// @param feat Seq-feat to check
601 /// @param scope CScope to use when looking for the associated gene
602 /// @return true if any of the "pseudo" markers are found
604 bool IsPseudo(const CSeq_feat& feat, CScope& scope);
605 
606 
609 
610 
613 
614 
617 
618 
621 
622 
625 
626 
627 /// Get the encoding CDS feature of a given protein sequence.
/// NOTE(review): a raw pointer is returned; presumably null when no CDS is
/// found -- confirm against the implementation.
629 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope);
/// Overload taking the product as a bioseq handle, without a separate scope.
631 const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& product);
634 
635 
636 /// Get the mature peptide feature of a protein.
/// NOTE(review): a raw pointer is returned; presumably null when no such
/// feature is found -- confirm against the implementation.
638 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope);
/// Overload taking the product as a bioseq handle, without a separate scope.
640 const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& product);
641 
642 
643 /// Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
/// NOTE(review): a raw pointer is returned; presumably null when no mRNA
/// feature is found -- confirm against the implementation.
645 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope);
/// Overload taking the product as a bioseq handle, without a separate scope.
647 const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& product);
650 
651 /* @} */
652 
653 
654 /** @name Sequences
655  * Searching for bioseqs etc.
656  * @{
657  */
658 
659 /// Get the encoding nucleotide sequence of a protein.
/// @param product Protein bioseq whose nucleotide parent is wanted.
/// @param scope   Scope pointer supplied by the caller.
661 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope);
664 
665 /// Get the parent bioseq for a part of a segmented bioseq
668 
669 /// Return the org-ref associated with a given sequence. This will throw
670 /// a CException if there is no org-ref associated with the sequence.
672 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle);
673 /// Return a pointer to the org-ref associated with a given sequence,
674 /// or null if there is no org-ref associated with the sequence.
676 const COrg_ref* GetOrg_refOrNull(const CBioseq_Handle& handle);
677 
678 /// Return the tax-id associated with a given sequence. This will return 0
679 /// if no tax-id can be found.
681 TTaxId GetTaxId(const CBioseq_Handle& handle);
682 
683 /// Retrieve the MolInfo object for a given bioseq. If the supplied
684 /// sequence does not have a MolInfo associated with it, this will return NULL.
686 const CMolInfo* GetMolInfo(const CBioseq& bioseq);
687 
/// Overload of GetMolInfo taking a bioseq handle.
689 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle);
690 
691 /// Retrieve the BioSource object for a given bioseq. If the supplied
692 /// sequence does not have a BioSource associated with it, this will return NULL.
/// NOTE(review): the original comment said "MolInfo" here, apparently copied
/// from GetMolInfo -- confirm the NULL-on-absence behavior.
694 const CBioSource* GetBioSource(const CBioseq& bioseq);
695 
/// Overload of GetBioSource taking a bioseq handle.
697 const CBioSource* GetBioSource(const CBioseq_Handle& handle);
698 
699 
700 /// Retrieve the Bioseq Handle from a location.
701 /// location refers to a single bioseq:
702 /// return the bioseq
703 /// location refers to multiple bioseqs:
704 /// if parts of a segmented bioseq, returns the segmented bioseq.
705 /// otherwise, return the first bioseq that could be found
706 /// (first locally then, if flag is eGetBioseq_All, remote)
710 
711 
712 /// Return the protein name from the corresponding Prot-ref feature.
713 /// Throws an exception if the sequence is not a protein,
714 /// or if there is no unambiguously best Prot-ref feature,
715 /// or if the feature doesn't return a non-empty label.
717 string GetProteinName(const CBioseq_Handle& seq);
718 
721 
724 
/// NOTE(review): as declared, this returns void and takes an unnamed
/// `const COrg_ref*` by value, so neither the pointee nor the caller's pointer
/// can be changed through it -- no result can reach the caller via this
/// signature. Confirm the intended parameter type.
726 void GetOrg_refForProduct(const CBioseq_Handle& bsh, const COrg_ref*);
727 
728 /// Find an Org-ref for the given Bioseq:
729 /// If it's a protein, look on the source feature of the product.
730 /// Otherwise, find a source descriptor for the sequence.
731 /// Failing that, try to find a source feature for the sequence.
732 /// Return nullptr if nothing is found.
734 const COrg_ref* GetOrg_refForBioseq(const CBioseq_Handle& bsh);
735 
736 /// Find a BioSource for the given Bioseq:
737 /// If it's a protein then look for the source feature of the product.
738 /// Otherwise find a source descriptor for the sequence.
739 /// Otherwise, try to find a source feature for the sequence.
740 /// Return nullptr if we still turn up empty.
743 
744 /* @} */
745 
746 
749 {
750 public:
751  // Enumerated list of document management errors
752  enum EErrCode {
754  eRequestedIdNotFound
755  };
756 
757  // Translate the specific error code into a string representations of
758  // that error code.
759  virtual const char* GetErrCodeString(void) const override;
760 
762 };
763 
764 
765 END_SCOPE(sequence)
766 
767 
768 /// FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
769 
771 public:
/// Output options; values are combined with binary OR into a TFlags value.
enum EFlags : long {
    fAssembleParts      = 1 << 0,  ///< assemble FAR delta sequences; on by default
    fInstantiateGaps    = 1 << 1,  ///< honor specified gap mode; on by default
    fSuppressRange      = 1 << 2,  ///< never include location details in defline
    fReverseStrand      = 1 << 3,  ///< flip the (implicit) location
    fKeepGTSigns        = 1 << 4,  ///< don't convert '>' to '_' in title
    fMapMasksUp         = 1 << 5,  ///< honor masks specified at a lower level
    fMapMasksDown       = 1 << 6,  ///< honor masks specified at a higher level
    fNoExpensiveOps     = 1 << 7,  ///< don't try too hard to find titles
    fShowModifiers      = 1 << 8,  ///< show key-value pair modifiers (e.g. "[organism=Homo sapiens]")
    fNoDupCheck         = 1 << 9,  ///< skip check for duplicate sequence IDs
    fShowGapModifiers   = 1 << 10, ///< show gap key-value pair modifiers (e.g. "[linkage-evidence=map;strobe]"). Only works if gap mode is eGM_count.
    fKeepUnknGapNomLen  = 1 << 11, ///< Keep unknown gap's nominal length. That is, when a gap has an unknown length but nominal length, use that instead of just making it 100.
    fShowGapsOfSizeZero = 1 << 12, ///< Use this to show gaps of size zero as a lone hyphen at the end of a line.
    fEnableGI           = 1 << 13, ///< Use this flag to enable GI output in the defline
    fHideGenBankPrefix  = 1 << 14, ///< Hide gb| prefix for genbank only seq_id's
    fHTMLEncode         = 1 << 15, ///< Encode the title string for HTML display
    fIgnoreOriginalID   = 1 << 16, ///< Disregard original ID when constructing defline
    // historically misnamed as eFlagName
    eAssembleParts      = fAssembleParts,
    eInstantiateGaps    = fInstantiateGaps,
    // NOTE(review): the original comment on fUseAutoDef was a verbatim copy of
    // fIgnoreOriginalID's; presumably this flag requests AutoDef-based
    // deflines -- confirm and document.
    fUseAutoDef         = 1 << 17,
    // NOTE(review): two more base-class flags follow this marker, so a derived
    // class that starts numbering here (or at the next bits) collides with
    // them -- confirm how derived classes are expected to allocate bits.
    fBaseFirstUnused    = 1 << 18, ///< first available for derived classes
    fDoNotUseAutoDef    = 1 << 19, ///< NOTE(review): undocumented; presumably the opposite of fUseAutoDef -- confirm
    fShowGnlAndAcc      = 1 << 20, ///< Show general id and accession in the defline
};
typedef long TFlags; ///< binary OR of EFlags
799 
/// How to represent gaps with fInstantiateGaps enabled, as it is
/// by default. (Disabling fInstantiateGaps is equivalent to
/// requesting eGM_one_dash.)
enum EGapMode {
    eGM_one_dash, ///< A single dash, followed by a line break.
    eGM_dashes,   ///< Multiple inline dashes.
    eGM_letters,  ///< Multiple inline Ns or Xs as appropriate (default).
    eGM_count     ///< >?N or >?unk100, as appropriate.
};
809 
/// Virtual destructor; the class declares virtual members meant for overriding.
811  virtual ~CFastaOstream();
812 
813  /// Unspecified (null) locations designate complete sequences;
814  /// non-empty custom titles override the usual title determination logic.
/// Overload taking a seq-entry handle.
815  virtual void Write (const CSeq_entry_Handle& handle,
816  const CSeq_loc* location = 0);
/// Overload taking a single bioseq handle and an optional custom title.
817  virtual void Write (const CBioseq_Handle& handle,
818  const CSeq_loc* location = 0,
819  const string& custom_title = kEmptyStr);
/// Title-only variant with the same parameters as the bioseq-handle Write.
820  virtual void WriteTitle (const CBioseq_Handle& handle,
821  const CSeq_loc* location = 0,
822  const string& custom_title = kEmptyStr);
823  virtual void WriteSequence(const CBioseq_Handle& handle,
824  const CSeq_loc* location = 0,
826 
827  /// These versions may set up a temporary object manager scope.
828  /// In the common case of a raw bioseq, no scope is needed.
/// @a no_scope defaults to false in all three overloads
/// (NOTE(review): presumably true suppresses the temporary scope -- confirm).
829  void Write(const CSeq_entry& entry, const CSeq_loc* location = 0,
830  bool no_scope = false);
831  void Write(const CBioseq& seq, const CSeq_loc* location = 0,
832  bool no_scope = false, const string& custom_title = kEmptyStr);
833  void WriteTitle(const CBioseq& seq, const CSeq_loc* location = 0,
834  bool no_scope=false, const string& custom_title=kEmptyStr);
835 
836  /// Used only by Write(CSeq_entry[_Handle], ...); permissive by default:
/// this base implementation always returns false.
837  virtual bool SkipBioseq(const CBioseq& /* seq */) { return false; }
838  /// Delegates to the non-handle version by default for
839  /// compatibility with older code; newer code should override this
840  /// version. The default passes the handle's complete bioseq to the
/// CBioseq overload.
841  virtual bool SkipBioseq(const CBioseq_Handle& handle)
842  { return SkipBioseq(*handle.GetCompleteBioseq()); }
843 
/// Which residues to mask out in subsequent output.
/// These do NOT automatically reset between calls to Write;
/// you must do so yourself by setting them to null.
enum EMaskType {
    eSoftMask = 1, ///< write as lowercase rather than uppercase
    eHardMask = 2  ///< write as N for nucleotides, X for peptides
};
/// Get the mask location currently set for the given mask type.
851  CConstRef<CSeq_loc> GetMask(EMaskType type) const;
/// Set the mask location for the given mask type.
852  void SetMask(EMaskType type, CConstRef<CSeq_loc> location);
853 
854  /// Other parameters...
/// Current value of m_Width.
855  TSeqPos GetWidth (void) const { return m_Width; }
/// Set the width; defined out of line.
856  void SetWidth (TSeqPos width);
/// Current flag word (m_Flags).
857  TFlags GetAllFlags(void) const { return m_Flags; }
/// Replace the whole flag word.
858  void SetAllFlags(TFlags flags) { m_Flags = flags; }
/// Turn on one flag by OR-ing it into m_Flags.
859  void SetFlag (EFlags flag) { m_Flags |= flag; }
/// Turn off one flag by AND-ing m_Flags with its complement.
860  void ResetFlag (EFlags flag) { m_Flags &= ~flag; }
/// Set the gap representation mode.
861  void SetGapMode (EGapMode mode) { m_GapMode = mode; }
/// Current gap representation mode.
862  EGapMode GetGapMode(void) const { return m_GapMode; }
863 
864  /// This indicates the text of the modifiers of a gap.
866  /// String representing the gap type.
867  /// Examples: "short_arm", "telomere", etc.
868  string gap_type;
869  /// A vector representing the linkage-evidences of the gap.
870  /// Example linkage-evidences: "align genus", "within clone", etc.
871  vector<string> gap_linkage_evidences;
872 
873  // more fields may be added in the future.
874 
875  /// This will write the modifiers in
876  /// FASTA format. (example: "[gap-type=short_arm]")
877  void WriteAllModsAsFasta( CNcbiOstream & out ) const;
878  };
879 
880  /// Given a CSeq_gap object, this outputs the gap modifier information
881  /// into an SGapModText structure.
882  ///
883  /// @param seq_gap
884  /// This is the seq_gap information we're using to figure out
885  /// the gap mod text
886  /// @param out_gap_mod_text
887  /// This holds the result.
888  static void
889  GetGapModText(
890  const CSeq_gap & seq_gap,
891  SGapModText & out_gap_mod_text );
892 
893 protected:
895  unique_ptr<sequence::CDeflineGenerator> m_Gen;
896 
/// Protected virtual hooks that derived classes may override.
/// NOTE(review): their exact duties are not visible in this header; the
/// names suggest defline id/title output and raw sequence output -- confirm.
897  virtual void x_WriteSeqIds ( const CBioseq& bioseq,
898  const CSeq_loc* location);
899  virtual void x_WriteAsFasta ( const CBioseq& bioseq );
/// Reports results through the three reference parameters
/// gi_id, best_id and hide_prefix.
900  virtual void x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq);
901  //virtual void x_WriteModifiers ( const CBioseq_Handle & handle );
902  virtual void x_WriteSeqTitle( const CBioseq_Handle & handle,
903  const string& custom_title);
/// Write @a count bytes starting at @a buf to m_Out.
/// (The semicolon after the body is redundant but harmless.)
904  virtual void x_WriteBuffer( const char* buf, unsigned int count) { m_Out.write(buf, count); };
905 
907 
908 private:
914  // avoid recomputing for every sequence
916  TCharBuf m_Dashes, m_LC_Ns, m_LC_Xs, m_UC_Ns, m_UC_Xs;
917 
/// Returns the CDeflineGenerator user flags to use for titles
/// (NOTE(review): presumably derived from m_Flags -- confirm).
918  sequence::CDeflineGenerator::TUserFlags x_GetTitleFlags(void) const;
919 
920  //void x_PrintStringModIfNotDup(
921  // bool *seen, const CTempString & key, const CTempString & value );
922  //void x_PrintIntModIfNotDup(
923  // bool *seen, const CTempString & key, const int value );
924 
/// Maps @a mask through @a mapper and returns the resulting location.
/// NOTE(review): the roles of base_seq_id and scope are not visible here.
925  CConstRef<CSeq_loc> x_MapMask(CSeq_loc_Mapper& mapper, const CSeq_loc& mask,
926  const CSeq_id* base_seq_id, CScope* scope);
927 
/// Fills @a masking_states (an output parameter, taken by non-const
/// reference) for the given id, location and scope.
929  void x_GetMaskingStates(TMSMap& masking_states,
930  const CSeq_id* base_seq_id,
931  const CSeq_loc* location,
932  CScope* scope);
933 
/// Writes the residues of @a vec, consulting the read-only masking map.
934  void x_WriteSequence(const CSeqVector& vec,
935  const TMSMap& masking_state);
936 };
937 
938 
939 /// Public interface for coding region translation function
940 /// Uses CTrans_table in <objects/seqfeat/Genetic_code_table.hpp>
941 /// for rapid translation from a given genetic code, allowing all
942 /// of the iupac nucleotide ambiguity characters
943 
945 {
946 public:
947 
949  eThrowException = 0,
951  ePad
952  };
953 
954  /// Translate a coding region into an ncbieaa protein sequence.
/// @param prot Output string for the protein.
/// @param bsh, loc, cdr Sequence handle, location and Cdregion describing
///        the coding region.
/// @param include_stop, remove_trailing_X, alt_start
///        NOTE(review): presumably the same meaning as the identically named
///        CSeqTranslator::Translate parameters -- confirm.
/// @param options Handling of translation length problems; defaults to
///        eThrowException.
956  static void TranslateCdregion (string& prot,
957  const CBioseq_Handle& bsh,
958  const CSeq_loc& loc,
959  const CCdregion& cdr,
960  bool include_stop = true,
961  bool remove_trailing_X = false,
962  bool* alt_start = 0,
963  ETranslationLengthProblemOptions options = eThrowException);
964 
/// Overload taking the coding region as a CDS feature plus a scope; the
/// remaining parameters and defaults match the bioseq-handle overload.
966  static void TranslateCdregion(string& prot,
967  const CSeq_feat& cds,
968  CScope& scope,
969  bool include_stop = true,
970  bool remove_trailing_X = false,
971  bool* alt_start = 0,
972  ETranslationLengthProblemOptions options = eThrowException);
973 };
974 
975 
977 {
978 public:
979  /// @sa TTranslationFlags
981  fDefault = 0,
982  fNoStop = (1<<0), ///< = 0x1 Do not include stop in translation
983  fRemoveTrailingX = (1<<1), ///< = 0x2 Remove trailing Xs from protein
984  fIs5PrimePartial = (1<<2), ///< = 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
985  fIs3PrimePartial = (1<<3) ///< = 0x8 May not end in stop codon (because sequence is 3' partial)
986  };
987 
988  typedef int TTranslationFlags;
989 
990 
991 
992  /// Translate a string using a specified genetic code
993  /// @param seq
994  /// String containing IUPAC representation of sequence to be translated
/// @param prot
/// Output string (NOTE(review): presumably receives the protein
/// translation -- confirm)
995  /// @param code
996  /// Genetic code to use for translation (NULL to use default)
997  /// @param include_stop
998  /// If true, translate through stop codons and include trailing stop
999  /// (true by default)
1000  /// @param remove_trailing_X
1001  /// If true, remove trailing Xs from protein translation (false by
1002  /// default)
1003  /// @param alt_start
1004  /// Pointer to bool to indicate whether an alternative start codon was
1005  /// used
1006  /// @param is_5prime_complete
1007  /// If true, only translate first codon if start codon, otherwise
1008  /// translate as dash (-) to indicate problem with sequence
/// @param is_3prime_complete
/// NOTE(review): not documented in the original; presumably the 3'
/// counterpart of is_5prime_complete -- confirm
/// @deprecated Declared NCBI_DEPRECATED; an overload taking
/// TTranslationFlags is declared alongside.
1009 
1010  NCBI_DEPRECATED static void Translate(const string& seq,
1011  string& prot,
1012  const CGenetic_code* code,
1013  bool include_stop = true,
1014  bool remove_trailing_X = false,
1015  bool* alt_start = NULL,
1016  bool is_5prime_complete = true,
1017  bool is_3prime_complete = true);
1018 
1019  /// Translate a string using a specified genetic code
1020  /// @param seq
1021  /// String containing IUPAC representation of sequence to be translated
/// @param prot
/// Output string (NOTE(review): presumably receives the protein
/// translation -- confirm)
1022  /// @param flags
1023  /// Binary OR of "ETranslationFlags" (fDefault if omitted)
1024  /// @param code
1025  /// Genetic code to use for translation (NULL to use default)
1026  /// @param alt_start
1027  /// Pointer to bool to indicate whether an alternative start codon was
1028  /// used
1029  static void Translate(const string& seq,
1030  string& prot,
1031  TTranslationFlags flags = fDefault,
1032  const CGenetic_code* code = NULL,
1033  bool* alt_start = NULL);
1034 
1035  /// Translate a seq-vector using a specified genetic code
1036  /// if the code is NULL, then the default genetic code is used
1037  /// @param seq
1038  /// CSeqVector of sequence to be translated
1039  /// @param code
1040  /// Genetic code to use for translation (NULL to use default)
1041  /// @param include_stop
1042  /// If true, translate through stop codons and include trailing stop
1043  /// (true by default)
1044  /// @param remove_trailing_X
1045  /// If true, remove trailing Xs from protein translation (false by
1046  /// default)
1047  /// @param alt_start
1048  /// Pointer to bool to indicate whether an alternative start codon was
1049  /// used
1050  /// @param is_5prime_complete
1051  /// If true, only translate first codon if start codon, otherwise
1052  /// translate as dash (-) to indicate problem with sequence
1053  NCBI_DEPRECATED static void Translate(const CSeqVector& seq,
1054  string& prot,
1055  const CGenetic_code* code,
1056  bool include_stop = true,
1057  bool remove_trailing_X = false,
1058  bool* alt_start = NULL,
1059  bool is_5prime_complete = true,
1060  bool is_3prime_complete = true);
1061 
1062  /// Translate a seq-vector using a specified genetic code
1063  /// if the code is NULL, then the default genetic code is used
1064  /// @param seq
1065  /// CSeqVector of sequence to be translated
1066  /// @param code
1067  /// Genetic code to use for translation (NULL to use default)
1068  /// @param flags
1069  /// Binary OR of "ETranslationFlags"
1070  /// @param alt_start
1071  /// Pointer to bool to indicate whether an alternative start codon was
1072  /// used
1073  static void Translate(const CSeqVector& seq,
1074  string& prot,
1075  TTranslationFlags flags = fDefault,
1076  const CGenetic_code* code = NULL,
1077  bool* alt_start = NULL);
1078 
1079  /// utility function: translate a given location on a sequence
1081  static void Translate(const CSeq_loc& loc,
1082  const CBioseq_Handle& handle,
1083  string& prot,
1084  const CGenetic_code* code = NULL,
1085  bool include_stop = true,
1086  bool remove_trailing_X = false,
1087  bool* alt_start = 0);
1088 
1089  /// utility function: translate a given location on a sequence
1090  static void Translate(const CSeq_loc& loc,
1091  CScope& scope,
1092  string& prot,
1093  const CGenetic_code* code = NULL,
1094  bool include_stop = true,
1095  bool remove_trailing_X = false,
1096  bool* alt_start = 0);
1097 
1098  /// Translate a CDRegion into a protein
1099  static void Translate(const CSeq_feat& cds,
1100  CScope& scope,
1101  string& prot,
1102  bool include_stop = true,
1103  bool remove_trailing_X = false,
1104  bool* alt_start = 0);
1105 
1106  static CRef<CBioseq> TranslateToProtein(const CSeq_feat& cds,
1107  CScope& scope);
1108 
1109  static bool ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein);
1110 
1111  /// Find "best" frame for a coding region. "Best" frame has no
1112  /// internal stop codons.
1113  static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope);
1114  static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope, bool& ambiguous);
1115 
1116 };
1117 
1118 
1119 
1120 /// Location relative to a base Seq-loc: one (usually) or more ranges
1121 /// of offsets.
1122 /// XXX - handle fuzz?
1124 {
1125  enum EFlags {
1126  fNoMerge = 0x1 ///< don't merge adjacent intervals
1127  };
1128  typedef int TFlags; ///< binary OR of EFlags
1129 
1130  /// For relative ranges (ONLY), id is irrelevant and normally unset.
1132  typedef vector<CRef<TRange> > TRanges;
1133 
1134  /// Beware: treats locations corresponding to different sequences as
1135  /// disjoint, even if one is actually a segment of the other. :-/
1136  SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope = 0,
1137  TFlags flags = 0);
1138 
1139  /// For manual work. As noted above, ranges need not contain any IDs.
1140  SRelLoc(const CSeq_loc& parent, const TRanges& ranges)
1141  : m_ParentLoc(&parent), m_Ranges(ranges) { }
1142 
1143  CRef<CSeq_loc> Resolve(CScope* scope = 0, TFlags flags = 0) const
1144  { return Resolve(*m_ParentLoc, scope, flags); }
1145  CRef<CSeq_loc> Resolve(const CSeq_loc& new_parent, CScope* scope = 0,
1146  TFlags flags = 0) const;
1147 
1150 };
1151 
1152 
1153 
1154 ///============================================================================//
1155 /// Sequence Search //
1156 ///============================================================================//
1157 
1158 /// CSeqSearch
1159 /// ==========
1160 ///
1161 /// Search a nucleotide sequence for one or more patterns
1162 ///
1163 
1165 {
1166 public:
1167 
1168  /// Holds information associated with a pattern, such as the name of the
1169  /// restriction enzyme, location of cut site etc.
1171  {
1172  public:
1173  /// constructor
1174  CPatternInfo(const string& name,
1175  const string& sequence,
1176  Int2 cut_site) :
1177  m_Name(name), m_Sequence(sequence), m_CutSite(cut_site),
1178  m_Strand(eNa_strand_unknown)
1179  {}
1180 
1181  const string& GetName (void) const { return m_Name; }
1182  const string& GetSequence (void) const { return m_Sequence; }
1183  Int2 GetCutSite (void) const { return m_CutSite; }
1184  ENa_strand GetStrand (void) const { return m_Strand; }
1185 
1186  private:
1187  friend class CSeqSearch;
1188 
1189  // data
1190  string m_Name; ///< user defined name
1191  string m_Sequence; ///< nucleotide sequence
1194  };
1196 
1197  /// Client interface:
1198  /// ==================
1199  /// A class that uses the SeqSearch facility should implement the Client
1200  /// interface and register itself with the search utility to be notified
1201  /// of matches detection.
1202  class IClient
1203  {
1204  public:
1205  virtual ~IClient() { }
1206 
1208 
1209  virtual bool OnPatternFound(const TPatternInfo& pat_info, TSeqPos pos) = 0;
1210  };
1211 
1212 public:
1213 
1215  fNoFlags = 0,
1216  fJustTopStrand = 1,
1217  fExpandPattern = 2,
1218  fAllowMismatch = 4
1219  };
1220  typedef unsigned int TSearchFlags; ///< binary OR of ESearchFlag
1221 
1222  /// constructors
1223  /// @param client
1224  /// pointer to a client object (receives pattern match notifications)
1225  /// @param flags
1226  /// specify search flags
1227  CSeqSearch(IClient *client = 0, TSearchFlags flags = fNoFlags);
1228  /// destructor
1229  ~CSeqSearch(void);
1230 
1231  /// Add nucleotide pattern or restriction site to sequence search.
1232  /// Uses ambiguity codes, e.g., R = A and G, H = A, C and T
1233  void AddNucleotidePattern(const string& name, ///< pattern's name
1234  const string& sequence, ///< pattern's sequence
1235  Int2 cut_site,
1236  TSearchFlags flags = fNoFlags);
1237 
1238  /// Search the sequence for patterns
1239  /// @sa
1240  /// AddNucleotidePattern
1241  void Search(const CBioseq_Handle& seq);
1242 
1243  /// Low level search method.
1244  /// The user is responsible for feeding each character in turn,
1245  /// keeping track of the position in the text, and providing the length in
1246  /// case of a circular topology.
1247  int Search(int current_state, char ch, int position, int length = kMax_Int);
1248 
1249  /// Get / Set client.
1250  const IClient* GetClient() const { return m_Client; }
1251  void SetClient(IClient* client) { m_Client = client; }
1252 
1253 private:
1254 
1255  void x_AddNucleotidePattern(const string& name, string& pattern,
1256  Int2 cut_site, ENa_strand strand, TSearchFlags flags);
1257 
1258  void x_ExpandPattern(string& sequence, string& buffer, size_t pos,
1259  TPatternInfo& pat_info, TSearchFlags flags);
1260 
1261  void x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags);
1262  void x_StorePattern(TPatternInfo& pat_info, string& sequence);
1263 
1265  return ((m_Flags | flags) & fJustTopStrand) != 0;
1266  }
1268  return ((m_Flags | flags) & fExpandPattern) != 0;
1269  }
1271  return ((m_Flags | flags) & fAllowMismatch) != 0;
1272  }
1273 
1274  // data
1275  IClient* m_Client; // pointer to client object
1276  TSearchFlags m_Flags; // search flags
1277  size_t m_LongestPattern; // longest search pattern
1278  CTextFsm<TPatternInfo> m_Fsa; // finite state machine
1279 }; // end of CSeqSearch
1280 
1281 
1282 /// This trims ambiguous bases from the start and/or end of
1283 /// sequences, using customizable rules.
1285 {
1286 public:
1287 
1288  /// This enum is used to set what is meant by "ambiguous".
1290  /// Here, only N for nucleotides and X for amino acids is considered
1291  /// ambiguous.
1293  /// Here, anything that's not certain is considered
1294  /// ambiguous. That is, anything but A, C, G, T for nucleotides,
1295  /// and most amino acids except, for example, B (which can be
1296  /// aspartic acid or asparagine), X (completely ambiguous), etc.
1298  };
1299 
1300  enum EFlags {
1301  fFlags_DoNotTrimBeginning = (1 << 0), ///< 0x01 ("Beginning" as defined by CSeqVector)
1302  fFlags_DoNotTrimEnd = (1 << 1), ///< 0x02 ("End" as defined by CSeqVector)
1303 
1304  fFlags_DoNotTrimSeqGap = (1 << 2), ///< 0x04 (Seq-gaps are not considered trimmable if this flag is set, only letter gaps (e.g. N's for nucs))
1305 
1306  // we might support this in the future
1307  // fFlags_TrimAnnot = (1 << 3) ///< 0x08 (Trim annots based on trimmed bioseq location)
1308  };
1309  typedef int TFlags;
1310 
1311  /// For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig
1312  /// is 5, then on each iteration we check the 10 terminal bases and
1313  /// trim off those 10 if there are more than 5 ambiguous bases there.
1317  };
1318  /// Multiple STrimRules are allowed, which are applied from
1319  /// smallest bases_to_check to largest bases_to_check, and
1320  /// redundant rules are automatically removed. When a rule is applied,
1321  /// we start over at the first sorted rule again.
1322  typedef vector<STrimRule> TTrimRuleVec;
1323 
1324  /// This returns a reasonable default for trimming rules.
1325  static const TTrimRuleVec & GetDefaultTrimRules(void);
1326 
1327  /// This sets up the parameters for how this trimmer will act
1328  ///
1329  /// @param eMeaningOfAmbig
1330  /// This indicates exactly what ambiguous means (e.g. just "N" or
1331  /// do all ambiguous symbols count? )
1332  /// @param fFlags
1333  /// miscellaneous parameters to control this. See TFlags.
1334  /// @param vecTrimRules
1335  /// This indicates how trimming will occur. See TTrimRuleVec.
1336  /// @param uMinSeqLen
1337  /// Trimming tries to halt if the sequence becomes smaller than this size.
1338  /// It is possible for the resulting sequence to be below the
1339  /// uMinSeqLen size (or even trimmed to nothing), but the trimmer
1340  /// will at least <i>try</i> not to do that.
1342  EMeaningOfAmbig eMeaningOfAmbig,
1343  TFlags fFlags = 0,
1344  const TTrimRuleVec & vecTrimRules = GetDefaultTrimRules(),
1345  TSignedSeqPos uMinSeqLen = 50 );
1346 
1347  /// Do-nothing destructor just to allow inheritance.
1349 
1350  /// This indicates what happened with the trim.
1351  /// Error states are indicated by an exception, not EResult.
1352  enum EResult {
1353  /// Bioseq is now trimmed.
1355 
1356  /// Bioseq is left unchanged because it did not need to be trimmed
1357  /// at all. This is NOT an error.
1358  eResult_NoTrimNeeded
1359  };
1360 
1361  /// This trims the given bioseq, using params
1362  /// set in the CSequenceAmbigTrimmer constructor. It will properly
1363  /// handle the annots and descs inside the bioseq, too, if requested.
1364  ///
1365  /// @param bioseq_handle
1366  /// The bioseq to trim.
1367  /// @param trimmed_ranges
1368  /// The ranges trimmed by DoTrim will be added to this.
1369  /// @return
1370  /// This returns how the trimming went. On error, an exception
1371  /// is thrown and the bioseq may be in an undefined state.
1372  virtual EResult DoTrim( CBioseq_Handle &bioseq_handle,
1373  CRangeCollection<TSeqPos> *trimmed_ranges = nullptr);
1374 
1375 protected:
1376  /// This holds the current interpretation for "ambiguous". For example,
1377  /// it indicates whether just 'N' is ambiguous or if any non-ACGT
1378  /// letter is ambiguous. Works for amino acids, too (e.g. 'X' for
1379  /// completely unknown, etc.)
1381  /// This holds the flags that affect the behavior of this class.
1383  /// This holds the trimming rules that will be applied.
1384  /// It should be normalized by the constructor
1385  /// to eliminate dups and to sort it from least to most bases.
1387  /// When the bioseq gets trimmed down to less than this size,
1388  /// we halt the trimming.
1390 
1391  /// Test if a given flag is set (true if any bit of fFlag is also set in m_fFlags).
1392  bool x_TestFlag(TFlags fFlag) {
1393  return ( ( m_fFlags & fFlag ) != 0 );
1394  }
1395 
1396  /// This prepares the vector of trimming rules to be used
1397  /// by the trimming algorithm. For example, it eliminates duplicates
1398  /// and puts the rules in the correct order.
1399  ///
1400  /// @param vecTrimRules
1401  /// Input and output.
1402  virtual void x_NormalizeVecTrimRules( TTrimRuleVec & vecTrimRules );
1403 
1404  /// The bioseq is trimmed to size 0.
1405  ///
1406  /// @param bioseq_handle
1407  /// The bioseq to trim to nothing.
1408  /// @returns
1409  /// Works just like the DoTrim return value.
1410  virtual EResult x_TrimToNothing( CBioseq_Handle &bioseq_handle );
1411 
1412  // below this point, left/right means positions going numerically upward,
1413  // but start/end is relative to direction. That is,
1414  // a negative direction would imply end <= start.
1415 
1416  /// This returns the last good base that won't be trimmed
1417  /// (note: last really means "first" when we're starting from the end)
1418  ///
1419  /// @param seqvec
1420  /// This lets us explore the Bioseq to find out where to trim.
1421  /// @param iStartPosInclusive_arg
1422  /// This is where we start our trimming. Depending on
1423  /// direction, this could be &lt; or &gt; iEndPosInclusive_arg.
1424  /// @param iEndPosInclusive_arg
1425  /// This is where the trimming ends (inclusive). Analogous to
1426  /// iStartPosInclusive_arg.
1427  /// @param iTrimDirection
1428  /// 1 to trim from left to right, -1 to trim from right to left.
1429  /// @return
1430  /// The last good base (remember: last means "lower number" when we're
1431  /// checking from the end). If trimming would trim off the entire
1432  /// sequence, it returns a position past the end of the sequence.
1433  virtual TSignedSeqPos x_FindWhereToTrim(
1434  const CSeqVector & seqvec,
1435  const TSignedSeqPos iStartPosInclusive_arg,
1436  const TSignedSeqPos iEndPosInclusive_arg,
1437  TSignedSeqPos iTrimDirection );
1438 
1439  /// This adjusts in_out_uStartOfGoodBasesSoFar if we're at
1440  /// a CSeqMap gap. It does not notice ambiguous bases that
1441  /// are inside a normal sequence.
1442  ///
1443  /// @param seqvec
1444  /// This is used to access information about the sequence.
1445  /// @param in_out_uStartOfGoodBasesSoFar
1446  /// This is the start of where we check for a gap.
1447  /// It will be updated to be past the gap, if a gap is found.
1448  /// @note
1449  /// There is no in_out_uRightmostGoodBaseSoFar parameter in this signature; the inclusive search limit is uEndOfGoodBasesSoFar.
1450  /// @param uEndOfGoodBasesSoFar
1451  /// This limits how far this function may search (inclusive)
1452  /// when looking for
1453  /// the end of a gap segment.
1454  /// @param iTrimDirection
1455  /// 1 to trim from left to right, -1 to trim from right to left.
1456  /// @param uChunkSize
1457  /// The gap size that we chop off must be a multiple of uChunkSize.
1458  /// We will chop off less if we would go more than 1 past the
1459  /// uEndOfGoodBasesSoFar.
1460  /// A uChunkSize of 1 means no chunking for obvious math reasons.
1461  virtual void x_EdgeSeqMapGapAdjust(
1462  const CSeqVector & seqvec,
1463  TSignedSeqPos & in_out_uStartOfGoodBasesSoFar,
1464  const TSignedSeqPos uEndOfGoodBasesSoFar,
1465  const TSignedSeqPos iTrimDirection,
1466  const TSignedSeqPos uChunkSize );
1467 
1468  /// This holds the output of x_CountAmbigInRange
1470  SAmbigCount(const TSignedSeqPos iTrimDirection) :
1471  num_ambig_bases(0),
1472  pos_after_last_gap(
1473  (iTrimDirection > 0)
1476  { }
1477 
1478  /// the number of ambiguous bases found in the range
1479  /// supplied to x_CountAmbigInRange
1481  /// Inclusive. This is far past the end if the whole range
1482  /// is ambiguous.
1484  };
1485 
1486  /// This counts the number of ambiguous bases in the range
1487  /// from iStartPosInclusive_arg to iEndPosInclusive_arg. Note that
1488  /// both endpoints are inclusive.
1489  ///
1490  /// @param out_result
1491  /// This will store the result. Pass in a freshly constructed
1492  /// SAmbigCount.
1493  /// @param seqvec
1494  /// This is used to get the bases.
1495  /// @param iStartPosInclusive_arg
1496  /// This is where we start our count.
1497  /// @param iEndPosInclusive_arg
1498  /// This is where we end our count. Note that it can be &lt; or
1499  /// &gt; iStartPosInclusive_arg, depending on trim direction.
1500  /// @param iTrimDirection
1501  /// 1 to trim from left to right, -1 to trim from right to left.
1502  virtual void x_CountAmbigInRange(
1503  SAmbigCount & out_result,
1504  const CSeqVector & seqvec,
1505  const TSignedSeqPos iStartPosInclusive_arg,
1506  const TSignedSeqPos iEndPosInclusive_arg,
1507  const TSignedSeqPos iTrimDirection );
1508 
1509  /// This returns the (inclusive) position at the beginning of the
1510  /// segment.
1511  ///
1512  /// @param segment
1513  /// This is the segment we're trying to find the beginning of.
1514  /// @param iTrimDirection
1515  /// This is which direction in which we're trimming. The beginning
1516  /// will be in the opposite direction.
1517  /// @return
1518  /// This returns the (inclusive) position at the beginning of the given
1519  /// segment. As always,
1520  /// the definition of "beginning" depends on iTrimDirection.
1522  const CSeqMap_CI & segment,
1523  const TSignedSeqPos iTrimDirection )
1524  {
1525  // symmetrical
1526  return x_SegmentGetEndInclusive( segment, -iTrimDirection );
1527  }
1528 
1529  /// This returns the (inclusive) position at the end of the
1530  /// given segment.
1531  ///
1532  /// @param segment
1533  /// This is the segment we're trying to find the end of.
1534  /// @param iTrimDirection
1535  /// This is which direction in which we're trimming. The end
1536  /// of the segment will be found by looking in that direction.
1537  /// @return
1538  /// This returns the (inclusive) position at the end of the given
1539  /// segment. The definition of "end" depends on iTrimDirection.
1540  TSignedSeqPos x_SegmentGetEndInclusive(
1541  const CSeqMap_CI & segment,
1542  const TSignedSeqPos iTrimDirection );
1543 
1544  /// Returns the "next" segment. The definition of "next"
1545  /// depends on iTrimDirection
1546  ///
1547  /// @param in_out_segment_it
1548  /// Caller gives the current CSeqMap_CI, which will be
1549  /// returned adjusted in the trim direction.
1550  /// @param iTrimDirection
1551  /// The direction in which to increment. 1 means normal incrementing
1552  /// and -1 really means decrementing.
1553  /// @return
1554  /// Reference to in_out_segment_it.
1555  CSeqMap_CI & x_SeqMapIterDoNext(
1556  CSeqMap_CI & in_out_segment_it,
1557  const TSignedSeqPos iTrimDirection );
1558 
1559  void x_SliceBioseq(
1560  TSignedSeqPos leftmost_good_base,
1561  TSignedSeqPos rightmost_good_base,
1562  CBioseq_Handle & bioseq_handle );
1563 
1564  // For each letter of the alphabet, returns whether or not it's
1565  // ambiguous. Index 0 is 'A', index 1 is 'B', etc.
1566  typedef bool TAmbigLookupTable[26];
1567  TAmbigLookupTable m_arrNucAmbigLookupTable;
1568  TAmbigLookupTable m_arrProtAmbigLookupTable;
1569 };
1570 
1571 /// This iterates over the runs of Ns of each sequence
1573 {
1574 public:
1575 
1576  /// The params that control the behavior of CBioseqGaps_CI
1578  /// Default ctor gives params which are usually reasonable.
1579  Params(void) : max_gap_len_to_ignore(10),
1580  max_num_gaps_per_seq( numeric_limits<TSeqPos>::max() ),
1581  max_num_seqs( numeric_limits<TSeqPos>::max() ),
1582  mol_filter(CSeq_inst::eMol_not_set),
1583  level_filter(CBioseq_CI::eLevel_All)
1584  {
1585  }
1586 
1587  /// We completely ignore any gaps we find that have this
1588  /// number of bases or fewer.
1590  /// We only return up to this many gaps for each sequence
1592  /// We only return gaps on up to this many sequences.
1594 
1595  /// CSeq_inst::eMol_na to only look at gaps on nucleotide
1596  /// sequences. CSeq_inst::eMol_aa to only look at gaps
1597  /// on amino acid sequences.
1598  /// CSeq_inst::eMol_not_set to avoid filtering.
1600  /// Works like the level filter in CBioseq_CI
1602  };
1603 
1604  /// This constructor initializes the iterator.
1605  ///
1606  /// @param entry_h
1607  /// This will iterate over all descendents of this entry.
1608  /// @param params
1609  /// Controls the behavior of the iterator. If not specified,
1610  /// a reasonable default will be used.
1612  const CSeq_entry_Handle & entry_h,
1613  const Params & params = Params() );
1614 
1615  /// Move the iterator forward to next gap
1616  /// (or the end, if there are no more to return)
1617  CBioseqGaps_CI & operator ++ (void) { x_Next(); return *this; }
1618 
1620 
1621  /// This indicates the state of the iterator right now.
1622  /// This structure is undefined if the iterator
1623  /// has reached the end, though the caller probably
1624  /// won't be able to access it anyway since x_GetCurrent
1625  /// will throw an exception in that case.
1627  /// Constructor initializes to state that it
1628  /// should be when the iterator first starts.
1629  SCurrentGapInfo(void) : num_seqs_seen_so_far(0),
1630  start_pos(0),
1631  length(0),
1632  num_gaps_seen_so_far_on_this_seq(0) { }
1633 
1634  /// The seq-id that this gap is on.
1636  /// This indicates how many sequences we've seen so far,
1637  /// including the one we're currently on.
1638  /// For example, 3 means we're on the 3rd sequence to
1639  /// contain a relevant gap.
1641 
1642  /// the 0-based position at which the current gap starts
1643  /// on the current sequence.
1645  /// the length of the current gap
1647  /// how many gaps we've seen so far on this sequence.
1648  /// For example, 2 would mean we're currently on
1649  /// the second relevant gap on this sequence.
1651  };
1652 
1653  /// Get information about the gap we're currently on.
1654  const SCurrentGapInfo & operator*(void) const {
1655  return x_GetCurrent();
1656  }
1657 
1658  /// Get information about the gap we're currently on.
1659  const SCurrentGapInfo * operator ->(void) const {
1660  return &x_GetCurrent();
1661  }
1662 
1663 protected:
1664  /// This points to the bioseq we're currently on.
1665  /// When this iterator becomes invalid, that means this
1666  /// CBioseqGaps_CI is invalid, too.
1668  /// This indicates information about the gap we're currently on.
1670  /// This holds the params the caller gave when this
1671  /// object was initially created.
1673 
1674  /// This gives info on the gap we're currently on.
1675  /// Throws if this iterator has finished.
1676  virtual const SCurrentGapInfo & x_GetCurrent(void) const;
1677 
1678  /// This moves this iterator to the next relevant gap.
1679  /// Throws if this iterator has finished.
1680  virtual void x_Next(void);
1681 
1682  /// This advances m_bioseq_CI although it
1683  /// has extra logic to terminate m_bioseq_CI
1684  /// if we've exceeded the number of bioseqs we can look for.
1685  virtual void x_NextBioseq(void);
1686 
1687  /// This indicates what happened when we tried to run
1688  /// x_FindNextGapOnBioseq.
1689  enum EFindNext {
1690  /// No more relevant gaps were found on this bioseq. The other output
1691  /// parameters will be in an undefined state.
1693  /// Another relevant gap was found, and the output parameters are
1694  /// filled in to represent information about it.
1695  eFindNext_Found
1696  };
1697 
1698  /// This finds the next gap on the bioseq, starting at given pos.
1699  ///
1700  /// @param bioseq_h
1701  /// the bioseq on which we're seeking the next relevant gap.
1702  /// @param pos_to_start_looking
1703  /// This is the position on bioseq_h to start looking for a
1704  /// relevant gap.
1705  /// @param out_pos_of_gap
1706  /// If a gap is found, this holds the 0-based position of the
1707  /// start of that gap. This is undefined if no gap was found.
1708  /// @param out_len_of_gap
1709  /// If a gap is found, this holds the length of the
1710  /// gap. This is undefined if no gap was found.
1711  /// @return
1712  /// This indicates whether or not a relevant gap was found.
1713  virtual EFindNext x_FindNextGapOnBioseq(
1714  const CBioseq_Handle & bioseq_h,
1715  const TSeqPos pos_to_start_looking,
1716  TSeqPos & out_pos_of_gap,
1717  TSeqPos & out_len_of_gap ) const;
1718 };
1719 
1720 /* @} */
1721 
1722 /// Reverse complement a Bioseq in place.
1723 /// If delta sequence, will also need to reverse order of segments
1725 
1726 
1729 
1730 #endif /* SEQUENCE__HPP */
User-defined methods of the data storage class.
ncbi::TMaskedQueryRegions mask
AutoPtr –.
Definition: ncbimisc.hpp:401
This iterates over the runs of Ns of each sequence.
Definition: sequence.hpp:1573
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
Public interface for coding region translation function Uses CTrans_table in <objects/seqfeat/Genetic...
Definition: sequence.hpp:945
CCdregion –.
Definition: Cdregion.hpp:66
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
CMappedFeat –.
Definition: mapped_feat.hpp:59
CObject –.
Definition: ncbiobj.hpp:180
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
CScope –.
Definition: scope.hpp:92
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
Holds information associated with a pattern, such as the name of the restriction enzyme,...
Definition: sequence.hpp:1171
Client interface: ================== A class that uses the SeqSearch facility should implement the Cl...
Definition: sequence.hpp:1203
============================================================================// Sequence Search // ===...
Definition: sequence.hpp:1165
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
This trims ambiguous bases from the start and/or end of sequences, using customizable rules.
Definition: sequence.hpp:1285
CTime –.
Definition: ncbitime.hpp:296
Definition: map.hpp:338
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static const char location[]
Definition: config.c:97
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define NULL
Definition: ncbistd.hpp:225
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
#define EXCEPTION_VIRTUAL_BASE
Do not use virtual base classes in exception declaration at all, because in this case derived class s...
Definition: ncbiexpt.hpp:1388
void Write(CObjectOStream &out, TConstObjectPtr object, const CTypeRef &type)
Definition: serial.cpp:55
EOpFlags
CSeq_loc operations.
Definition: Seq_loc.hpp:324
@ fMerge_AbuttingOnly
Definition: Seq_loc.hpp:327
EOverlapType
TSignedSeqPos pos_after_last_gap
Inclusive.
Definition: sequence.hpp:1483
CRef< CSeq_loc > ProductToSource(const CSeq_feat &feat, const CSeq_loc &prod_loc, TP2SFlags flags=0, CScope *scope=0)
Definition: sequence.cpp:841
EAccessionVersion
Definition: sequence.hpp:90
virtual void processSAnnotSelector(SAnnotSelector &sel)=0
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
const IClient * GetClient() const
Get / Set client.
Definition: sequence.hpp:1250
CTextFsm< TPatternInfo > m_Fsa
Definition: sequence.hpp:1278
CConstRef< CSeq_feat > GetOverlappingSource(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1593
virtual void setUpFeatureIterator(CBioseq_Handle &bioseq_handle, unique_ptr< CFeat_CI > &feat_ci, TSeqPos circular_length, CRange< TSeqPos > &range, const CSeq_loc &loc, SAnnotSelector &sel, CScope &scope, ENa_strand &strand)=0
ES2PFlags
Definition: sequence.hpp:302
CConstRef< CSeq_feat > GetOverlappingmRNA(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1572
NCBI_EXCEPTION_DEFAULT(CSeqIdFromHandleException, CException)
virtual bool SkipBioseq(const CBioseq_Handle &handle)
Delegates to the non-handle version by default for compatibility with older code; newer code should o...
Definition: sequence.hpp:841
EGetIdFlags
Retrieve a particular seq-id from a given bioseq handle.
Definition: sequence.hpp:98
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
Definition: sequence.cpp:1428
CBioseq_Handle GetBioseqFromSeqLoc(const CSeq_loc &loc, CScope &scope, CScope::EGetBioseqFlag flag=CScope::eGetBioseq_Loaded)
Retrieve the Bioseq Handle from a location.
Definition: sequence.cpp:308
TSeqPos length
the length of the current gap
Definition: sequence.hpp:1646
virtual void processLoc(CBioseq_Handle &bioseq_handle, CRef< CSeq_loc > &loc, TSeqPos circular_length)=0
EMeaningOfAmbig m_eMeaningOfAmbig
This holds the current interpretation for "ambiguous".
Definition: sequence.hpp:1380
CConstRef< CSeq_feat > GetOverlappingGene(const CSeq_loc &loc, CScope &scope, ETransSplicing eTransSplicing=eTransSplicing_Auto)
Definition: sequence.cpp:1366
Int2 m_CutSite
nucleotide sequence
Definition: sequence.hpp:1192
EGetTitleFlags
This function is here rather than in CBioseq because it may need to inspect other sequences.
Definition: sequence.hpp:281
SRelLoc(const CSeq_loc &parent, const TRanges &ranges)
For manual work. As noted above, ranges need not contain any IDs.
Definition: sequence.hpp:1140
TSignedSeqPos m_uMinSeqLen
When the bioseq gets trimmed down to less than this size, we halt the trimming.
Definition: sequence.hpp:1389
CConstRef< CSeq_loc > m_HardMask
Definition: sequence.hpp:910
vector< CRef< TRange > > TRanges
Definition: sequence.hpp:1132
string GetProteinName(const CBioseq_Handle &seq)
Return protein name from corresponding Prot-ref feature.
Definition: sequence.cpp:356
virtual bool SkipBioseq(const CBioseq &)
Used only by Write(CSeq_entry[_Handle], ...); permissive by default.
Definition: sequence.hpp:837
CConstRef< CSeq_feat > GetBestCdsForMrna(const CSeq_feat &mrna_feat, CScope &scope, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:1874
void GetOrg_refForProduct(const CBioseq_Handle &bsh, const COrg_ref *)
Definition: sequence.cpp:194
size_t num_seqs_seen_so_far
This indicates how many sequences we've seen so far, including the one we're currently on.
Definition: sequence.hpp:1640
CBioseq_CI m_bioseq_CI
This points to the bioseq we're currently on.
Definition: sequence.hpp:1667
TSignedSeqPos max_bases_allowed_to_be_ambig
Definition: sequence.hpp:1316
SAmbigCount(const TSignedSeqPos iTrimDirection)
Definition: sequence.hpp:1470
long TFlags
binary OR of EFlags
Definition: sequence.hpp:798
const string & GetSequence(void) const
Definition: sequence.hpp:1182
TSearchFlags m_Flags
Definition: sequence.hpp:1276
TGi GetGiForAccession(const string &acc, CScope &scope, EGetIdType flags=0)
Given an accession string retrieve the GI id.
Definition: sequence.cpp:638
const string & GetName(void) const
Definition: sequence.hpp:1181
map< TSeqPos, int > TMSMap
Definition: sequence.hpp:928
SCurrentGapInfo m_infoOnCurrentGap
This indicates information about the gap we're currently on.
Definition: sequence.hpp:1669
string GetAccessionForId(const objects::CSeq_id &id, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession string for a Seq-id.
Definition: sequence.cpp:708
TSeqPos max_num_seqs
We only return gaps on up to this many sequences.
Definition: sequence.hpp:1593
TFlags m_Flags
Definition: sequence.hpp:904
CConstRef< CSeq_feat > GetSourceFeatForProduct(const CBioseq_Handle &bsh)
Definition: sequence.cpp:133
vector< string > gap_linkage_evidences
A vector representing the linkage-evidences of the gap.
Definition: sequence.hpp:871
EFindNext
This indicates what happened when we tried to run x_FindNextGapOnBioseq.
Definition: sequence.hpp:1689
TFlags GetAllFlags(void) const
Definition: sequence.hpp:857
CSeqSearch::TPatternInfo TPatternInfo
Definition: sequence.hpp:1207
CConstRef< CSeq_loc > m_ParentLoc
Definition: sequence.hpp:1148
virtual bool OnPatternFound(const TPatternInfo &pat_info, TSeqPos pos)=0
CRef< CSeq_loc > Resolve(CScope *scope=0, TFlags flags=0) const
Definition: sequence.hpp:1143
CConstRef< CSeq_feat > GetOverlappingCDS(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1579
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
Definition: seqtitle.cpp:106
ETransSplicing
Convenience functions for popular overlapping types.
Definition: sequence.hpp:579
pair< Int8, CConstRef< CSeq_feat > > TFeatScore
Storage for features and scores.
Definition: sequence.hpp:352
CPatternInfo TPatternInfo
Definition: sequence.hpp:1195
CConstRef< CSeq_feat > GetGeneForFeature(const CSeq_feat &feat, CScope &scope)
Finds gene for feature, but obeys SeqFeatXref directives.
Definition: sequence.cpp:1529
TSeq_id_HandleSet m_PreviousWholeIds
Definition: sequence.hpp:913
unique_ptr< sequence::CDeflineGenerator > m_Gen
Definition: sequence.hpp:895
CConstRef< CSeq_feat > GetBestGeneForMrna(const CSeq_feat &mrna_feat, CScope &scope, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:2040
CBioseq_CI::EBioseqLevelFlag level_filter
Works like the level filter in CBioseq_CI.
Definition: sequence.hpp:1601
CConstRef< CSeq_feat > GetBestMrnaForCds(const CSeq_feat &cds_feat, CScope &scope, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:1609
CSeq_inst::EMol mol_filter
CSeq_inst::eMol_na to only look at gaps on nucleotide sequences.
Definition: sequence.hpp:1599
TFlags m_fFlags
This holds the flags that affect the behavior of this class.
Definition: sequence.hpp:1382
TRanges m_Ranges
Definition: sequence.hpp:1149
Int2 GetCutSite(void) const
Definition: sequence.hpp:1183
Params m_Params
This holds the params the caller gave when this object was initially created.
Definition: sequence.hpp:1672
string m_Sequence
nucleotide sequence
Definition: sequence.hpp:1191
int TS2PFlags
Definition: sequence.hpp:306
bool x_IsAllowMismatch(TSearchFlags flags) const
Definition: sequence.hpp:1270
CSeq_id_Handle GetId(const CBioseq_Handle &handle, EGetIdType type=eGetId_Default)
Return a selected ID type for a given bioseq handle.
Definition: sequence.cpp:621
TTaxId GetTaxIdForProduct(const CBioseq_Handle &bsh)
Definition: sequence.cpp:171
CConstRef< CSeq_feat > GetBestOverlapForSNP(const CSeq_feat &snp_feat, CSeqFeatData::E_Choice type, CScope &scope, bool search_both_strands=true)
Get the best overlapping feature for a SNP (variation) feature.
Definition: sequence.cpp:1345
EMeaningOfAmbig
This enum is used to set what is meant by "ambiguous".
Definition: sequence.hpp:1289
CConstRef< CSeq_loc > m_SoftMask
Definition: sequence.hpp:909
EMaskType
Which residues to mask out in subsequent output.
Definition: sequence.hpp:847
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
TGi GetGiForId(const objects::CSeq_id &id, CScope &scope, EGetIdType flags=0)
Given a Seq-id retrieve the corresponding GI.
Definition: sequence.cpp:668
TSeqPos max_num_gaps_per_seq
We only return up to this many gaps for each sequence.
Definition: sequence.hpp:1591
virtual void postProcessDiffAmount(Int8 &cur_diff, CRef< CSeq_loc > &cleaned_loc_this_iteration, CRef< CSeq_loc > &candidate_feat_loc, CScope &scope, SAnnotSelector &sel, TSeqPos circular_length)=0
CConstRef< CSeq_feat > GetOverlappingPub(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1586
virtual ~CGetOverlappingFeaturesPlugin()
Definition: sequence.hpp:359
void SetClient(IClient *client)
Definition: sequence.hpp:1251
CConstRef< CSeq_feat > GetOverlappingOperon(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1600
IClient * m_Client
Definition: sequence.hpp:1275
CBioseq_Handle GetParentForPart(const CBioseq_Handle &part)
Get the parent bioseq for a part of a segmented bioseq.
Definition: sequence.cpp:2688
const SCurrentGapInfo & operator*(void) const
Get information about the gap we're currently on.
Definition: sequence.hpp:1654
CConstRef< CSeq_id > FindLatestSequence(const CSeq_id &id, CScope &scope)
Given a seq-id check its replace history and try to find the latest revision.
Definition: sequence.cpp:763
string GetAccessionForGi(TGi gi, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession for a given GI.
Definition: sequence.cpp:686
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
ENa_strand GetStrand(void) const
Definition: sequence.hpp:1184
TAmbigLookupTable m_arrNucAmbigLookupTable
Definition: sequence.hpp:1567
TSeqPos start_pos
the 0-based position at which the current gap starts on the current sequence.
Definition: sequence.hpp:1644
void GetMrnasForGene(const CSeq_feat &gene_feat, CScope &scope, list< CConstRef< CSeq_feat > > &mrna_feats, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:2195
CRef< CSeq_loc > SourceToProduct(const CSeq_feat &feat, const CSeq_loc &source_loc, TS2PFlags flags=0, CScope *scope=0, int *frame=0)
Definition: sequence.cpp:790
const CSeq_feat * GetPROTForProduct(const CBioseq &product, CScope *scope)
Get the mature peptide feature of a protein.
Definition: sequence.cpp:2593
EResult
This indicates what happened with the trim.
Definition: sequence.hpp:1352
AutoPtr< char, ArrayDeleter< char > > TCharBuf
Definition: sequence.hpp:915
int TTranslationFlags
Definition: sequence.hpp:988
TSeqPos m_Width
Definition: sequence.hpp:911
CConstRef< CSeq_feat > GetBestGeneForCds(const CSeq_feat &cds_feat, CScope &scope, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:2128
const COrg_ref * GetOrg_refOrNull(const CBioseq_Handle &handle)
Return the pointer to org-ref associated with a given sequence or null if there is no org-ref associa...
Definition: sequence.cpp:245
vector< STrimRule > TTrimRuleVec
Multiple STrimRules are allowed, which are applied from smallest bases_to_check to largest bases_to_c...
Definition: sequence.hpp:1322
void SetFlag(EFlags flag)
Definition: sequence.hpp:859
EGapMode GetGapMode(void) const
Definition: sequence.hpp:862
virtual ~CSequenceAmbigTrimmer()
Do-nothing destructor just to allow inheritance.
Definition: sequence.hpp:1348
CPatternInfo(const string &name, const string &sequence, Int2 cut_site)
constructor
Definition: sequence.hpp:1174
TAmbigLookupTable m_arrProtAmbigLookupTable
Definition: sequence.hpp:1568
string gap_type
String representing the gap type.
Definition: sequence.hpp:868
EP2SFlags
Definition: sequence.hpp:313
virtual void processMainLoop(bool &shouldContinueToNextIteration, CRef< CSeq_loc > &cleaned_loc_this_iteration, CRef< CSeq_loc > &candidate_feat_loc, EOverlapType &overlap_type_this_iteration, bool &revert_locations_this_iteration, CBioseq_Handle &bioseq_handle, const CMappedFeat &feat, TSeqPos circular_length, SAnnotSelector::EOverlapType annot_overlap_type)=0
CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle &product)
Definition: sequence.cpp:2568
CConstRef< CSeq_feat > GetmRNAforCDS(const CSeq_feat &cds, CScope &scope)
GetmRNAforCDS: a function to find a CSeq_feat representing the appropriate mRNA for a given CDS.
Definition: sequence.cpp:1261
CSeq_interval TRange
For relative ranges (ONLY), id is irrelevant and normally unset.
Definition: sequence.hpp:1131
int EGetIdType
Definition: sequence.hpp:126
const CBioSource * GetBioSourceForBioseq(const CBioseq_Handle &bsh)
Find a BioSource for the given Bioseq: If it's a protein then look for the source feature of the prod...
Definition: sequence.cpp:220
SCurrentGapInfo(void)
Constructor initializes to the state it should be in when the iterator first starts.
Definition: sequence.hpp:1629
vector< TFeatScore > TFeatScores
Definition: sequence.hpp:353
virtual void x_WriteBuffer(const char *buf, unsigned int count)
Definition: sequence.hpp:904
const CSeq_feat * GetmRNAForProduct(const CBioseq &product, CScope *scope)
Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
Definition: sequence.cpp:2617
void SetAllFlags(TFlags flags)
Definition: sequence.hpp:858
EGapMode
How to represent gaps with fInstantiateGaps enabled, as it is by default.
Definition: sequence.hpp:803
TTaxId GetTaxId(const CBioseq_Handle &handle)
Return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
bool x_IsExpandPattern(TSearchFlags flags) const
Definition: sequence.hpp:1267
int TP2SFlags
Definition: sequence.hpp:316
CNcbiOstream & m_Out
Definition: sequence.hpp:894
int TGetTitleFlags
Definition: sequence.hpp:287
CConstRef< CSeq_feat > GetBestOverlappingFeat(const CSeq_loc &loc, CSeqFeatData::E_Choice feat_type, EOverlapType overlap_type, CScope &scope, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
See the note above on 'overlap_type' meaning.
Definition: sequence.cpp:1208
void GetCdssForGene(const CSeq_feat &gene_feat, CScope &scope, list< CConstRef< CSeq_feat > > &cds_feats, TBestFeatOpts opts=fBestFeat_Defaults, CGetOverlappingFeaturesPlugin *plugin=NULL)
Definition: sequence.cpp:2318
void ReverseComplement(CSeq_inst &seq, CScope *scope)
Reverse complement a Bioseq in place.
Definition: sequence.cpp:5142
TSignedSeqPos x_SegmentGetBeginningInclusive(const CSeqMap_CI &segment, const TSignedSeqPos iTrimDirection)
This returns the (inclusive) position at the beginning of the segment.
Definition: sequence.hpp:1521
CMappedFeat GetMappedmRNAForProduct(const CBioseq_Handle &product)
Definition: sequence.cpp:2642
unsigned int TSearchFlags
binary OR of ESearchFlag
Definition: sequence.hpp:1220
Params(void)
Default ctor gives params which are usually reasonable.
Definition: sequence.hpp:1579
const CBioseq * GetNucleotideParent(const CBioseq &product, CScope *scope)
Get the encoding nucleotide sequence of a protein.
Definition: sequence.cpp:2660
size_t m_LongestPattern
Definition: sequence.hpp:1277
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
bool x_IsJustTopStrand(TSearchFlags flags) const
Definition: sequence.hpp:1264
int TBestFeatOpts
Definition: sequence.hpp:348
CSeq_id_Handle seq_id
The seq-id that this gap is on.
Definition: sequence.hpp:1635
EGapMode m_GapMode
Definition: sequence.hpp:912
size_t num_gaps_seen_so_far_on_this_seq
how many gaps we've seen so far on this sequence.
Definition: sequence.hpp:1650
void ResetFlag(EFlags flag)
Definition: sequence.hpp:860
TSeqPos GetWidth(void) const
Other parameters...
Definition: sequence.hpp:855
void GetOverlappingFeatures(const CSeq_loc &loc, CSeqFeatData::E_Choice feat_type, CSeqFeatData::ESubtype feat_subtype, EOverlapType overlap_type, TFeatScores &feats, CScope &scope, const TBestFeatOpts opts=0, CGetOverlappingFeaturesPlugin *plugin=NULL)
Find all features overlapping the location.
Definition: sequence.cpp:945
const COrg_ref * GetOrg_refForBioseq(const CBioseq_Handle &bsh)
Find an Org-ref for the given Bioseq: If it's a protein then look on the source feature of the produc...
Definition: sequence.cpp:211
EBestFeatOpts
Definition: sequence.hpp:330
int TFlags
binary OR of EFlags
Definition: sequence.hpp:1128
TSeqPos max_gap_len_to_ignore
We completely ignore any gaps we find that have this number of bases or fewer.
Definition: sequence.hpp:1589
TSignedSeqPos num_ambig_bases
the number of ambiguous bases found in the range supplied to x_CountAmbigInRange
Definition: sequence.hpp:1480
TTrimRuleVec m_vecTrimRules
This holds the trimming rules that will be applied.
Definition: sequence.hpp:1386
DECLARE_OPERATOR_BOOL(m_bioseq_CI)
TCharBuf m_UC_Xs
Definition: sequence.hpp:916
void SetGapMode(EGapMode mode)
Definition: sequence.hpp:861
bool x_TestFlag(TFlags fFlag)
Test if a given flag is set.
Definition: sequence.hpp:1392
@ eWithAccessionVersion
accession.version (when possible)
Definition: sequence.hpp:91
@ eWithoutAccessionVersion
accession only, even if version is available
Definition: sequence.hpp:92
@ fS2P_NoMerge
don't merge adjacent intervals on the product
Definition: sequence.hpp:303
@ fS2P_AllowTer
map the termination codon as a legal location
Definition: sequence.hpp:304
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
@ eGetId_Default
Definition: sequence.hpp:124
@ eGetId_Seq_id_FastaNARank
use CSeq_id::FastaNARank() as the scoring function
Definition: sequence.hpp:110
@ eGetId_Seq_id_WorstRank
use CSeq_id::WorstRank() as the scoring function
Definition: sequence.hpp:108
@ eGetId_ThrowOnError
Throw exception on errors. If not set, an empty value is returned.
Definition: sequence.hpp:122
@ eGetId_ForceAcc
return only an accession based seq-id
Definition: sequence.hpp:100
@ eGetId_ForceGi
return only a gi-based seq-id
Definition: sequence.hpp:99
@ eGetId_Seq_id_Score
use CSeq_id::Score() as the scoring function
Definition: sequence.hpp:106
@ eGetId_Seq_id_FastaAARank
use CSeq_id::FastaAARank() as the scoring function
Definition: sequence.hpp:109
@ eGetId_Canonical
Definition: sequence.hpp:114
@ eGetId_Seq_id_BestRank
use CSeq_id::BestRank() as the scoring function
Definition: sequence.hpp:107
@ eGetId_VerifyId
Check if the seq-id is present in the scope.
Definition: sequence.hpp:119
@ eGetId_HandleDefault
returns the ID associated with a bioseq-handle
Definition: sequence.hpp:104
@ eGetId_TypeMask
Mask for requested id type.
Definition: sequence.hpp:116
@ fGetTitle_Organism
append [organism]
Definition: sequence.hpp:283
@ fGetTitle_Reconstruct
ignore existing title Seqdesc.
Definition: sequence.hpp:282
@ fGetTitle_NoExpensive
skip potential expensive operations
Definition: sequence.hpp:285
@ fGetTitle_AllProteins
normally just names the first
Definition: sequence.hpp:284
@ eFindNext_NotFound
No more relevant gaps were found on this bioseq.
Definition: sequence.hpp:1692
@ eTransSplicing_No
Definition: sequence.hpp:580
@ eTransSplicing_Yes
Definition: sequence.hpp:581
@ eTransSplicing_Auto
Ignore overlap strand if the source location has mixed/both strand.
Definition: sequence.hpp:582
@ eMeaningOfAmbig_AnyAmbig
Here, anything that's not certain is considered ambiguous.
Definition: sequence.hpp:1297
@ eMeaningOfAmbig_OnlyCompletelyUnknown
Here, only N for nucleotides and X for amino acids are considered ambiguous.
Definition: sequence.hpp:1292
@ eResult_SuccessfullyTrimmed
Bioseq is now trimmed.
Definition: sequence.hpp:1354
@ fP2S_Extend
if hitting ends, extend to include partial codons
Definition: sequence.hpp:314
@ eGM_letters
Multiple inline Ns or Xs as appropriate (default).
Definition: sequence.hpp:806
@ eGM_dashes
Multiple inline dashes.
Definition: sequence.hpp:805
@ eGM_one_dash
A single dash, followed by a line break.
Definition: sequence.hpp:804
@ fBestFeat_NoExpensive
don't perform any expensive tests, such as ones that require fetching additional sequences
Definition: sequence.hpp:336
@ fBestFeat_FavorLonger
favor longer features over shorter features
Definition: sequence.hpp:339
@ fBestFeat_Defaults
default options: do everything
Definition: sequence.hpp:346
@ fBestFeat_StrictMatch
requires explicit association, rather than analysis based on overlaps
Definition: sequence.hpp:332
@ fBestFeat_IgnoreStrand
Pay no attention to strands when finding the best feat.
Definition: sequence.hpp:343
EGetBioseqFlag
Definition: scope.hpp:125
@ eGetBioseq_Loaded
Search in all loaded TSEs in the scope.
Definition: scope.hpp:127
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
EBioseqLevelFlag
Class of bioseqs to iterate.
Definition: bioseq_ci.hpp:72
EOverlapType
Flag to indicate location overlapping method.
#define NCBI_DEPRECATED
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
#define numeric_limits
Pre-declaration of the "numeric_limits<>" template. Forcibly overrides (using preprocessor) the origin...
Definition: ncbi_limits.hpp:92
#define kMax_Int
Definition: ncbi_limits.h:184
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_XOBJUTIL_EXPORT
Definition: ncbi_export.h:1339
E_Choice
Choice variants.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
char * buf
range(_Ty, _Ty) -> range< _Ty >
mdb_mode_t mode
Definition: lmdb++.h:38
T max(T x_, T y_)
T min(T x_, T y_)
static bool Translate(CSeq_feat &feat, string &prot)
Definition: nucprot.cpp:1393
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
@ Resolve
Try to resolve provided seq-ids.
static CNamedPipeClient * client
String search utilities.
The params that control the behavior of CBioseqGaps_CI.
Definition: sequence.hpp:1577
This indicates the state of the iterator right now.
Definition: sequence.hpp:1626
This indicates the text of the modifiers of a gap.
Definition: sequence.hpp:865
This holds the output of x_CountAmbigInRange.
Definition: sequence.hpp:1469
For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig is 5, then on each iteration w...
Definition: sequence.hpp:1314
SAnnotSelector –.
Location relative to a base Seq-loc: one (usually) or more ranges of offsets.
Definition: sequence.hpp:1124
Definition: inftrees.h:24
Definition: type.c:6
Modified on Fri Sep 20 14:58:28 2024 by modify_doxy.py rev. 669887