NCBI C++ ToolKit
cuSeqAnnotFromFasta.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuSeqAnnotFromFasta.hpp 40765 2009-01-15 19:18:36Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Chris Lanczycki
27 *
28 * File Description:
29 * Use data from a Fasta I/O object to construct a CSeq_annot
30 * intended for eventual installation in a CCdd-derived object.
31 *
32 * ===========================================================================
33 */
34 
35 #ifndef CU_SEQANNOT_FROM_FASTA__HPP
36 #define CU_SEQANNOT_FROM_FASTA__HPP
37 
38 #include <map>
39 #include <list>
40 #include <vector>
41 #include <corelib/ncbiexpt.hpp>
42 #include <corelib/ncbiapp.hpp>
43 
45 #include <objects/seq/Bioseq.hpp>
48 
51 BEGIN_SCOPE(cd_utils)
52 
54 {
55 
56 public:
57 
59  eFirstSequence = 0,
61  eMostAlignedAndFewestGaps, // looking in the IBM footprint
62  eUnassignedMaster = 99999999
63  };
64 
65  CSeqAnnotFromFasta(bool doIbm = true, bool preferStructureMaster = false, bool caseSensitive = false);
66  virtual ~CSeqAnnotFromFasta() {
67  m_seqAnnot.Reset();
68  }
69 
70  virtual bool IsSeqAnnotValid() const;
71 
72  // The 'masterIndex' parameter is ignored unless masterMethod == eSpecifiedSequence.
73  // Makes a Seq-annot that has been indexed to the de-gapped sequence data from the
74  // CFastaIOWrapper object, and caches those de-gapped sequences in m_sequences.
75  virtual bool MakeSeqAnnotFromFasta(CNcbiIstream& is, CFastaIOWrapper& fastaIO, MasteringMethod masterMethod, unsigned int masterIndex = (unsigned int) eUnassignedMaster);
76 
77  // Return empty string if index out of range of m_sequences.
78  string GetSequence(unsigned int index) const;
79 
80  const CRef<CSeq_annot>& GetSeqAnnot() const {return m_seqAnnot;}
81 
82  // Removes any non-alphanumeric characters from the sequence data in
83  // 'bioseq', adjusting the length as necessary. Will change the encoding
84  // of the sequence to Ncbieaa if any characters were purged.
85  // Returns 'true' if any characters were removed.
86  static bool PurgeNonAlphaFromSequence(CBioseq& bioseq);
87 
88  // Same as above but works on the cached strings in m_sequences.
89  void PurgeNonAlphaFromCachedSequences();
90 
91 
92  unsigned int GetMasterIndex() const {return m_masterIndex;}
93  //void SetMasterIndex(unsigned int masterIndex) { m_masterIndex = masterIndex; //requires remaster}
94 
95  // Break up the vector 'counts' into blocks which contain no entries less than the 'threshold'.
96  // Resulting blocks are defined by their start index in 'counts' and length of the block in terms
97  // of number of consecutive indices that satisfy the threshold criteria.
98  // (algorithm adapted from cd_utils::BlockIntersector::getIntersectedAlignment)
99  static unsigned int GetBlocksFromCounts(unsigned int threshold, const vector<unsigned int>& counts, const set<unsigned int>& forcedBreak, vector<unsigned int>& starts, vector<unsigned int>& lengths);
100 
101  static void CountNonAlphaToPositions(const vector<unsigned int>& positions, const string& sequence, map<unsigned int, unsigned int>& numNonAlpha);
102 
103  static bool isNotAlpha(char c);
104 
105 private:
106 
107  // m_doIBM = false: the seq_annot will reflect the pairwise alignments found in the fasta
108  // m_doIBM = true: the seq_annot will contain only columns aligned in every pairwise alignment
109  bool m_doIBM;
110 
111  // If true, use a structure as a master where possible (i.e., the preference may be
112  // overridden for various MasteringMethods).
114 
115  // A Fasta variant allows for lowercase letters in a sequence string to indicate that that
116  // residue is not intended to be aligned, even if it potentially could have been.
117  // m_caseSensitive = true: do not align lowercase residues in the seq_annot
118  // m_caseSensitive = false: lowercase residues will be aligned in the seq_annot if possible
120 
121  // The seq_annot's pairwise alignments will be indexed to a common sequence.
122  // m_masterIndex is the zero-based index of that sequence in the input fasta.
123  unsigned int m_masterIndex;
124 
126  vector<string> m_sequences; // cache for the sequences in m_seqAnnot.
127 
128  // Cache sequence strings found in the 'sequences' field of 'dummyCD'.
129  // If 'degapSequences' is true, remove any gap characters prior to adding the string
130  // to m_sequences. The output parameter 'longestSequenceIndex' receives the index of the
131  // longest sequence in m_sequences after their addition (i.e., longest after any degapping was done).
132  void CacheSequences(CCdCore& dummyCD, unsigned int& longestSequenceIndex, bool degapSequences);
133 
134  bool MakeIBMSeqAnnot(CCdCore& dummyCD);
135  bool MakeAsIsSeqAnnot(CCdCore& dummyCD);
136 
137  // Make a seq_align from the block starts & lengths, where the starts are based
138  // on the sequences passed in. The seq-align is made by reindexing to a *gapless*
139  // version of the master and slave sequences.
140  bool BuildMasterSlaveSeqAlign(const CRef<CSeq_id>& masterSeqid, const CRef<CSeq_id>& slaveSeqid, const string& masterSequence, const string& slaveSequence, const vector<unsigned int>& blockStarts, const vector<unsigned int>& blockLengths, CRef<CSeq_align>& pairwiseSA);
141 
142  // Sets m_masterIndex as per the mastering method chosen. Returns m_masterIndex.
143  unsigned int DetermineMasterIndex(CCdCore& dummyCD, MasteringMethod masterMethod);
144 
145 
146 };
147 
148 END_SCOPE(cd_utils)
150 
151 #endif // CU_SEQANNOT_FROM_FASTA__HPP
const CRef< CSeq_annot > & GetSeqAnnot() const
CRef< CSeq_annot > m_seqAnnot
vector< string > m_sequences
unsigned int GetMasterIndex() const
Definition: map.hpp:338
USING_SCOPE(objects)
SBlastSequence GetSequence(const objects::CSeq_loc &sl, EBlastEncoding encoding, objects::CScope *scope, objects::ENa_strand strand=objects::eNa_strand_plus, ESentinelType sentinel=eSentinels, std::string *warnings=NULL)
Retrieves a sequence using the object manager.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NCBI_CDUTILS_EXPORT
Definition: ncbi_export.h:376
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines NCBI C++ exception handling.
Modified on Sun Apr 21 03:38:49 2024 by modify_doxy.py rev. 669887