NCBI C++ ToolKit
seq_align_mapper_base.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef SEQ_ALIGN_MAPPER_BASE__HPP
2 #define SEQ_ALIGN_MAPPER_BASE__HPP
3 
4 /* $Id: seq_align_mapper_base.hpp 99064 2023-02-08 19:14:27Z ucko $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Aleksey Grichenko
30 *
31 * File Description:
32 * Alignment mapper base
33 *
34 */
35 
43 
46 
47 class CDense_seg;
48 class CPacked_seg;
49 class CSeq_align_set;
50 class CSpliced_seg;
51 class CSparse_seg;
52 class CMappingRange;
54 
55 /// Structure to hold information about a single alignment segment.
56 /// Used internally by CSeq_align_Mapper_Base.
58 {
59  /// Single row of a single alignment segment.
61  {
62  SAlignment_Row(void);
63 
64  /// Mark the row as mapped. Some rows or their parts are just
65  /// copied without real mapping. Setting this flag indicates
66  /// that the segment/row matched some mapping and was converted.
67  void SetMapped(void);
68 
69  /// Get segment start or -1 if it's a gap. The wrapper is required
70  /// mostly to convert kInvalidSeqPos to -1 (used in alignments).
71  int GetSegStart(void) const;
72 
73  /// Check if row 'r' has the same strand orientation as this row.
74  bool SameStrand(const SAlignment_Row& r) const;
75 
76  CSeq_id_Handle m_Id; ///< Row's seq-id
77  TSeqPos m_Start; ///< kInvalidSeqPos means gap
78  bool m_IsSetStrand; ///< Is strand set for the row?
79  ENa_strand m_Strand; ///< Strand value
80  bool m_Mapped; ///< Flag indicating mapped rows
81  };
82  typedef vector<SAlignment_Row> TRows;
83 
84  /// Create a new segment with the given length and number of rows.
85  SAlignment_Segment(int len, size_t dim);
86 
87  /// Get row data with the given index.
88  SAlignment_Row& GetRow(size_t idx);
89  /// Create a copy of the given row and store it in this segment as
90  /// row number 'idx'. The source row may originate from a different
91  /// segment. Used to split segments when a row is truncated by mapping.
92  /// NOTE: the rows vector must already have entry [idx].
93  SAlignment_Row& CopyRow(size_t idx, const SAlignment_Row& src_row);
94  /// Add new row; the row's seq-id is passed as a CSeq_id.
95  SAlignment_Row& AddRow(size_t idx,
96  const CSeq_id& id,
97  int start,
98  bool is_set_strand,
99  ENa_strand strand);
100  /// Add new row; the row's seq-id is passed as a CSeq_id_Handle.
101  SAlignment_Row& AddRow(size_t idx,
102  const CSeq_id_Handle& id,
103  int start,
104  bool is_set_strand,
105  ENa_strand strand);
106 
107  typedef vector< CRef<CScore> > TScores;
109 
110  int m_Len; ///< Segment length
111  TRows m_Rows; ///< Segment rows
112  bool m_HaveStrands; ///< Do at least some rows have strand set?
113  TScores m_Scores; ///< Scores for this segment
114  int m_GroupIdx; ///< Group of segments (e.g. an exon)
115  /// Group of scores. Set when several segments share the same set of
116  /// scores. Currently used only for sparse-segs. -1 = unassigned.
118 
119  // Used only for spliced exon parts to indicate their type.
121 };
122 
123 
124 /// Class used to map seq-alignments. Parses, maps and generates alignments.
125 /// Does not contain mapping information and can be used only with an instance
126 /// of CSeq_loc_Mapper_Base class. The seq-loc mapper is also used to retrieve
127 /// information about types of sequences.
129 {
130 public:
133 
134  CSeq_align_Mapper_Base(const CSeq_align& align,
135  CSeq_loc_Mapper_Base& loc_mapper);
136  ~CSeq_align_Mapper_Base(void);
137 
138  /// Map the whole alignment through the linked seq-loc mapper.
139  void Convert(void);
140  /// Map a single row of the alignment through the linked seq-loc mapper.
141  void Convert(size_t row);
142 
143  /// Create mapped alignment.
144  CRef<CSeq_align> GetDstAlign(void) const;
145 
146  /// Some of the following methods use only the first segment to get
147  /// information about rows. They do not check if this information is
148  /// consistent through all segments, but it should be.
149 
150  /// Get maximal number of rows in the alignment's segments.
151  size_t GetDim(void) const { return m_Dim; }
152  /// Get seq-id for the given row. Throw exception if the row
153  /// does not exist. The function uses row id from the first segment.
154  /// Other segments may have different id for the same row.
155  const CSeq_id_Handle& GetRowId(size_t idx) const;
156 
157  typedef list<SAlignment_Segment> TSegments;
158 
159  /// Get parsed segments. There is no storage for the original set of
160  /// segments - it's modified during the mapping to produce mapped
161  /// alignment.
162  const TSegments& GetSegments() const;
163 
164 protected:
166 
167  // Get the linked seq-loc mapper
168  CSeq_loc_Mapper_Base& GetLocMapper(void) const { return m_LocMapper; }
169 
170  /// Get max number of rows of all segments
171  size_t GetMaxDim(void) const;
172 
173  // Create sub-mapper to map sub-alignment. Used to map nested alignments.
174  virtual CSeq_align_Mapper_Base*
175  CreateSubAlign(const CSeq_align& align);
176  // Create sub-mapper to map a single spliced-seg exon. Each exon is mapped
177  // by a separate sub-mapper.
178  virtual CSeq_align_Mapper_Base*
179  CreateSubAlign(const CSpliced_seg& spliced,
180  const CSpliced_exon& exon);
181  // Initialize the mapper with the exon.
182  void InitExon(const CSpliced_seg& spliced,
183  const CSpliced_exon& exon);
184 
185  // Initialize the mapper with the seq-align.
186  void x_Init(const CSeq_align& align);
187  // Add new segment before the specified position.
188  // Required to split segments which can not be mapped as a whole.
189  SAlignment_Segment& x_InsertSeg(TSegments::iterator& where,
190  int len,
191  size_t dim,
192  bool reverse);
193  // Reset scores for the given segment and/or for the whole alignment.
194  // This always resets global scores. Segment scores are reset only if
195  // the segment is not NULL.
196  // Resetting scores is done when a segment needs to be truncated (split)
197  // because this operation makes them invalid.
198  void x_InvalidateScores(SAlignment_Segment* seg = NULL);
199 
200 private:
201 
202  // Add new alignment segment. Sorting depends on the strand.
203  SAlignment_Segment& x_PushSeg(int len, size_t dim,
204  ENa_strand strand = eNa_strand_unknown);
205 
206  // Initialization methods for different alignment types.
207  void x_Init(const TDendiag& diags);
208  void x_Init(const CDense_seg& denseg);
209  void x_Init(const TStd& sseg);
210  void x_Init(const CPacked_seg& pseg);
211  void x_Init(const CSeq_align_set& align_set);
212  void x_Init(const CSpliced_seg& spliced);
213  void x_Init(const CSparse_seg& sparse);
214 
215  // Mapping through CSeq_loc_Mapper_Base
216 
217  // Map the whole alignment. If row is set, map only this row.
218  // Otherwise iterate all rows and try to map each of them.
219  void x_ConvertAlign(size_t* row);
220  // Map a single alignment row. Iterates all segments of the given row.
221  void x_ConvertRow(size_t row);
222  // Map a single segment of the given row. The iterator is advanced
223  // to the next segment to be mapped. Additional segments may be
224  // inserted before the new iterator position if the mapping is partial
225  // and the original segment is split.
226  CSeq_id_Handle x_ConvertSegment(TSegments::iterator& seg_it,
227  size_t row);
228 
229  // Scan all rows for ranges with strands, store the result.
230  // If the strand info can not be found, plus strand is used.
231  // The collected strands are used in gaps (in the alignments where
232  // strand can not be left unset). The method does not check consistency
233  // of strands in the whole row - it's not required in this case.
234  typedef vector<ENa_strand> TStrands;
235  void x_FillKnownStrands(TStrands& strands) const;
236 
237  // Create mapped alignment.
238  void x_GetDstDendiag(CRef<CSeq_align>& dst) const;
239  void x_GetDstDenseg(CRef<CSeq_align>& dst) const;
240  void x_GetDstStd(CRef<CSeq_align>& dst) const;
241  void x_GetDstPacked(CRef<CSeq_align>& dst) const;
242  void x_GetDstDisc(CRef<CSeq_align>& dst) const;
243  void x_GetDstSpliced(CRef<CSeq_align>& dst) const;
244  void x_GetDstSparse(CRef<CSeq_align>& dst) const;
245 
246  // Create mapped exon and add it to the spliced-seg.
247  // 'seg' is the segment to start with (the original exon could be split).
248  // 'gen_id' and 'prod_id' are used to return exon level seq-ids.
249  // 'gen_strand' and 'prod_strand' are used to return exon level strands.
250  // 'partial' indicates if the original exon was truncated.
    // NOTE(review): the signature below has no parameter named 'partial';
    // this presumably refers to 'last_exon_partial' - confirm.
251  // 'last_gen_id' and 'last_prod_id' provide the ids found in previous
252  // exons (if any).
253  // Return true if an exon was added to the spliced-seg, false otherwise.
254  bool x_GetDstExon(CSpliced_seg& spliced,
255  TSegments::const_iterator& seg,
256  CSeq_id_Handle& gen_id,
257  CSeq_id_Handle& prod_id,
258  ENa_strand& gen_strand,
259  ENa_strand& prod_strand,
260  bool& last_exon_partial,
261  const CSeq_id_Handle& last_gen_id,
262  const CSeq_id_Handle& last_prod_id) const;
263  // Adds new part to the exon. If last part had the same type, it is
264  // merged with the new one.
265  void x_PushExonPart(CRef<CSpliced_exon_chunk>& last_part,
267  int part_len,
268  CSpliced_exon& exon) const;
269 
270  // Some mapping results can not be represented by the original alignment
271  // type (e.g. when a row contains multiple ids). In this case the result
272  // is converted to disc.
273  void x_ConvToDstDisc(CRef<CSeq_align>& dst) const;
274  // Get the next part of the disc align - see x_ConvToDstDisc.
275  ssize_t x_GetPartialDenseg(CRef<CSeq_align>& dst,
276  size_t start_seg) const;
277 
278  // Collect exons from a single sub-alignment.
279  void x_GetDstSplicedSubAlign(CSpliced_seg& spliced,
280  const CSeq_align_Mapper_Base& sub_align,
281  bool& last_exon_partial,
282  CSeq_id_Handle& gen_id,
283  CSeq_id_Handle& last_gen_id,
284  bool& single_gen_id,
285  ENa_strand& gen_strand,
286  bool& single_gen_str,
287  CSeq_id_Handle& prod_id,
288  CSeq_id_Handle& last_prod_id,
289  bool& single_prod_id,
290  ENa_strand& prod_strand,
291  bool& single_prod_str,
292  bool& partial) const;
293 
294  // Check if both nucs and prots are present in the segments.
295  bool x_HaveMixedSeqTypes(void) const;
296  // Check if each row contains only one strand.
    // NOTE(review): the name suggests 'true' means strands ARE mixed, which
    // would be the opposite of the wording above - confirm the polarity.
297  bool x_HaveMixedStrand(void) const;
298 
299  // Check if the mapped alignment has at least one segment with at least
300  // two non-gap rows.
    // NOTE(review): presumably returns 'true' when NO such segment exists
    // (i.e. the alignment is empty) - confirm against the implementation.
301  bool x_IsEmpty(void) const;
302 
304  // Original alignment
306  // Original exon when mapping a spliced seg through multiple mappers
308  // Flag indicating if the original alignment contains any strands
310  // Number of rows in the original alignment (sometimes hard to calculate).
311  size_t m_Dim;
312 
313  // Alignment scores
315  typedef vector<TScores> TScoresGroups;
316 
317  // Global seq-align scores.
319  // Seq-align.segs scores.
321  // Group scores (e.g. per-exon).
323  // Flag used to invalidate parent's scores if any of the children
324  // is invalidated.
326 
327 protected:
328  // Used for nested alignments - a set of child mappers, each mapping
329  // its own sub-alignment.
330  typedef vector< CRef<CSeq_align_Mapper_Base> > TSubAligns;
331 
332  // Flags to indicate possible destination alignment types:
333  // multi-dim or multi-id alignments can be packed into std-seg
334  // or dense-diag only.
335  enum EAlignFlags {
336  eAlign_Normal, // Normal alignment, may be packed into any type
337  eAlign_Empty, // Empty alignment
338  eAlign_MultiId, // A row contains different IDs
339  eAlign_MultiDim // Segments have different number of rows
340  };
341 
342  mutable CRef<CSeq_align> m_DstAlign; // Mapped alignment
343  TSubAligns m_SubAligns; // Sub-mappers
344  mutable TSegments m_Segs; // Parsed segments
345  EAlignFlags m_AlignFlags; // Special case flags
346 };
347 
348 
// Constructor body for an alignment row (the signature line is not visible in
// this listing). The row starts out as a gap (m_Start == kInvalidSeqPos), with
// the strand flagged as not set, the strand value eNa_strand_unknown, and the
// mapped flag cleared.
349 inline
351  : m_Start(kInvalidSeqPos),
352  m_IsSetStrand(false),
353  m_Strand(eNa_strand_unknown),
354  m_Mapped(false)
355 {
356  return;
357 }
358 
359 
// Sets the row's m_Mapped flag to true (the signature line is not visible in
// this listing; the body matches SAlignment_Row::SetMapped).
360 inline
362 {
363  m_Mapped = true;
364 }
365 
366 
// Returns the result of SameOrientation() applied to this row's m_Strand and
// the m_Strand of row 'r'. Only the strand values are compared; the
// m_IsSetStrand flags are not consulted here.
367 inline
369 SameStrand(const SAlignment_Row& r) const
370 {
371  return SameOrientation(m_Strand, r.m_Strand);
372 }
373 
374 
// Returns m_Start converted to int, or -1 when m_Start is kInvalidSeqPos
// (i.e. the row is a gap). The signature line is not visible in this listing;
// the body matches SAlignment_Row::GetSegStart.
375 inline
377 {
378  return m_Start != kInvalidSeqPos ? int(m_Start) : -1;
379 }
380 
381 
// Returns the m_Segs member (the parsed segments). The signature lines are
// not visible in this listing; the body matches
// CSeq_align_Mapper_Base::GetSegments.
382 inline
385 {
386  return m_Segs;
387 }
388 
389 
// Returns the m_Dim member. The line naming this function is not visible in
// this listing. NOTE(review): presumably this is
// CSeq_align_Mapper_Base::GetMaxDim(), since GetDim() already has an in-class
// body - confirm against the original header.
390 inline
391 size_t
393 {
394  return m_Dim;
395 }
396 
397 
400 
401 #endif // SEQ_ALIGN_MAPPER_BASE__HPP
bool SameOrientation(ENa_strand a, ENa_strand b)
Definition: Na_strand.hpp:83
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CMappingRange - describes a single interval to interval mapping.
CObject –.
Definition: ncbiobj.hpp:180
CPacked_seg –.
Definition: Packed_seg.hpp:66
Class used to map seq-alignments.
CSeq_loc_Mapper_Base & m_LocMapper
CSeq_loc_Mapper_Base & GetLocMapper(void) const
CSeq_align::C_Segs::TDendiag TDendiag
vector< CRef< CSeq_align_Mapper_Base > > TSubAligns
size_t GetDim(void) const
Some of the following methods use only the first segment to get information about rows.
size_t GetMaxDim(void) const
Get max number of rows of all segments.
SAlignment_Segment::TScores TScores
CConstRef< CSpliced_exon > m_OrigExon
vector< ENa_strand > TStrands
CConstRef< CSeq_align > m_OrigAlign
CSeq_align::C_Segs::TStd TStd
const TSegments & GetSegments() const
Get parsed segments.
list< SAlignment_Segment > TSegments
CSeq_loc_Mapper_Base –.
CSeq_align::C_Segs::TDendiag TDendiag
Definition: cuAlign.hpp:48
#define false
Definition: bool.h:36
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_SEQ_EXPORT
Definition: ncbi_export.h:825
list< CRef< CStd_seg > > TStd
Definition: Seq_align_.hpp:196
list< CRef< CDense_diag > > TDendiag
Definition: Seq_align_.hpp:194
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int len
int ssize_t
Definition: ncbiconf_msvc.h:93
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define row(bind, expected)
Definition: string_bind.c:73
Single row of a single alignment segment.
TSeqPos m_Start
kInvalidSeqPos means gap
int GetSegStart(void) const
Get segment start or -1 if it's a gap.
bool m_IsSetStrand
Is strand set for the row?
bool SameStrand(const SAlignment_Row &r) const
Check if the query row has the same strand orientation.
bool m_Mapped
Flag indicating mapped rows.
void SetMapped(void)
Mark the row as mapped.
Structure to hold information about a single alignment segment.
vector< SAlignment_Row > TRows
TRows m_Rows
Segment rows.
vector< CRef< CScore > > TScores
TScores m_Scores
Scores for this segment.
CSpliced_exon_chunk::E_Choice TPartType
ssize_t m_ScoresGroupIdx
Group of scores.
bool m_HaveStrands
Do at least some rows have strand set?
int m_Len
Segment length.
int m_GroupIdx
Group of segments (e.g. an exon).
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Modified on Thu Apr 25 08:20:22 2024 by modify_doxy.py rev. 669887