NCBI C++ ToolKit
aln_converters.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_ALNMGR___ALN_CONVERTERS__HPP
2 #define OBJTOOLS_ALNMGR___ALN_CONVERTERS__HPP
3 /* $Id: aln_converters.hpp 88953 2020-02-05 20:27:33Z vasilche $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Author: Kamen Todorov, NCBI
29 *
30 * File Description:
31 * Alignment converters
32 *
33 * ===========================================================================
34 */
35 
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
42 
47 
48 
50 
51 
52 typedef vector<TAlnSeqIdIRef> TAlnSeqIdVec;
53 
54 /// Build pairwise alignment from the selected rows of a seq-align.
55 /// @param pairwise_aln
56 /// Output pairwise alignment. Should be initialized with the correct ids
57 /// (the function does not check if the ids in the pairwise alignment
58 /// correspond to the ids of the selected rows).
59 /// @param sa
60 /// Input seq-align object.
61 /// @param row_1
62 /// First row index.
63 /// @param row_2
64 /// Second row index.
65 /// @param direction
66 /// Flag indicating if the output pairwise alignment should include
67 /// direct, reverse, or any segments.
68 /// NOTE: segment direction in pairwise alignments is relative
69 /// (second vs first row).
70 /// @param ids
71 /// Optional vector of alignment seq-ids used only to check if the source alignment
72 /// contains mixed sequence types. All ids from the vector are compared, not just
73 /// the two selected rows.
76  CPairwiseAln& pairwise_aln,
77  const objects::CSeq_align& sa,
81  const TAlnSeqIdVec* ids = 0);
82 
83 
84 /// Build pairwise alignment from the selected rows of a dense-seg.
85 /// @sa ConvertSeqAlignToPairwiseAln
88  CPairwiseAln& pairwise_aln,
89  const objects::CDense_seg& ds,
93  const TAlnSeqIdVec* ids = 0);
94 
95 
96 /// Build pairwise alignment from the selected rows of a packed-seg.
97 /// @sa ConvertSeqAlignToPairwiseAln
100  CPairwiseAln& pairwise_aln,
101  const objects::CPacked_seg& ps,
105  const TAlnSeqIdVec* ids = 0);
106 
107 
108 /// Build pairwise alignment from the selected rows of an std-seg.
109 /// @sa ConvertSeqAlignToPairwiseAln
112  CPairwiseAln& pairwise_aln,
113  const objects::CSeq_align::TSegs::TStd& stds,
118  const TAlnSeqIdVec* ids = 0);
119 
120 
121 /// Build pairwise alignment from the selected rows of a dendiag.
122 /// @sa ConvertSeqAlignToPairwiseAln
125  CPairwiseAln& pairwise_aln,
126  const objects::CSeq_align::TSegs::TDendiag& dendiags,
131  const TAlnSeqIdVec* ids = 0);
132 
133 
134 /// Build pairwise alignment from the selected rows of a sparse-seg.
135 /// @sa ConvertSeqAlignToPairwiseAln
138  CPairwiseAln& pairwise_aln,
139  const objects::CSparse_seg& sparse_seg,
143  const TAlnSeqIdVec* ids = 0);
144 
145 
146 /// Build pairwise alignment from the selected rows of a spliced-seg.
147 /// @sa ConvertSeqAlignToPairwiseAln
150  CPairwiseAln& pairwise_aln,
151  const objects::CSpliced_seg& spliced_seg,
155  const TAlnSeqIdVec* ids = 0);
156 
157 
158 /// Build pairwise alignment from a pair of seq-locs. Each seq-loc must
159 /// reference a single sequence.
160 /// @param aln
161 /// Output pairwise alignment. Should be initialized with the correct ids
162 /// (the function does not check if the ids in the pairwise alignment
163 /// correspond to the ids of the seq-locs).
164 /// @param loc_1
165 /// First seq-loc.
166 /// @param loc_2
167 /// Second seq-loc.
168 /// @param direction
169 /// Flag indicating if the output pairwise alignment should include
170 /// direct, reverse, or any segments.
171 /// NOTE: segment direction in pairwise alignments is relative
172 /// (second vs first row).
175  CPairwiseAln& aln,
176  const objects::CSeq_loc& loc_1,
177  const objects::CSeq_loc& loc_2,
179 
180 
181 typedef list< CRef<CPairwiseAln> > TPairwiseAlnList;
182 
183 /// Build a list of pairwise alignments from a seq-loc mapper's mappings.
/// @param mapper
/// Seq-loc mapper whose mappings are converted.
/// @param aligns
/// List that receives the resulting pairwise alignments (taken by
/// non-const reference).
185 void SeqLocMapperToPairwiseAligns(const objects::CSeq_loc_Mapper_Base& mapper,
186  TPairwiseAlnList& aligns);
187 
188 
189 /// Create an anchored alignment from Seq-align using hints.
190 /// Optionally, choose the anchor row explicitly (this overrides
191 /// options.GetAnchorId()).
192 /// NOTE: Potentially, this "shrinks" the alignment vertically in case some
193 /// row was not aligned to the anchor.
194 /// @param aln_stats
195 /// Input alignment stats (see CAlnStats template).
196 /// @param aln_idx
197 /// Index of the input alignment in the stats.
198 /// @param options
199 /// Options for building the anchored alignment.
200 /// @param explicit_anchor_row
201 /// Explicit anchor row index (this overrides anchor id set in the options).
202 /// By default the anchor row is selected automatically.
/// @return
/// The new anchored alignment, or an empty CRef when every row except one
/// (normally the anchor, aligned to itself) produced an empty pairwise
/// alignment.
/// @throws CAlnException
/// (eInvalidRequest) if explicit_anchor_row is not below the dimension of
/// the alignment, if options.GetAnchorId() is not found in the anchor id
/// map of the stats, or if the stats report that the alignments cannot be
/// anchored.
203 /// @sa CAlnStats
204 template<class _TAlnStats>
// NOTE: line 205 of the original header (return type and function name) is
// missing from this listing.
206  const _TAlnStats& aln_stats,
207  size_t aln_idx,
208  const CAlnUserOptions& options,
209  objects::CSeq_align::TDim explicit_anchor_row = -1)
210 {
211  typedef typename _TAlnStats::TDim TDim;
212  TDim dim = aln_stats.GetDimForAln(aln_idx); // upper bound for every row index used below
213
214  // Select the anchor row: an explicit (non-negative) row wins, otherwise it is derived from the options and stats.
215  TDim anchor_row;
216  if (explicit_anchor_row >= 0) {
217  if (explicit_anchor_row >= dim) {
218  NCBI_THROW(CAlnException, eInvalidRequest,
219  "Invalid explicit_anchor_row");
220  }
221  anchor_row = explicit_anchor_row;
222  }
223  else {
224  size_t anchor_id_idx = 0; // Initialized only to prevent a compiler warning
225  if ( aln_stats.CanBeAnchored() ) {
226  if ( options.GetAnchorId() ) {
227  // The user chose an anchor id in the options: it must be present in the anchor id map
228  typedef typename _TAlnStats::TIdMap TIdMap;
229  typename TIdMap::const_iterator it =
230  aln_stats.GetAnchorIdMap().find(options.GetAnchorId());
231  if (it == aln_stats.GetAnchorIdMap().end()) {
232  NCBI_THROW(CAlnException, eInvalidRequest,
233  "Invalid options.GetAnchorId()");
234  }
235  anchor_id_idx = it->second[0]; // use the first index mapped to the requested id
236  }
237  else {
238  // if not explicitly chosen, just choose the first potential
239  // anchor that is preferably not aligned to itself
240  for (size_t i = 0; i < aln_stats.GetAnchorIdVec().size(); ++i) {
241  const TAlnSeqIdIRef& anchor_id = aln_stats.GetAnchorIdVec()[i];
242  if (aln_stats.GetAnchorIdMap().find(anchor_id)->second.size() > 1) {
243  // this potential anchor is aligned to itself, not
244  // the best choice
245  if (i == 0) {
246  // but still, keep the first one in case all
247  // are bad
248  anchor_id_idx = aln_stats.GetAnchorIdxVec()[i];
249  }
250  }
251  else {
252  // perfect: the first anchor that is not aligned
253  // to itself
254  anchor_id_idx = aln_stats.GetAnchorIdxVec()[i];
255  break;
256  }
257  }
258  }
259  }
260  else {
261  NCBI_THROW(CAlnException, eInvalidRequest,
262  "Alignments cannot be anchored.");
263  }
264  anchor_row = aln_stats.GetRowVecVec()[anchor_id_idx][aln_idx]; // row of the chosen anchor id within this alignment
265  }
266  _ALNMGR_ASSERT(anchor_row >= 0 && anchor_row < dim);
267
268  // If there are different sequence types involved, force genomic coordinates.
269  // No need to explicitly check this if the anchor is a nucleotide.
270  bool force_widths = false;
271  if ( aln_stats.GetIdVec()[anchor_row]->IsProtein() ) {
272  for (size_t i = 0; i < aln_stats.GetIdVec().size(); ++i) {
273  if ( !aln_stats.GetIdVec()[i]->IsProtein() ) {
274  force_widths = true; // protein anchor plus at least one non-protein id
275  break;
276  }
277  }
278  }
279
280  const CSeq_align& seq_aln = *aln_stats.GetAlnVec()[aln_idx];
281  // Policy flags: anchor_flags is used for the anchor row, `flags` for every other row
282  int anchor_flags = CPairwiseAln::fKeepNormalized;
// NOTE(review): lines 283-284 are missing from this listing; `flags`, used
// below for the non-anchor rows, is presumably declared there - confirm
// against the original header. Lines 289 and 294 are missing as well.
285  if ( seq_aln.GetSegs().IsStd() ) {
286  // Std-segs may contain overlaps (in this case the alignment will be split
287  // into segments later).
288  anchor_flags |= CPairwiseAln::fAllowOverlap;
290  }
291
292  if ((options.m_MergeFlags & CAlnUserOptions::fIgnoreInsertions) != 0) {
293  anchor_flags |= CPairwiseAln::fIgnoreInsertions;
295  }
296
297  // Create one pairwise alignment (anchor vs row) per row of the input alignment
298  typedef typename _TAlnStats::TIdVec TIdVec;
299  const TIdVec ids = aln_stats.GetSeqIdsForAln(aln_idx);
// NOTE(review): line 300 (the declaration of `pairwises`) is missing from
// this listing.
301  pairwises.resize(dim);
302  int empty_rows = 0;
303  for (TDim row = 0; row < dim; ++row) {
304  CRef<CPairwiseAln> pairwise_aln(new CPairwiseAln(ids[anchor_row],
305  ids[row],
306  row == anchor_row ? anchor_flags : flags));
307
// NOTE(review): line 308 (the name of the called converter) is missing from
// this listing; the argument list matches ConvertSeqAlignToPairwiseAln -
// confirm. The anchor row is always converted with eDirect, the other rows
// with the direction from the options.
309  *pairwise_aln, seq_aln,
310  anchor_row, row,
311  row == anchor_row ? CAlnUserOptions::eDirect : options.m_Direction,
312  &ids);
313
314  if ( force_widths ) {
315  // Need to convert coordinates to genomic.
316  pairwise_aln->ForceGenomicCoords();
317  }
318
319  if ( pairwise_aln->empty() ) {
320  ++empty_rows; // counted so the result can be shrunk to the non-empty rows
321  }
322
323  pairwises[row].Reset(pairwise_aln);
324  }
325  _ALNMGR_ASSERT(empty_rows >= 0 && empty_rows < dim);
326  if (empty_rows == dim - 1) {
327  return CRef<CAnchoredAln>(); // only one non-empty row left: return an empty reference
328  // Alternatively, perhaps we can continue processing here
329  // which would result in a CAnchoredAln that only contains
330  // the anchor.
331  }
332
333  // Create the anchored aln (which may shrink vertically due to resulting empty rows)
334  TDim new_dim = dim - empty_rows;
335  _ALNMGR_ASSERT(new_dim > 0);
336
337  // Anchor row goes at the last row (TODO: maybe a candidate for a user option?)
// NOTE(review): line 339 (the condition selecting 0 instead of the last row)
// is missing from this listing - presumably a check of the fAnchorRowFirst
// merge flag; confirm against the original header.
338  TDim target_anchor_row =
340  0 : new_dim - 1;
341
342  CRef<CAnchoredAln> anchored_aln(new CAnchoredAln);
343  anchored_aln->SetDim(new_dim);
344
// Copy the non-empty rows into the result: the anchor goes to
// target_anchor_row, all other rows fill the remaining slots in their
// original order, stepping over the slot reserved for the anchor.
345  for (TDim row = 0, target_row = 0; row < dim; ++row) {
346  if ( !pairwises[row]->empty() ) {
347  if (target_row == target_anchor_row) {
348  target_row++;
349  }
350  anchored_aln->SetPairwiseAlns()[row == anchor_row ?
351  target_anchor_row :
352  target_row++].Reset(pairwises[row]);
353  }
354  }
355  anchored_aln->SetAnchorRow(target_anchor_row);
356  return anchored_aln;
357 }
358 
359 
360 /// Create anchored alignment from each seq-align in the stats.
/// Alignments for which CreateAnchoredAlnFromAln returns an empty reference
/// are skipped. A std-seg alignment that needs splitting contributes one
/// two-row anchored alignment per segment of each non-anchor row instead of
/// a single entry.
/// @param aln_stats
/// Input alignment stats.
/// @param out_vec
/// Output vector of anchored alignments; asserted (via _ASSERT) to be empty
/// on entry.
/// @param options
/// Options passed on to CreateAnchoredAlnFromAln.
361 /// @sa CreateAnchoredAlnFromAln
362 template<class _TAlnStats>
363 void CreateAnchoredAlnVec(_TAlnStats& aln_stats,
364  TAnchoredAlnVec& out_vec,
365  const CAlnUserOptions& options)
366 {
367  _ASSERT(out_vec.empty());
368  out_vec.reserve(aln_stats.GetAlnCount()); // a lower bound only: split std-segs add more than one entry
369  for (size_t aln_idx = 0; aln_idx < aln_stats.GetAlnCount(); ++aln_idx) {
370  CRef<CAnchoredAln> anchored_aln =
371  CreateAnchoredAlnFromAln(aln_stats, aln_idx, options);
372  if ( !anchored_aln ) continue; // empty reference: nothing to store for this alignment
373
374  const CSeq_align& aln = *aln_stats.GetAlnVec()[aln_idx];
375  if (aln.GetSegs().IsStd()) {
376  bool need_split = false;
// NOTE(review): line 377 (the loop header that defines `it`) and line 380
// (the second half of the condition) are missing from this listing;
// presumably the loop walks the pairwise alignments of anchored_aln -
// confirm against the original header.
378  const CPairwiseAln& pw = **it;
379  if (pw.IsSet(CPairwiseAln::fMixedDir) ||
381  need_split = true;
382  break;
383  }
384  }
385  if (need_split) {
386  // The std-seg contains overlaps and needs to be split into
387  // separate rows/segments.
388  CAnchoredAln::TDim anchor_row = anchored_aln->GetAnchorRow();
389  const CPairwiseAln& apw = *anchored_aln->GetPairwiseAlns()[anchor_row];
390  for (CAnchoredAln::TDim row = 0; row < anchored_aln->GetDim(); ++row) {
391  if (row == anchor_row) continue;
392  const CPairwiseAln& rpw = *anchored_aln->GetPairwiseAlns()[row];
// Walk the anchor's and the row's segments in lockstep.
// NOTE(review): only rpw's end is tested, so this assumes the anchor
// pairwise has at least as many segments as the row - confirm.
393  for (auto apw_seg = apw.begin(), rpw_seg = rpw.begin();
394  rpw_seg != rpw.end(); ++apw_seg, ++rpw_seg) {
395  CRef<CAnchoredAln> sub_anchored_aln(new CAnchoredAln);
396  sub_anchored_aln->SetPairwiseAlns().resize(2);
397  sub_anchored_aln->SetDim(2);
// Single-segment copy of the row, keeping its ids and policy flags
398  CRef<CPairwiseAln> sub_row(new CPairwiseAln(
399  rpw.GetFirstId(), rpw.GetSecondId(), rpw.GetPolicyFlags()));
400  sub_row->insert(sub_row->end(), *rpw_seg);
// Single-segment copy of the anchor, keeping its ids and policy flags
401  CRef<CPairwiseAln> sub_anchor_row(new CPairwiseAln(
402  apw.GetFirstId(), apw.GetSecondId(), apw.GetPolicyFlags()));
403  sub_anchor_row->insert(sub_anchor_row->end(), *apw_seg);
// Keep the anchor first if it was at row 0 in the source, otherwise put it last
404  if (anchor_row == 0) {
405  sub_anchored_aln->SetAnchorRow(0);
406  sub_anchored_aln->SetPairwiseAlns()[0] = sub_anchor_row;
407  sub_anchored_aln->SetPairwiseAlns()[1] = sub_row;
408  }
409  else {
410  sub_anchored_aln->SetAnchorRow(1);
411  sub_anchored_aln->SetPairwiseAlns()[0] = sub_row;
412  sub_anchored_aln->SetPairwiseAlns()[1] = sub_anchor_row;
413  }
414  out_vec.push_back(sub_anchored_aln);
415  sub_anchored_aln->SetScore(rpw_seg->GetLength()); // score of a split piece is the length of its row segment
416  }
417  }
418  continue; // Done splitting std-seg
419  }
420  }
421
422  out_vec.push_back(anchored_aln);
423  // Calc scores: sum of the range lengths over all rows, divided by the number of rows
424  for (typename _TAlnStats::TDim row = 0; row < anchored_aln->GetDim(); ++row) {
425  ITERATE(CPairwiseAln, rng_it, *anchored_aln->GetPairwiseAlns()[row]) {
426  anchored_aln->SetScore() += rng_it->GetLength();
427  }
428  }
429  anchored_aln->SetScore() /= anchored_aln->GetDim();
430  }
431 }
432 
433 
434 /// A simple API that assumes that the seq_align has exactly two rows
435 /// and you want to create a pairwise with the default policy.
436 /// @sa ConvertSeqAlignToPairwiseAln
439 CreatePairwiseAlnFromSeqAlign(const objects::CSeq_align& seq_align);
440 
441 
443 
444 #endif // OBJTOOLS_ALNMGR___ALN_CONVERTERS__HPP
CRef< CPairwiseAln > CreatePairwiseAlnFromSeqAlign(const objects::CSeq_align &seq_align)
A simple API that assumes that the seq_align has exactly two rows and you want to create a pairwise w...
void ConvertSeqAlignToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSeq_align &sa, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a seq-align.
void ConvertStdsegToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSeq_align::TSegs::TStd &stds, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of an std-seg.
void ConvertSeqLocsToPairwiseAln(CPairwiseAln &aln, const objects::CSeq_loc &loc_1, const objects::CSeq_loc &loc_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections)
Build pairwise alignment from a pair of seq-locs.
void ConvertDensegToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CDense_seg &ds, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a dense-seg.
void ConvertSparseToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSparse_seg &sparse_seg, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a sparse-seg.
void ConvertSplicedToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSpliced_seg &spliced_seg, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a spliced-seg.
void ConvertDendiagToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSeq_align::TSegs::TDendiag &dendiags, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a dendiag.
list< CRef< CPairwiseAln > > TPairwiseAlnList
void SeqLocMapperToPairwiseAligns(const objects::CSeq_loc_Mapper_Base &mapper, TPairwiseAlnList &aligns)
Build a list of pairwise alignments from a seq-loc mapper's mappings.
void CreateAnchoredAlnVec(_TAlnStats &aln_stats, TAnchoredAlnVec &out_vec, const CAlnUserOptions &options)
Create anchored alignment from each seq-align in the stats.
void ConvertPackedsegToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CPacked_seg &ps, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a packed-seg.
CRef< CAnchoredAln > CreateAnchoredAlnFromAln(const _TAlnStats &aln_stats, size_t aln_idx, const CAlnUserOptions &options, objects::CSeq_align::TDim explicit_anchor_row=-1)
Create an anchored alignment from Seq-align using hints.
vector< TAlnSeqIdIRef > TAlnSeqIdVec
CAnchoredAln::TDim TDim
#define _ALNMGR_ASSERT(expr)
bool IsSet(int flags) const
const_iterator begin() const
@ fAllowOverlap
allow segments with different orientation
@ fIgnoreInsertions
allows segments not separated by gaps
@ fAllowMixedDir
enforce all policies after any modification
const_iterator end() const
Options for different alignment manager operations.
TMergeFlags m_MergeFlags
const TAlnSeqIdIRef & GetAnchorId(void) const
Get anchor id.
EDirection m_Direction
EDirection
Row direction flags.
@ eBothDirections
No filtering: use both direct and reverse sequences.
@ eDirect
Use only sequences whose strand is the same as that of the anchor.
@ fAnchorRowFirst
Store anchor row in the first pairwise alignment (by default it's stored in the last one).
@ fIgnoreInsertions
Do not collect and store insertions (gaps on the anchor).
Query-anchored alignment can be 2 or multi-dimensional.
const TPairwiseAlnVector & GetPairwiseAlns(void) const
The vector of pairwise alns.
vector< CRef< CPairwiseAln > > TPairwiseAlnVector
TDim GetDim(void) const
How many rows.
void SetAnchorRow(TDim anchor_row)
Modify anchor row (never do this unless you are creating a new alignment and know what you're doing).
TPairwiseAlnVector & SetPairwiseAlns(void)
Modify pairwise alns.
void SetScore(int score)
Set the total score.
TDim GetAnchorRow(void) const
Which is the anchor row?
void SetDim(TDim dim)
Modify the number of rows.
A pairwise aln is a collection of ranges for a pair of rows.
const TAlnSeqIdIRef & GetFirstId(void) const
Get first sequence id.
const TAlnSeqIdIRef & GetSecondId(void) const
Get second sequence id.
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
CSeq_align::C_Segs::TDendiag TDendiag
Definition: cuAlign.hpp:48
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALNMGR_EXPORT
Definition: ncbi_export.h:1065
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
int i
CMSAToolJob::TIdMap TIdMap
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
vector< CRef< CAnchoredAln > > TAnchoredAlnVec
Collection of anchored alignments.
#define row(bind, expected)
Definition: string_bind.c:73
#define _ASSERT
Modified on Sat May 25 14:18:38 2024 by modify_doxy.py rev. 669887