NCBI C++ ToolKit
su_alignment_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: su_alignment_set.cpp 92066 2020-12-21 14:58:25Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Classes to hold sets of alignments
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistl.hpp>
36 
39 
40 #include <map>
41 #include <memory>
42 
43 #include "su_alignment_set.hpp"
46 #include "su_private.hpp"
47 
50 
51 
52 BEGIN_SCOPE(struct_util)
53 
55 {
    // Constructor body (the signature line is not part of this listing).
    // Work done here, in order:
    //   1. collect every Seq-align found in seqAnnots, throwing on data this
    //      class does not handle;
    //   2. decide which Sequence from sequenceSet is the master (m_master);
    //   3. build one MasterSlaveAlignment per Seq-align into m_alignments.
    // All failures are reported through THROW_MESSAGE.
56  // screen alignments to make sure they're a type we can handle
57  typedef list < CRef < CSeq_align > > SeqAlignList;
58  SeqAlignList seqAligns;
59  SeqAnnotList::const_iterator n, ne = seqAnnots.end();
60  for (n=seqAnnots.begin(); n!=ne; ++n) {
61 
    // every Seq-annot must carry 'align' data; anything else is rejected
62  if (!n->GetObject().GetData().IsAlign())
63  THROW_MESSAGE("AlignmentSet::AlignmentSet() - confused by Seq-annot data format");
    // a second or later Seq-annot is accepted, but traced
64  if (n != seqAnnots.begin())
65  TRACE_MESSAGE("multiple Seq-annots");
66 
67  CSeq_annot::C_Data::TAlign::const_iterator
68  a, ae = n->GetObject().GetData().GetAlign().end();
69  for (a=n->GetObject().GetData().GetAlign().begin(); a!=ae; ++a) {
70 
    // NOTE(review): the first clause of this condition can never be true.
    // For any single value, (type != eType_partial || type != eType_diags)
    // holds, so its negation is always false and the Seq-align type is
    // effectively not screened; only the dim test and the dendiag/denseg
    // test can trigger the throw. The likely intent is
    // !(type == eType_partial || type == eType_diags) - confirm that existing
    // inputs would still pass before tightening it.
71  if (!(a->GetObject().GetType() != CSeq_align::eType_partial ||
72  a->GetObject().GetType() != CSeq_align::eType_diags) ||
73  !a->GetObject().IsSetDim() || a->GetObject().GetDim() != 2 ||
74  (!a->GetObject().GetSegs().IsDendiag() && !a->GetObject().GetSegs().IsDenseg()))
75  THROW_MESSAGE("AlignmentSet::AlignmentSet() - confused by alignment type");
76 
77  seqAligns.push_back(*a);
78  }
79  }
80  if (seqAligns.size() == 0)
81  THROW_MESSAGE("AlignmentSet::AlignmentSet() - must have at least one Seq-align");
82 
83  // we need to determine the identity of the master sequence; most rigorous way is to look
84  // for a Seq-id that is present in all pairwise alignments
85  m_master = NULL;
86  const Sequence *seq1 = NULL, *seq2 = NULL;
87  bool seq1PresentInAll = true, seq2PresentInAll = true;
88 
89  // first, find sequences for first pairwise alignment
    // (the two ids are the first and last id of the first dendiag block, or of
    // the denseg, depending on which form the first alignment uses)
90  const CSeq_id& frontSid = seqAligns.front()->GetSegs().IsDendiag() ?
91  seqAligns.front()->GetSegs().GetDendiag().front()->GetIds().front().GetObject() :
92  seqAligns.front()->GetSegs().GetDenseg().GetIds().front().GetObject();
93  const CSeq_id& backSid = seqAligns.front()->GetSegs().IsDendiag() ?
94  seqAligns.front()->GetSegs().GetDendiag().front()->GetIds().back().GetObject() :
95  seqAligns.front()->GetSegs().GetDenseg().GetIds().back().GetObject();
96  SequenceSet::SequenceList::const_iterator s, se = sequenceSet.m_sequences.end();
    // scan the sequence list until both ids have been matched to a Sequence
97  for (s=sequenceSet.m_sequences.begin(); s!=se; ++s) {
98  if ((*s)->MatchesSeqId(frontSid)) seq1 = *s;
99  if ((*s)->MatchesSeqId(backSid)) seq2 = *s;
100  if (seq1 && seq2) break;
101  }
102  if (!(seq1 && seq2))
103  THROW_MESSAGE("AlignmentSet::AlignmentSet() - can't match first pair of Seq-ids to Sequences");
104 
105  // now, make sure one of these sequences is present in all the other pairwise alignments
    // (iteration starts at the second alignment; when there is only one
    // alignment the loop body never runs and both flags remain true)
106  SeqAlignList::const_iterator a = seqAligns.begin(), ae = seqAligns.end();
107  for (++a; a!=ae; ++a) {
108  const CSeq_id& frontSid2 = (*a)->GetSegs().IsDendiag() ?
109  (*a)->GetSegs().GetDendiag().front()->GetIds().front().GetObject() :
110  (*a)->GetSegs().GetDenseg().GetIds().front().GetObject();
111  const CSeq_id& backSid2 = (*a)->GetSegs().IsDendiag() ?
112  (*a)->GetSegs().GetDendiag().front()->GetIds().back().GetObject() :
113  (*a)->GetSegs().GetDenseg().GetIds().back().GetObject();
114  if (!seq1->MatchesSeqId(frontSid2) && !seq1->MatchesSeqId(backSid2))
115  seq1PresentInAll = false;
116  if (!seq2->MatchesSeqId(frontSid2) && !seq2->MatchesSeqId(backSid2))
117  seq2PresentInAll = false;
118  }
    // outcome table:
    //   neither candidate in every alignment -> error
    //   exactly one candidate in every alignment -> that one is the master
    //   both in every alignment and identical -> that sequence
    //   both in every alignment but distinct -> m_master stays NULL and the
    //                                           fallback below picks seq1
119  if (!seq1PresentInAll && !seq2PresentInAll)
120  THROW_MESSAGE("AlignmentSet::AlignmentSet() - "
121  "all pairwise sequence alignments must have a common master sequence");
122  else if (seq1PresentInAll && !seq2PresentInAll)
123  m_master = seq1;
124  else if (seq2PresentInAll && !seq1PresentInAll)
125  m_master = seq2;
126  else if (seq1PresentInAll && seq2PresentInAll && seq1 == seq2)
127  m_master = seq1;
128 
129  // if still ambiguous, just use the first one for now
130  if (!m_master) {
131  WARNING_MESSAGE("alignment master sequence is ambiguous - using the first one ("
132  << seq1->IdentifierString() << ')');
133  m_master = seq1;
134  }
135  TRACE_MESSAGE("determined master sequence: " << m_master->IdentifierString());
136 
137  // parse the pairwise alignments
    // (each MasterSlaveAlignment is constructed from the Seq-align, the chosen
    // master and the sequence set; an exception from that constructor
    // propagates out of this one)
138  SeqAlignList::const_iterator l, le = seqAligns.end();
139  for (l=seqAligns.begin(); l!=le; ++l)
140  m_alignments.push_back(
141  CRef<MasterSlaveAlignment>(new MasterSlaveAlignment(**l, m_master, sequenceSet)));
142 
143  TRACE_MESSAGE("number of alignments: " << m_alignments.size());
144 }
145 
147  const BlockMultipleAlignment *multiple, SeqAnnotList *newAsnAlignmentData,
148  const SequenceSet& sequenceSet, const vector < unsigned int > *rowOrder)
149 {
    // Builds pairwise (master/slave) Seq-aligns from the rows of 'multiple',
    // wraps them in a single new Seq-annot, and constructs an AlignmentSet
    // from that data.
    //
    // multiple            - source alignment; row 0 is used as the master row
    // newAsnAlignmentData - output; cleared on entry and filled with the new
    //                       Seq-annot list only on success
    // sequenceSet         - passed through to the AlignmentSet constructor
    // rowOrder            - optional; when given, pairwise alignment number
    //                       'row' is built from row (*rowOrder)[row]
    //
    // Returns a pointer released from a unique_ptr (the caller receives
    // ownership), or NULL if rowOrder is rejected or construction throws a
    // CException.
    // (The first line of the signature is not part of this listing.)
150  newAsnAlignmentData->clear();
151 
152  // sanity check on the row order map
153  if (rowOrder) {
    // rowCheck is declared on a line that is missing from this listing;
    // presumably a std::map keyed on row number, so that its size() counts the
    // distinct values in rowOrder - TODO confirm against the real source.
155  for (unsigned int i=0; i<rowOrder->size(); ++i)
156  rowCheck[(*rowOrder)[i]] = i;
    // accepted only if rowOrder has exactly NRows() entries, rowCheck ends up
    // with NRows() entries, and the first entry is 0
    // NOTE(review): (*rowOrder)[0] is evaluated whenever the two size tests
    // pass; if NRows() could ever be 0 with an empty rowOrder, this would index
    // an empty vector - confirm NRows() >= 1 is guaranteed.
157  if (rowOrder->size() != multiple->NRows() || rowCheck.size() != multiple->NRows() || (*rowOrder)[0] != 0) {
158  ERROR_MESSAGE("AlignmentSet::CreateFromMultiple() - bad row order vector");
159  return NULL;
160  }
161  }
162 
163  // create a single Seq-annot, with 'align' data that holds one Seq-align per slave
164  SeqAnnotList newSeqAnnots;
165  CRef < CSeq_annot > seqAnnot(new CSeq_annot());
166  newSeqAnnots.push_back(seqAnnot);
167 
    // one slot per non-master row, or a single slot when only one row exists
168  CSeq_annot::C_Data::TAlign& seqAligns = seqAnnot->SetData().SetAlign();
169  seqAligns.resize((multiple->NRows() == 1) ? 1 : multiple->NRows() - 1);
170  CSeq_annot::C_Data::TAlign::iterator sa = seqAligns.begin();
171 
    // 'blocks' is declared on a line that is missing from this listing
173  multiple->GetUngappedAlignedBlocks(&blocks);
174 
175  // create Seq-aligns; if there's only one row (the master), then cheat and create an alignment
176  // of the master with itself, because asn data doesn't take well to single-row "alignment"
177  if (multiple->NRows() > 1) {
    // 'sa' advances in step with 'row', filling the slots resized above
178  for (unsigned int row=1; row<multiple->NRows(); ++row, ++sa) {
179  sa->Reset(CreatePairwiseSeqAlignFromMultipleRow(multiple, blocks,
180  (rowOrder ? (*rowOrder)[row] : row)));
181  }
182  } else
183  sa->Reset(CreatePairwiseSeqAlignFromMultipleRow(multiple, blocks, 0));
184 
    // the unique_ptr owns the new object until it is released on the success path
185  unique_ptr<AlignmentSet> newAlignmentSet;
186  try {
187  newAlignmentSet.reset(new AlignmentSet(newSeqAnnots, sequenceSet));
188  } catch (CException& e) {
    // the line that opens this logging statement is missing from this listing;
    // the text and e.GetMsg() below are its arguments
190  "AlignmentSet::CreateFromMultiple() - failed to create AlignmentSet from new asn object: "
191  << e.GetMsg());
192  return NULL;
193  }
194 
    // success: hand the new Seq-annot list to the caller and give up ownership
195  *newAsnAlignmentData = newSeqAnnots;
196  return newAlignmentSet.release();
197 }
198 
199 
200 ///// MasterSlaveAlignment methods /////
201 
// Builds the residue mapping between a master and a slave sequence from one
// pairwise Seq-align.
//
// seqAlign       - alignment whose segs are either dendiag or denseg
// masterSequence - stored as m_master
// sequenceSet    - searched to find the slave sequence (stored as m_slave)
//
// For every aligned master residue, m_masterToSlave receives the slave residue
// index and m_blockStructure receives the number of the aligned block.
// Reports failures through THROW_MESSAGE: sequences not found, wrong block
// dimensions, mismatched Seq-ids, or coordinates beyond a sequence's length.
202 MasterSlaveAlignment::MasterSlaveAlignment(const ncbi::objects::CSeq_align& seqAlign,
203  const Sequence *masterSequence, const SequenceSet& sequenceSet) :
204  m_master(masterSequence), m_slave(NULL)
205 {
206  // resize alignment and block vector
    // NOTE(review): the statements that belong to the comment above are missing
    // from this listing; presumably they size m_masterToSlave and
    // m_blockStructure (both indexed by master residue below) - TODO confirm.
209 
210  // find slave sequence for this alignment, and order (master or slave first)
211  const CSeq_id& frontSeqId = seqAlign.GetSegs().IsDendiag() ?
212  seqAlign.GetSegs().GetDendiag().front()->GetIds().front().GetObject() :
213  seqAlign.GetSegs().GetDenseg().GetIds().front().GetObject();
214  const CSeq_id& backSeqId = seqAlign.GetSegs().IsDendiag() ?
215  seqAlign.GetSegs().GetDendiag().front()->GetIds().back().GetObject() :
216  seqAlign.GetSegs().GetDenseg().GetIds().back().GetObject();
217 
    // masterFirst stays true when the master id is the first id of the pair;
    // the loop stops at the first sequence in list order that completes a
    // (master, other) pairing in either orientation
218  bool masterFirst = true;
219  SequenceSet::SequenceList::const_iterator s, se = sequenceSet.m_sequences.end();
220  for (s=sequenceSet.m_sequences.begin(); s!=se; ++s) {
221  if (m_master->MatchesSeqId(frontSeqId) &&
222  (*s)->MatchesSeqId(backSeqId)) {
223  break;
224  } else if ((*s)->MatchesSeqId(frontSeqId) &&
225  m_master->MatchesSeqId(backSeqId)) {
226  masterFirst = false;
227  break;
228  }
229  }
    // reaching the end means no pairing was found: log details, then throw
230  if (s == se) {
231  ERROR_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - couldn't find matching sequences; "
232  << "both " << frontSeqId.AsFastaString() << " and "
233  << backSeqId.AsFastaString() << " must be in the sequence list for this file!");
234  THROW_MESSAGE("couldn't find matching sequences");
235  } else {
236  m_slave = *s;
237  }
238 
239  int masterRes, slaveRes, blockNum = 0;
240  unsigned int i;
241 
242  // unpack dendiag alignment
243  if (seqAlign.GetSegs().IsDendiag()) {
244 
    // each dendiag entry is one block; blockNum advances with the iterator
245  CSeq_align::C_Segs::TDendiag::const_iterator d , de = seqAlign.GetSegs().GetDendiag().end();
246  for (d=seqAlign.GetSegs().GetDendiag().begin(); d!=de; ++d, ++blockNum) {
247  const CDense_diag& block = d->GetObject();
248 
249  if (block.GetDim() != 2 || block.GetIds().size() != 2 || block.GetStarts().size() != 2)
250  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - incorrect dendiag block dimensions");
251 
252  // make sure identities of master and slave sequences match in each block
253  if ((masterFirst &&
254  (!m_master->MatchesSeqId(block.GetIds().front().GetObject()) ||
255  !m_slave->MatchesSeqId(block.GetIds().back().GetObject()))) ||
256  (!masterFirst &&
257  (!m_master->MatchesSeqId(block.GetIds().back().GetObject()) ||
258  !m_slave->MatchesSeqId(block.GetIds().front().GetObject()))))
259  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - mismatched Seq-id in dendiag block");
260 
261  // finally, actually unpack the data into the alignment vector
    // (residue by residue; the start used for master vs. slave depends on
    // masterFirst, and each position is bounds-checked before it is stored)
262  for (i=0; i<block.GetLen(); ++i) {
263  if (masterFirst) {
264  masterRes = block.GetStarts().front() + i;
265  slaveRes = block.GetStarts().back() + i;
266  } else {
267  masterRes = block.GetStarts().back() + i;
268  slaveRes = block.GetStarts().front() + i;
269  }
270  if ((unsigned) masterRes >= m_master->Length() || (unsigned) slaveRes >= m_slave->Length())
271  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - seqloc in dendiag block > length of sequence!");
272  m_masterToSlave[masterRes] = slaveRes;
273  m_blockStructure[masterRes] = blockNum;
274  }
275  }
276  }
277 
278  // unpack denseg alignment
279  else if (seqAlign.GetSegs().IsDenseg()) {
280 
281  const CDense_seg& block = seqAlign.GetSegs().GetDenseg();
282 
    // require dim 2, two ids, two starts per segment and one length per
    // segment; the paired iteration over starts and lens below relies on this
283  if (!block.IsSetDim() || block.GetDim() != 2 ||
284  block.GetIds().size() != 2 ||
285  block.GetStarts().size() != ((unsigned int) 2 * block.GetNumseg()) ||
286  block.GetLens().size() != ((unsigned int) block.GetNumseg()))
287  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - incorrect denseg block dimension");
288 
289  // make sure identities of master and slave sequences match in each block
290  if ((masterFirst &&
291  (!m_master->MatchesSeqId(block.GetIds().front().GetObject()) ||
292  !m_slave->MatchesSeqId(block.GetIds().back().GetObject()))) ||
293  (!masterFirst &&
294  (!m_master->MatchesSeqId(block.GetIds().back().GetObject()) ||
295  !m_slave->MatchesSeqId(block.GetIds().front().GetObject()))))
296  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - mismatched Seq-id in denseg block");
297 
298  // finally, actually unpack the data into the alignment vector
    // (each segment length consumes exactly two entries from 'starts', in the
    // id order recorded by masterFirst)
299  CDense_seg::TStarts::const_iterator starts = block.GetStarts().begin();
300  CDense_seg::TLens::const_iterator lens, le = block.GetLens().end();
301  for (lens=block.GetLens().begin(); lens!=le; ++lens) {
302  if (masterFirst) {
303  masterRes = *(starts++);
304  slaveRes = *(starts++);
305  } else {
306  slaveRes = *(starts++);
307  masterRes = *(starts++);
308  }
    // a start of -1 on either side marks a gap segment: nothing is stored
    // and blockNum is not advanced
309  if (masterRes != -1 && slaveRes != -1) { // skip gaps
    // the last residue of the segment must lie inside both sequences
310  if ((masterRes + *lens - 1) >= m_master->Length() ||
311  (slaveRes + *lens - 1) >= m_slave->Length())
312  THROW_MESSAGE("MasterSlaveAlignment::MasterSlaveAlignment() - "
313  "seqloc in denseg block > length of sequence!");
314  for (i=0; i<*lens; ++i) {
315  m_masterToSlave[masterRes + i] = slaveRes + i;
316  m_blockStructure[masterRes + i] = blockNum;
317  }
318  ++blockNum; // a "block" of a denseg is an aligned (non-gap) segment
319  }
320  }
321  }
322 }
323 
324 END_SCOPE(struct_util)
User-defined methods of the data storage class.
list< const CSeq_align * > SeqAlignList
#define ERROR_MESSAGE(s)
Definition: block_align.cpp:49
#define WARNING_MESSAGE(s)
Definition: block_align.cpp:50
AlignmentSet(SequenceSet *seqSet, const SeqAnnotList &seqAnnots, bool ignoreBadPairwiseAlignments=false)
SeqAnnotList * newAsnAlignmentData
std::list< ncbi::CRef< ncbi::objects::CSeq_annot > > SeqAnnotList
static AlignmentSet * CreateFromMultiple(const BlockMultipleAlignment *multiple, SeqAnnotList *newAsnAlignmentData, const SequenceSet &sequenceSet, const std::vector< unsigned int > *rowOrder=NULL)
std::vector< const UngappedAlignedBlock * > UngappedAlignedBlockList
void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const
CRef –.
Definition: ncbiobj.hpp:618
ResidueVector m_blockStructure
MasterSlaveAlignment(const SequenceSet *sequenceSet, const Sequence *masterSequence, const objects::CSeq_align &seqAlign)
const Sequence * m_slave
const Sequence * m_master
ResidueVector m_masterToSlave
SequenceList m_sequences
unsigned int Length(void) const
bool MatchesSeqId(const ncbi::objects::CSeq_id &seqID) const
std::string IdentifierString(void) const
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
#define NULL
Definition: ncbistd.hpp:225
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
Definition: Dense_seg_.hpp:396
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TLen GetLen(void) const
Get the Len member data.
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
const TStarts & GetStarts(void) const
Get the Starts member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
@ eType_diags
unbroken, but not ordered, diagonals
Definition: Seq_align_.hpp:102
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
int i
yy_size_t n
unsigned int a
Definition: ncbi_localip.c:102
The NCBI C++/STL use hints.
bool le(T x_, T y_, T round_)
Definition: njn_approx.hpp:84
bool ne(T x_, T y_, T round_)
Definition: njn_approx.hpp:82
static DP_BlockInfo * blocks
#define TRACE_MESSAGE(s)
USING_SCOPE(objects)
USING_NCBI_SCOPE
ncbi::objects::CSeq_align * CreatePairwiseSeqAlignFromMultipleRow(const BlockMultipleAlignment *multiple, const BlockMultipleAlignment::UngappedAlignedBlockList &blocks, unsigned int slaveRow)
#define THROW_MESSAGE(str)
Definition: su_private.hpp:48
#define const
Definition: zconf.h:230
Modified on Sat Dec 02 09:21:58 2023 by modify_doxy.py rev. 669887