NCBI C++ ToolKit
alignment_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alignment_set.cpp 92487 2021-01-26 18:35:08Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Classes to hold sets of alignments
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistl.hpp>
38 
39 #include <map>
40 #include <memory>
41 
43 
44 #include "alignment_set.hpp"
45 #include "sequence_set.hpp"
46 #include "structure_set.hpp"
48 #include "cn3d_tools.hpp"
49 #include "molecule_identifier.hpp"
50 
53 
54 
55 BEGIN_SCOPE(Cn3D)
56 
// Shorthand for a list of pointers to const CSeq_align objects.
57 typedef list < const CSeq_align * > SeqAlignList;
58 
// AlignmentSet constructor. NOTE(review): the first line of the signature (listing
// line 59) is absent from this extract; only its tail is visible below.
//
// Stores masterSequence as 'master', starts with no owned asn data
// (newAsnAlignmentData == NULL), and appends one MasterDependentAlignment to
// 'alignments' for every Seq-align held in the FIRST Seq-annot of seqAnnots.
//
// NOTE(review): only seqAnnots.front() is ever read, and it is dereferenced without
// checking that the list is non-empty - confirm callers never pass an empty list.
60  const SeqAnnotList& seqAnnots) :
61  StructureBase(parent), master(masterSequence), newAsnAlignmentData(NULL)
62 {
    // a master sequence and the parent set's sequenceSet are both required; if either
    // is missing, report it and return without adding any alignments
63  if (!master || !parentSet->sequenceSet) {
64  ERRORMSG("AlignmentSet::AlignmentSet() - need sequenceSet and master before parsing alignments");
65  return;
66  }
67 
68  // assume the data manager has collapsed all valid seqaligns into a single seqannot
    // 'ae' caches the end iterator of that first Seq-annot's align list
69  CSeq_annot::C_Data::TAlign::const_iterator
70  a, ae = seqAnnots.front()->GetData().GetAlign().end();
    // **a unwraps the list element to the CSeq_align handed to the pairwise alignment
71  for (a=seqAnnots.front()->GetData().GetAlign().begin(); a!=ae; ++a)
72  alignments.push_back(new MasterDependentAlignment(this, master, **a));
73  TRACEMSG("number of alignments: " << alignments.size());
74 }
75 
// NOTE(review): listing lines 76 and 78 are absent from this extract - presumably the
// AlignmentSet destructor's signature and its single body statement. Only the braces
// are visible here, so nothing about its behavior can be confirmed from this view.
77 {
79 }
80 
// CreateFromMultiple: builds a new AlignmentSet from a BlockMultipleAlignment.
// NOTE(review): the first line of the signature (listing line 81) and the declarations
// of 'rowCheck' (line 85) and 'blocks' (line 101) are absent from this extract.
//
// multiple - the multiple alignment to convert; row 0 is treated as the master
// rowOrder - for each output position, the row of 'multiple' to use; must have one
//            entry per row, no repeated values, and rowOrder[0] == 0
//
// Returns a pointer released from a unique_ptr (no longer managed here) to the new
// AlignmentSet, which is given the freshly built Seq-annot list through its
// newAsnAlignmentData member; returns NULL if rowOrder is rejected or if constructing
// the AlignmentSet throws.
82  const BlockMultipleAlignment *multiple, const vector < unsigned int >& rowOrder)
83 {
84  // sanity check on the row order map
    // rowCheck is keyed by the row values, so (presumably being a map) its size only
    // reaches NRows() when every value in rowOrder is distinct
86  for (unsigned int i=0; i<rowOrder.size(); ++i) rowCheck[rowOrder[i]] = i;
    // NOTE(review): if NRows() can be 0 and rowOrder is empty, the two size tests pass
    // and rowOrder[0] is read out of range - confirm NRows() >= 1 is guaranteed
87  if (rowOrder.size() != multiple->NRows() || rowCheck.size() != multiple->NRows() || rowOrder[0] != 0) {
88  ERRORMSG("AlignmentSet::CreateFromMultiple() - bad row order vector");
89  return NULL;
90  }
91 
92  // create a single Seq-annot, with 'align' data that holds one Seq-align per dependent
    // the list is held in a unique_ptr so it is freed on the early-return path below
93  unique_ptr<SeqAnnotList> newAsnAlignmentData(new SeqAnnotList(1));
94  CSeq_annot *seqAnnot = new CSeq_annot();
95  newAsnAlignmentData->back().Reset(seqAnnot);
96 
97  CSeq_annot::C_Data::TAlign& seqAligns = seqAnnot->SetData().SetAlign();
    // one slot per non-master row, or a single slot when the master is the only row
    // NOTE(review): assumes NRows() >= 1; with 0 rows the 'NRows() - 1' size is wrong
98  seqAligns.resize((multiple->NRows() == 1) ? 1 : multiple->NRows() - 1);
99  CSeq_annot::C_Data::TAlign::iterator sa = seqAligns.begin();
100 
    // 'blocks' is filled here and passed to every pairwise conversion below
102  multiple->GetUngappedAlignedBlocks(&blocks);
103 
104  // create Seq-aligns; if there's only one row (the master), then cheat and create an alignment
105  // of the master with itself, because asn data doesn't take well to single-row "alignment"
106  if (multiple->NRows() > 1) {
107  unsigned int newRow;
    // 'row' is the output position, rowOrder[row] the source row; 'sa' advances in step
108  for (unsigned int row=1; row<multiple->NRows(); ++row, ++sa) {
109  newRow = rowOrder[row];
110  CSeq_align *seqAlign = CreatePairwiseSeqAlignFromMultipleRow(multiple, blocks, newRow);
111  sa->Reset(seqAlign);
112  }
113  } else
114  sa->Reset(CreatePairwiseSeqAlignFromMultipleRow(multiple, blocks, 0));
115 
    // build the AlignmentSet from the new asn data; an exception from the constructor
    // is logged and turned into a NULL return
116  unique_ptr<AlignmentSet> newAlignmentSet;
117  try {
118  newAlignmentSet.reset(new AlignmentSet(parent, multiple->GetMaster(), *newAsnAlignmentData));
119  } catch (exception& e) {
120  ERRORMSG(
121  "AlignmentSet::CreateFromMultiple() - failed to create AlignmentSet from new asn object; "
122  << "exception: " << e.what());
123  return NULL;
124  }
125 
    // success: hand the Seq-annot list to the new set, then release the set itself
126  newAlignmentSet->newAsnAlignmentData = newAsnAlignmentData.release();
127  return newAlignmentSet.release();
128 }
129 
130 
131 ///// MasterDependentAlignment methods /////
132 
// MasterDependentAlignment constructor. NOTE(review): the first line of the signature
// (listing line 133) is absent from this extract; only its tail is visible below.
//
// Unpacks a two-sequence Seq-align (dendiag or denseg form) into two vectors indexed
// by master residue:
//   masterToDependent[m] - dependent residue aligned to master residue m, or -1
//   blockStructure[m]    - number of the aligned block containing m, or -1
// Every failure path logs with ERRORMSG and returns early, leaving whatever was filled
// in so far; 'dependent' stays NULL if no matching sequence pair is found.
134  const ncbi::objects::CSeq_align& seqAlign) :
135  StructureBase(parent), master(masterSequence), dependent(NULL)
136 {
137  // resize alignment and block vector
138  masterToDependent.resize(master->Length(), -1);
139  blockStructure.resize(master->Length(), -1);
140 
141  // find dependent sequence for this alignment, and order (master or dependent first)
    // the ids come from the first dendiag block, or from the denseg itself
    // NOTE(review): GetDendiag().front() is taken without checking the list is
    // non-empty, and any segs type other than dendiag falls through to GetDenseg() -
    // confirm how callers / the asn accessors handle those cases
142  const CSeq_id& frontSeqId = seqAlign.GetSegs().IsDendiag() ?
143  seqAlign.GetSegs().GetDendiag().front()->GetIds().front().GetObject() :
144  seqAlign.GetSegs().GetDenseg().GetIds().front().GetObject();
145  const CSeq_id& backSeqId = seqAlign.GetSegs().IsDendiag() ?
146  seqAlign.GetSegs().GetDendiag().front()->GetIds().back().GetObject() :
147  seqAlign.GetSegs().GetDenseg().GetIds().back().GetObject();
148 
    // scan the parent set's sequences for the first one that pairs with the master as
    // either (master, s) or (s, master); masterFirst remembers which order matched
149  bool masterFirst = true;
150  SequenceSet::SequenceList::const_iterator
151  s, se = master->parentSet->sequenceSet->sequences.end();
152  for (s=master->parentSet->sequenceSet->sequences.begin(); s!=se; ++s) {
153  if (master->identifier->MatchesSeqId(frontSeqId) &&
154  (*s)->identifier->MatchesSeqId(backSeqId)) {
155  break;
156  } else if ((*s)->identifier->MatchesSeqId(frontSeqId) &&
157  master->identifier->MatchesSeqId(backSeqId)) {
158  masterFirst = false;
159  break;
160  }
161  }
    // reaching the end means no sequence in the set completes the pair
162  if (s == se) {
163  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - couldn't find matching sequences; "
164  << "both " << frontSeqId.AsFastaString() << " and "
165  << backSeqId.AsFastaString() << " must be in the sequence list for this file!");
166  return;
167  } else {
168  dependent = *s;
169  }
170 
171  unsigned int i, blockNum = 0;
172  int masterRes, dependentRes;
173 
174  // unpack dendiag alignment
175  if (seqAlign.GetSegs().IsDendiag()) {
176 
    // each dendiag element is one ungapped block; blockNum counts them in order
177  CSeq_align::C_Segs::TDendiag::const_iterator d , de = seqAlign.GetSegs().GetDendiag().end();
178  for (d=seqAlign.GetSegs().GetDendiag().begin(); d!=de; ++d, ++blockNum) {
179  const CDense_diag& block = d->GetObject();
180 
    // only strictly pairwise blocks are accepted: two ids and two start positions
181  if (block.GetDim() != 2 || block.GetIds().size() != 2 || block.GetStarts().size() != 2) {
182  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - \n"
183  "incorrect dendiag block dimensions");
184  return;
185  }
186 
187  // make sure identities of master and dependent sequences match in each block
188  if ((masterFirst &&
189  (!master->identifier->MatchesSeqId(block.GetIds().front().GetObject()) ||
190  !dependent->identifier->MatchesSeqId(block.GetIds().back().GetObject()))) ||
191  (!masterFirst &&
192  (!master->identifier->MatchesSeqId(block.GetIds().back().GetObject()) ||
193  !dependent->identifier->MatchesSeqId(block.GetIds().front().GetObject())))) {
194  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - "
195  "mismatched Seq-id in dendiag block");
196  return;
197  }
198 
199  // finally, actually unpack the data into the alignment vector
    // residue i of the block maps start+i on the master to start+i on the dependent;
    // which start belongs to which sequence depends on masterFirst
200  for (i=0; i<block.GetLen(); ++i) {
201  if (masterFirst) {
202  masterRes = block.GetStarts().front() + i;
203  dependentRes = block.GetStarts().back() + i;
204  } else {
205  masterRes = block.GetStarts().back() + i;
206  dependentRes = block.GetStarts().front() + i;
207  }
    // both positions must lie inside their sequences before anything is written
208  if (masterRes < 0 || masterRes >= (int)master->Length() || dependentRes < 0 || dependentRes >= (int)dependent->Length()) {
209  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - seqloc in dendiag block > length of sequence!");
210  return;
211  }
212  masterToDependent[masterRes] = dependentRes;
213  blockStructure[masterRes] = blockNum;
214  }
215  }
216  }
217 
218  // unpack denseg alignment
219  else if (seqAlign.GetSegs().IsDenseg()) {
220 
221  const CDense_seg& block = seqAlign.GetSegs().GetDenseg();
222 
    // must be pairwise, with two start values and one length per segment
223  if (!block.IsSetDim() || block.GetDim() != 2 ||
224  block.GetIds().size() != 2 ||
225  (int)block.GetStarts().size() != 2 * block.GetNumseg() ||
226  (int)block.GetLens().size() != block.GetNumseg()) {
227  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - \n"
228  "incorrect denseg block dimension");
229  return;
230  }
231 
232  // make sure identities of master and dependent sequences match in each block
233  if ((masterFirst &&
234  (!master->identifier->MatchesSeqId(block.GetIds().front().GetObject()) ||
235  !dependent->identifier->MatchesSeqId(block.GetIds().back().GetObject()))) ||
236  (!masterFirst &&
237  (!master->identifier->MatchesSeqId(block.GetIds().back().GetObject()) ||
238  !dependent->identifier->MatchesSeqId(block.GetIds().front().GetObject())))) {
239  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - \n"
240  "mismatched Seq-id in denseg block");
241  return;
242  }
243 
244  // finally, actually unpack the data into the alignment vector
    // 'starts' is consumed two values per segment (one per sequence, in id order)
    // while 'lens' advances one value per segment
245  CDense_seg::TStarts::const_iterator starts = block.GetStarts().begin();
246  CDense_seg::TLens::const_iterator lens, le = block.GetLens().end();
247  for (lens=block.GetLens().begin(); lens!=le; ++lens) {
248  if (masterFirst) {
249  masterRes = *(starts++);
250  dependentRes = *(starts++);
251  } else {
252  dependentRes = *(starts++);
253  masterRes = *(starts++);
254  }
    // a start of -1 on either sequence marks a gap; such segments are not recorded
    // and do not advance blockNum
255  if (masterRes != -1 && dependentRes != -1) { // skip gaps
    // the last residue of the segment must fall inside both sequences
    // NOTE(review): unlike the dendiag branch there is no explicit '< 0' test; starts
    // below -1 (and a zero-length segment starting at 0) are rejected only if this
    // comparison is evaluated as unsigned - confirm that is intended
256  if ((masterRes + *lens - 1) >= master->Length() ||
257  (dependentRes + *lens - 1) >= dependent->Length()) {
258  ERRORMSG("MasterDependentAlignment::MasterDependentAlignment() - \n"
259  "seqloc in denseg block > length of sequence!");
260  return;
261  }
262  for (i=0; i<*lens; ++i) {
263  masterToDependent[masterRes + i] = dependentRes + i;
264  blockStructure[masterRes + i] = blockNum;
265  }
266  ++blockNum; // a "block" of a denseg is an aligned (non-gap) segment
267  }
268  }
269  }
270 
271  //TESTMSG("got alignment for dependent gi " << dependent->identifier->gi);
272 }
273 
274 END_SCOPE(Cn3D)
User-defined methods of the data storage class.
USING_SCOPE(objects)
list< const CSeq_align * > SeqAlignList
USING_NCBI_SCOPE
AlignmentSet(SequenceSet *seqSet, const SeqAnnotList &seqAnnots, bool ignoreBadPairwiseAlignments=false)
SeqAnnotList * newAsnAlignmentData
std::list< ncbi::CRef< ncbi::objects::CSeq_annot > > SeqAnnotList
static AlignmentSet * CreateFromMultiple(const BlockMultipleAlignment *multiple, SeqAnnotList *newAsnAlignmentData, const SequenceSet &sequenceSet, const std::vector< unsigned int > *rowOrder=NULL)
const Sequence * GetMaster(void) const
std::vector< const UngappedAlignedBlock * > UngappedAlignedBlockList
void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const
MasterDependentAlignment(StructureBase *parent, const Sequence *masterSequence, const ncbi::objects::CSeq_align &seqAlign)
const Sequence * master
ResidueVector masterToDependent
const Sequence * dependent
ResidueVector blockStructure
bool MatchesSeqId(const ncbi::objects::CSeq_id &sid) const
SequenceList sequences
Definition: cav_seqset.hpp:73
const MoleculeIdentifier * identifier
unsigned int Length(void) const
StructureSet * parentSet
const SequenceSet * sequenceSet
size_type size() const
Definition: map.hpp:148
Definition: map.hpp:338
#define TRACEMSG(stream)
Definition: cn3d_tools.hpp:83
#define ERRORMSG(stream)
Definition: cn3d_tools.hpp:86
#define NULL
Definition: ncbistd.hpp:225
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
Definition: Dense_seg_.hpp:396
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TLen GetLen(void) const
Get the Len member data.
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
const TStarts & GetStarts(void) const
Get the Starts member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
int i
unsigned int a
Definition: ncbi_localip.c:102
The NCBI C++/STL use hints.
bool le(T x_, T y_, T round_)
Definition: njn_approx.hpp:84
#define row(bind, expected)
Definition: string_bind.c:73
static DP_BlockInfo * blocks
ncbi::objects::CSeq_align * CreatePairwiseSeqAlignFromMultipleRow(const BlockMultipleAlignment *multiple, const BlockMultipleAlignment::UngappedAlignedBlockList &blocks, unsigned int slaveRow)
#define const
Definition: zconf.h:232
Modified on Sun Apr 14 05:27:38 2024 by modify_doxy.py rev. 669887